In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.columns

Index(['director_name', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
       'gross', 'actor_1_name', 'movie_title', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'language', 'country', 'content_rating', 'budget',
       'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'profit',
       'gross_bin', 'action', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
       'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi',
       'sport', 'thriller', 'war', 'western', 'director_high', 'director_low',
       'actor_1_high', 'actor_1_low', 'actor_2_high', 'actor_2_low',
       'actor_3_high', 'actor_3_low', 'is_english', 'profit_binary'],
      dtype='object')

In [3]:
X = df[['duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'cast_total_facebook_likes',
       'facenumber_in_poster', 'title_year', 'actor_2_facebook_likes',
       'aspect_ratio', 'profit']]

In [4]:
#looking at VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X = add_constant(X)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

for i in range(len(X.columns)):
    vif_data.loc[i,'VIF'] = variance_inflation_factor(X.values, i)

print(vif_data)

                      feature           VIF
0                       const  46228.792712
1                    duration      1.112963
2     director_facebook_likes      1.059676
3      actor_3_facebook_likes      8.436235
4      actor_1_facebook_likes    207.038770
5   cast_total_facebook_likes    312.230193
6        facenumber_in_poster      1.021622
7                  title_year      1.133140
8      actor_2_facebook_likes     19.744161
9                aspect_ratio      1.090800
10                     profit      1.094960


There is high VIF values for actor_1_facebook_likes, actor_1_facebook_likes, actor_2_facebook_likes.     

## Looking at correlation table:

In [5]:
X.corr()

Unnamed: 0,const,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,facenumber_in_poster,title_year,actor_2_facebook_likes,aspect_ratio,profit
const,,,,,,,,,,,
duration,,1.0,0.180935,0.126184,0.084123,0.120861,0.027091,-0.13056,0.129689,0.152889,0.071213
director_facebook_likes,,0.180935,1.0,0.117329,0.089717,0.118631,-0.047899,-0.044568,0.116076,0.037902,0.102291
actor_3_facebook_likes,,0.126184,0.117329,1.0,0.252165,0.489614,0.105984,0.117313,0.553635,0.046303,0.174435
actor_1_facebook_likes,,0.084123,0.089717,0.252165,1.0,0.944793,0.057065,0.095866,0.390949,0.056757,0.057217
cast_total_facebook_likes,,0.120861,0.118631,0.489614,0.944793,1.0,0.080325,0.126692,0.642853,0.068771,0.112756
facenumber_in_poster,,0.027091,-0.047899,0.105984,0.057065,0.080325,1.0,0.069762,0.072864,0.018824,-0.026529
title_year,,-0.13056,-0.044568,0.117313,0.095866,0.126692,0.069762,1.0,0.122183,0.216508,-0.115179
actor_2_facebook_likes,,0.129689,0.116076,0.553635,0.390949,0.642853,0.072864,0.122183,1.0,0.063894,0.127142
aspect_ratio,,0.152889,0.037902,0.046303,0.056757,0.068771,0.018824,0.216508,0.063894,1.0,-0.059317


There is a high correlation between actor_1_facebook_likes and cast_total_facebook_likes (0.94), as well as cast_total_facebook_likes and actor_2_facebook_likes (0.64). We shall regulize the model using Ridge or Lasso to take this into account.

In [6]:
train = pd.read_csv('train.csv')

In [7]:
train.columns

Index(['director_name', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
       'gross', 'genres', 'actor_1_name', 'movie_title',
       'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster',
       'plot_keywords', 'movie_imdb_link', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'aspect_ratio', 'profit', 'gross_bin', 'action', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'history', 'horror', 'music',
       'musical', 'mystery', 'romance', 'sci_fi', 'sport', 'thriller', 'war',
       'western', 'director_high', 'director_low', 'actor_1_high',
       'actor_1_low', 'actor_2_high', 'actor_2_low', 'actor_3_high',
       'actor_3_low', 'is_english', 'profit_binary'],
      dtype='object')

In [8]:
X = df[[ 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes',  'actor_1_facebook_likes',
       'cast_total_facebook_likes',  'facenumber_in_poster',
       'actor_2_facebook_likes',
       'aspect_ratio', 'action', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'history', 'horror', 'music',
       'musical', 'mystery', 'romance', 'sci_fi', 'sport', 'thriller', 'war',
       'western', 'director_high', 'director_low', 'actor_1_high',
       'actor_1_low', 'actor_2_high', 'actor_2_low', 'actor_3_high',
       'actor_3_low', 'is_english', 'profit_binary']]

In [9]:
#looking at VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X = add_constant(X)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

for i in range(len(X.columns)):
    vif_data.loc[i,'VIF'] = variance_inflation_factor(X.values, i)

print(vif_data)
print(vif_data.loc[vif_data.VIF > 15])

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                      feature         VIF
0                       const    0.000000
1                    duration    1.469866
2     director_facebook_likes    1.069845
3      actor_3_facebook_likes    8.629169
4      actor_1_facebook_likes  209.600412
5   cast_total_facebook_likes  316.586311
6        facenumber_in_poster    1.106666
7      actor_2_facebook_likes   20.004890
8                aspect_ratio    1.092222
9                      action    1.602375
10                  adventure    1.595014
11                  animation    1.605578
12                  biography    1.260912
13                     comedy    1.788665
14                      crime    1.368476
15                documentary    1.086365
16                      drama    1.693902
17                     family    1.814367
18                    fantasy    1.262095
19                  film-noir    1.006200
20                    history    1.323985
21                     horror    1.443268
22                      music    1

31              director_high         inf

32               director_low         inf

33               actor_1_high         inf

34                actor_1_low         inf

35               actor_2_high         inf

36                actor_2_low         inf

37               actor_3_high         inf

38                actor_3_low         inf

## This indicates that there is very high multicolinearity amoung the data. This might be due to the fact that there is high correlation between high and low variables. In order to take this into account, I eliminated the low variables:

4      actor_1_facebook_likes  209.600412

5   cast_total_facebook_likes  316.586311

7      actor_2_facebook_likes   20.004890



In [10]:
X = df[[ 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes',  'actor_1_facebook_likes',
       'cast_total_facebook_likes',  'facenumber_in_poster',
       'actor_2_facebook_likes',
       'aspect_ratio', 'action', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'history', 'horror', 'music',
       'musical', 'mystery', 'romance', 'sci_fi', 'sport', 'thriller', 'war',
       'western', 'director_high',  'actor_1_high',
        'actor_2_high', 'actor_3_high',
        'is_english', 'profit_binary']]

In [11]:
#looking at VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
X = add_constant(X)
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

for i in range(len(X.columns)):
    vif_data.loc[i,'VIF'] = variance_inflation_factor(X.values, i)

print(vif_data)

                      feature         VIF
0                       const  100.426246
1                    duration    1.469866
2     director_facebook_likes    1.069845
3      actor_3_facebook_likes    8.629169
4      actor_1_facebook_likes  209.600412
5   cast_total_facebook_likes  316.586311
6        facenumber_in_poster    1.106666
7      actor_2_facebook_likes   20.004890
8                aspect_ratio    1.092222
9                      action    1.602375
10                  adventure    1.595014
11                  animation    1.605578
12                  biography    1.260912
13                     comedy    1.788665
14                      crime    1.368476
15                documentary    1.086365
16                      drama    1.693902
17                     family    1.814367
18                    fantasy    1.262095
19                  film-noir    1.006200
20                    history    1.323985
21                     horror    1.443268
22                      music    1

31              director_high    1.451423

32               actor_1_high    1.420830

33               actor_2_high    1.736591

34               actor_3_high    2.079257

## We can now see that there is less multicolinearity amongst the cast variables. We shall now use this variables in our model to see if this fixes the previous issues.