In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [2]:
# Load cleaned_data.csv into `df` and display its column names.
df = pd.read_csv('cleaned_data.csv')
df.columns

Index(['director_name', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
       'gross', 'actor_1_name', 'movie_title', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'language', 'country', 'content_rating', 'budget',
       'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'profit',
       'gross_bin', 'action', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
       'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi',
       'sport', 'thriller', 'war', 'western', 'director_high', 'director_low',
       'actor_1_high', 'actor_1_low', 'actor_2_high', 'actor_2_low',
       'actor_3_high', 'actor_3_low', 'is_english', 'profit_binary'],
      dtype='object')

In [3]:
# Numeric columns to screen for multicollinearity in the VIF step below.
vif_screen_cols = [
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_1_facebook_likes',
    'cast_total_facebook_likes',
    'facenumber_in_poster',
    'title_year',
    'actor_2_facebook_likes',
    'aspect_ratio',
    'profit',
]
X = df[vif_screen_cols]

In [4]:
# Variance inflation factor (VIF) for every column of X.
# A constant column is appended first so the design matrix passed to
# variance_inflation_factor carries an explicit intercept term.
X = add_constant(X)

# Build the array once; it is identical for every column being scored.
design_matrix = X.values

# One VIF per column, assembled in a single DataFrame construction instead of
# growing the frame cell-by-cell with .loc.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

print(vif_data)

                      feature           VIF
0                       const  45633.468439
1                    duration      1.112283
2     director_facebook_likes      1.054721
3      actor_3_facebook_likes      8.363941
4      actor_1_facebook_likes    201.815449
5   cast_total_facebook_likes    304.323437
6        facenumber_in_poster      1.020413
7                  title_year      1.117374
8      actor_2_facebook_likes     19.341341
9                aspect_ratio      1.088387
10                     profit      1.007980


There are high VIF values for actor_1_facebook_likes (≈202), cast_total_facebook_likes (≈304), and actor_2_facebook_likes (≈19).

## Looking at correlation table:

In [5]:
# Pairwise correlations between the columns of X. The 'const' column added for
# the VIF step has zero variance, so its row and column come out as NaN.
X.corr()

Unnamed: 0,const,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,facenumber_in_poster,title_year,actor_2_facebook_likes,aspect_ratio,profit
const,,,,,,,,,,,
duration,,1.0,0.180659,0.125808,0.083661,0.12021,0.026967,-0.130258,0.128955,0.152966,0.0083
director_facebook_likes,,0.180659,1.0,0.117414,0.089655,0.118573,-0.04802,-0.044704,0.116076,0.037482,0.024458
actor_3_facebook_likes,,0.125808,0.117414,1.0,0.25242,0.4898,0.106213,0.117428,0.55376,0.046322,0.052425
actor_1_facebook_likes,,0.083661,0.089655,0.25242,1.0,0.944816,0.05746,0.096089,0.391188,0.056862,0.027909
cast_total_facebook_likes,,0.12021,0.118573,0.4898,0.944816,1.0,0.080765,0.126927,0.643003,0.06886,0.043866
facenumber_in_poster,,0.026967,-0.04802,0.106213,0.05746,0.080765,1.0,0.069937,0.073149,0.018864,0.011518
title_year,,-0.130258,-0.044704,0.117428,0.096089,0.126927,0.069937,1.0,0.12233,0.216728,-0.029582
actor_2_facebook_likes,,0.128955,0.116076,0.55376,0.391188,0.643003,0.073149,0.12233,1.0,0.06389,0.042224
aspect_ratio,,0.152966,0.037482,0.046322,0.056862,0.06886,0.018864,0.216728,0.06389,1.0,-0.005286


There is a high correlation between actor_1_facebook_likes and cast_total_facebook_likes (0.94), as well as between cast_total_facebook_likes and actor_2_facebook_likes (0.64). We shall regularize the model using Ridge or Lasso to take this into account.

In [6]:
# Load train.csv (presumably the training split — confirm against the data prep step).
train = pd.read_csv('train.csv')

In [7]:
# Display the column names available in `train`.
train.columns

Index(['director_name', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
       'gross', 'genres', 'actor_1_name', 'movie_title',
       'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster',
       'plot_keywords', 'movie_imdb_link', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'aspect_ratio', 'profit', 'gross_bin', 'action', 'adventure',
       'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama',
       'family', 'fantasy', 'film-noir', 'history', 'horror', 'music',
       'musical', 'mystery', 'romance', 'sci_fi', 'sport', 'thriller', 'war',
       'western', 'director_high', 'director_low', 'actor_1_high',
       'actor_1_low', 'actor_2_high', 'actor_2_low', 'actor_3_high',
       'actor_3_low', 'is_english', 'profit_binary'],
      dtype='object')

In [8]:
# Expanded feature set for the second VIF pass, assembled from grouped lists
# (column order is the same as a single flat list would give).
# NOTE(review): `train` was loaded just above, but the features are still
# taken from `df` — confirm which frame is intended here.
numeric_cols = [
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_1_facebook_likes',
    'cast_total_facebook_likes',
    'facenumber_in_poster',
    'actor_2_facebook_likes',
    'aspect_ratio',
]
# presumably one indicator column per genre — verify against the data prep step
genre_cols = [
    'action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
    'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'history',
    'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'sport',
    'thriller', 'war', 'western',
]
# paired high/low columns for the director and each billed actor
high_low_cols = [
    'director_high', 'director_low',
    'actor_1_high', 'actor_1_low',
    'actor_2_high', 'actor_2_low',
    'actor_3_high', 'actor_3_low',
]
other_cols = ['is_english', 'profit_binary']

X = df[numeric_cols + genre_cols + high_low_cols + other_cols]

In [9]:
# Variance inflation factor (VIF) for every column of the expanded X.
# A constant column is appended first so the design matrix passed to
# variance_inflation_factor carries an explicit intercept term.
X = add_constant(X)

# Build the array once; it is identical for every column being scored.
design_matrix = X.values

# One VIF per column, assembled in a single DataFrame construction instead of
# growing the frame cell-by-cell with .loc.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

print(vif_data)

# Second printout: only the features whose VIF exceeds the cut-off
# (infinite VIFs are included, since inf > threshold).
VIF_THRESHOLD = 15
print(vif_data.loc[vif_data["VIF"] > VIF_THRESHOLD])

  return 1 - self.ssr/self.centered_tss


                      feature         VIF
0                       const    0.000000
1                    duration    1.465983
2     director_facebook_likes    1.069819
3      actor_3_facebook_likes    8.629599
4      actor_1_facebook_likes  209.638774
5   cast_total_facebook_likes  316.703639
6        facenumber_in_poster    1.106950
7      actor_2_facebook_likes   20.008131
8                aspect_ratio    1.092236
9                      action    1.604189
10                  adventure    1.595073
11                  animation    1.599686
12                  biography    1.260675
13                     comedy    1.784374
14                      crime    1.367508
15                documentary    1.086076
16                      drama    1.690636
17                     family    1.808023
18                    fantasy    1.262247
19                  film-noir    1.006198
20                    history    1.321349
21                     horror    1.439738
22                      music    1

  vif = 1. / (1. - r_squared_i)


31              director_high         inf

32               director_low         inf

33               actor_1_high         inf

34                actor_1_low         inf

35               actor_2_high         inf

36                actor_2_low         inf

37               actor_3_high         inf

38                actor_3_low         inf

## This indicates that there is very high multicollinearity among the data. This might be due to the fact that there is high correlation between the high and low variables (an infinite VIF means a column is an exact linear combination of the other columns). In order to take this into account, I eliminated the low variables:

4      actor_1_facebook_likes  209.600412

5   cast_total_facebook_likes  316.586311

7      actor_2_facebook_likes   20.004890



In [10]:
# Feature set for the third VIF pass: same as the previous selection but
# without the *_low columns (column order is unchanged otherwise).
numeric_cols = [
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_1_facebook_likes',
    'cast_total_facebook_likes',
    'facenumber_in_poster',
    'actor_2_facebook_likes',
    'aspect_ratio',
]
# presumably one indicator column per genre — verify against the data prep step
genre_cols = [
    'action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
    'documentary', 'drama', 'family', 'fantasy', 'film-noir', 'history',
    'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'sport',
    'thriller', 'war', 'western',
]
# only the *_high member of each former high/low pair is kept
high_cols = ['director_high', 'actor_1_high', 'actor_2_high', 'actor_3_high']
other_cols = ['is_english', 'profit_binary']

X = df[numeric_cols + genre_cols + high_cols + other_cols]

In [11]:
# Variance inflation factor (VIF) for every column of the reduced X.
# A constant column is appended first so the design matrix passed to
# variance_inflation_factor carries an explicit intercept term.
X = add_constant(X)

# Build the array once; it is identical for every column being scored.
design_matrix = X.values

# One VIF per column, assembled in a single DataFrame construction instead of
# growing the frame cell-by-cell with .loc.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

print(vif_data)

                      feature         VIF
0                       const   99.425530
1                    duration    1.465983
2     director_facebook_likes    1.069819
3      actor_3_facebook_likes    8.629599
4      actor_1_facebook_likes  209.638774
5   cast_total_facebook_likes  316.703639
6        facenumber_in_poster    1.106950
7      actor_2_facebook_likes   20.008131
8                aspect_ratio    1.092236
9                      action    1.604189
10                  adventure    1.595073
11                  animation    1.599686
12                  biography    1.260675
13                     comedy    1.784374
14                      crime    1.367508
15                documentary    1.086076
16                      drama    1.690636
17                     family    1.808023
18                    fantasy    1.262247
19                  film-noir    1.006198
20                    history    1.321349
21                     horror    1.439738
22                      music    1

31              director_high    1.451423

32               actor_1_high    1.420830

33               actor_2_high    1.736591

34               actor_3_high    2.079257

## We can now see that there is less multicollinearity amongst the cast variables. We shall now use these variables in our model to see if this fixes the previous issues.