In [118]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [94]:
## Unpickle the data
## This is cleaned data with all features, including dummies for categorical variables.
df = pd.read_pickle('intial_data_all_features')
df.head()

Unnamed: 0_level_0,domestic_gross,international_gross,opening_dollars,running_time_mins,days_in_release,widest_release_in_theatres,franchise,title_length,Drama,Comedy,...,Unrated,2016,2017,2018,2019,2020,2021,Spring,Summer,Winter
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Spider-Man: No Way Home,750429025.0,1027000000.0,260138569.0,148,54,4336,1,23,0,0,...,0,0,0,0,0,0,1,0,0,1
Shang-Chi and the Legend of the Ten Rings,224543292.0,207689700.0,75388688.0,132,159,4300,1,41,0,0,...,0,0,0,0,0,0,1,0,0,0
Venom: Let There Be Carnage,213550366.0,288500000.0,90033210.0,97,131,4225,0,27,0,0,...,0,0,0,0,0,0,1,0,0,0
Black Widow,183651655.0,195979700.0,80366312.0,134,215,4275,1,11,0,0,...,0,0,0,0,0,0,1,0,1,0
F9: The Fast Saga,173005945.0,553223600.0,70043165.0,143,229,4203,1,17,0,0,...,0,0,0,0,0,0,1,0,1,0


In [100]:
## Due to the many catagorial variables, I ended up with 1176 observations of 156 features
## (and one target variable).
df.shape

(1176, 157)

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1176 entries, Spider-Man: No Way Home to The Other Side of the Door
Columns: 157 entries, domestic_gross to Winter
dtypes: float64(3), int64(26), uint8(128)
memory usage: 422.6+ KB


In [102]:
## Separate target and other features
X = df.drop(columns = ['international_gross'])
y = df['international_gross']
y.head()

name
Spider-Man: No Way Home                      1.027000e+09
Shang-Chi and the Legend of the Ten Rings    2.076897e+08
Venom: Let There Be Carnage                  2.885000e+08
Black Widow                                  1.959797e+08
F9: The Fast Saga                            5.532236e+08
Name: international_gross, dtype: float64

In [103]:
X.shape

(1176, 156)

In [104]:
y.shape
#X.dtypes

(1176,)

In [105]:
## Split into train/validate/test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .25, random_state = 3)

In [106]:
X_train.shape, y_train.shape

((940, 156), (940,))

In [107]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [108]:
predictions = model.predict(X_train)
mean_absolute_error(y_train, predictions)

42139559.79708731

In [109]:
model.score(X_train, y_train)

0.8282652351052222

In [110]:
model.intercept_

-123999675.17849867

In [111]:
model.coef_

array([ 1.10829653e+00,  1.14554403e+00,  6.98325108e+05,  8.50033733e+04,
       -2.04065179e+03,  1.95220476e+07, -3.51469557e+05, -9.72651785e+06,
       -1.56236406e+07,  4.69589872e+06,  1.17248531e+07,  3.21994459e+07,
       -1.90598189e+06, -1.86796760e+07,  7.55457608e+05,  2.07596167e+07,
        7.79120901e+06, -4.15123166e+06, -2.33035807e+07,  5.43862650e+06,
        8.62518668e+06, -5.61541604e+06,  2.56473423e+07, -2.54930414e+06,
        3.14991991e+06, -7.52435005e+06, -1.00450861e+07, -5.68732392e+06,
        1.16474789e+07, -3.07017779e+06, -3.68797170e+06,  9.58454248e+06,
        4.86425927e+05,  1.93159963e+07,  5.65606976e+06,  1.30385160e-04,
       -2.99203788e+03, -2.43297162e+06,  7.58652511e+06,  1.08730839e+07,
        3.83102674e+07, -9.91784195e+05, -3.94135714e-05,  3.56532232e+06,
       -2.89012475e+07,  6.46343115e+08, -5.22710654e+06, -3.99339120e+07,
        2.17424767e+07, -5.33328863e+06,  7.72997737e-06,  2.14720063e+07,
        1.16390910e+07,  

In [112]:
lm = LinearRegression()
kf = KFold(n_splits = 5, shuffle = True, random_state = 6)
lm_cv = cross_val_score(lm, X_train, y_train, cv = kf, scoring = 'r2')
print(lm_cv)

[0.77329668 0.6094725  0.84569344 0.68019479 0.77257538]


In [113]:
## A lot of variation in these scores looks like overfitting!

In [114]:
lm_cv_mae = cross_val_score(lm, X_train, y_train, cv = kf, scoring = 'neg_mean_absolute_error')
print(lm_cv_mae)

[-52990227.15342609 -53969330.70963119 -46295605.69862169
 -55904136.8285153  -51790258.17496086]


## VIF

In [116]:
## Check VIF becasue I know from MVP that there may be some multicollinearity issues.
## Code adapted from Collinearity slides

VIF_x = X_train
VIF_x = sm.add_constant(X_train)
VIF_y = y_train
sm_model = sm.OLS(VIF_y, VIF_x)
results = sm_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,international_gross,R-squared:,0.828
Model:,OLS,Adj. R-squared:,0.797
Method:,Least Squares,F-statistic:,26.2
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,5.63e-224
Time:,16:35:24,Log-Likelihood:,-18364.0
No. Observations:,940,AIC:,37020.0
Df Residuals:,793,BIC:,37730.0
Df Model:,146,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.24e+08,6.79e+07,-1.826,0.068,-2.57e+08,9.28e+06
domestic_gross,1.1083,0.107,10.354,0.000,0.898,1.318
opening_dollars,1.1455,0.319,3.593,0.000,0.520,1.771
running_time_mins,6.983e+05,2.2e+05,3.173,0.002,2.66e+05,1.13e+06
days_in_release,8.5e+04,2.61e+04,3.261,0.001,3.38e+04,1.36e+05
widest_release_in_theatres,-2040.6523,4382.447,-0.466,0.642,-1.06e+04,6561.916
franchise,1.952e+07,9.13e+06,2.138,0.033,1.6e+06,3.74e+07
title_length,-3.515e+05,3.64e+05,-0.964,0.335,-1.07e+06,3.64e+05
Drama,-9.727e+06,7.99e+06,-1.218,0.224,-2.54e+07,5.95e+06

0,1,2,3
Omnibus:,450.437,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8288.128
Skew:,1.731,Prob(JB):,0.0
Kurtosis:,17.129,Cond. No.,8.5e+24


In [120]:
variables = results.model.exog
vif = [variance_inflation_factor(variables, i) for i in range(variables.shape[1])]
vif

  return 1 - self.ssr/self.centered_tss


[670.536495328843,
 15.257596924925682,
 14.71040186360107,
 2.492221075659603,
 2.2501347593324774,
 5.364976658790342,
 2.0230891792390113,
 1.6032990829352607,
 2.307225772148774,
 1.9646845861867375,
 2.4315130807738314,
 2.3182768301038883,
 2.9391386392625454,
 1.8889651233674043,
 1.675620299970943,
 1.7484530702816872,
 5.167991068455112,
 1.5512045419335279,
 1.658105350927591,
 2.2191544436196855,
 1.642987220548234,
 3.1636042263903845,
 1.4956379836305296,
 1.3556318017503972,
 3.128210632843584,
 1.377688552904644,
 1.3475827630392183,
 1.5117658919078774,
 1.1930944339939036,
 1.6847241315354893,
 3.251990011337855,
 1.3204785585609908,
 8.642756325883452,
 1.6849849209094312,
 3.2448906081104583,
 2.9031554226977097,
 nan,
 1.8898605165868685,
 1.9904443381140557,
 3.268003416426693,
 7.773561275234182,
 1.3859919895821318,
 1.3874798024418098,
 nan,
 2.3107828554544096,
 1.412787852998325,
 1.831977747573928,
 1.2986555653425162,
 1.29471424118413,
 1.3589200225653526,


In [None]:
## Remove -- years, opening dollars to see if this reduces multicollinearity/fixes VIF issues