In [9]:
# !pip3 install -U scikit-learn scipy matplotlib


In [12]:
%matplotlib inline

from matplotlib import pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Turn off pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)
# Let displayed frames render up to 200 rows before being truncated.
pd.options.display.max_rows = 200

In [13]:
# Read the table from disk (the file is parsed as CSV despite its `.p` suffix).
nrg_bom_df = pd.read_csv('./nrg_bom_df.p')

# Get rid of bad rows: keep only titles that opened in more than one theater.
opened_in_multiple_theaters = nrg_bom_df['num_of_theaters_opening_weekend'] > 1
nrg_bom_df = nrg_bom_df.loc[opened_in_multiple_theaters]

# Derived feature: definite_int divided by positive_int, row by row.
nrg_bom_df['def/pos'] = nrg_bom_df['definite_int'] / nrg_bom_df['positive_int']

In [14]:
# Display the filtered table ordered from smallest to largest opening-weekend gross.
nrg_bom_df.sort_values('opening_weekend_gross', ascending=True)

Unnamed: 0,movie,unaided_aw,definite_int,positive_int,definite_not_int,unaided_intent,total_aw,first_choice,opening_weekend_gross,production_budget,num_of_theaters_opening_weekend,release_date,def/pos
118,The Neon Demon,0.0,34.0,63.0,8.0,0.0,25.5,0.5,589014.0,,783.0,2016-06-24,0.539683
18,Billy Lynns Long Halftime Walk,0.0,32.5,67.75,4.5,0.0,31.75,1.0,901062.0,,1176.0,2016-11-11,0.479705
123,The Resurrection Of Gavin Ston,0.0,33.0,61.5,5.5,0.0,26.5,0.0,1206771.0,,890.0,2017-01-20,0.536585
97,Swiss Army Man,0.0,29.0,62.75,6.0,0.0,30.5,1.0,1414751.0,,636.0,2016-06-24,0.462151
24,Collide,0.5,28.75,65.5,3.25,0.0,33.25,0.75,1512824.0,,2045.0,2017-02-24,0.438931
87,Rules Dont Apply,0.0,31.25,67.75,3.5,0.0,34.0,1.25,1589625.0,25000000.0,2382.0,2016-11-23,0.461255
78,Phoenix Forgotten,0.0,33.5,70.25,3.25,0.0,20.75,0.75,1816499.0,,1626.0,2017-04-21,0.476868
67,Miss Sloane,0.0,30.0,66.75,3.5,0.0,31.0,1.0,1844972.0,,1648.0,2016-11-25,0.449438
89,Silence,1.0,32.0,68.75,2.75,0.0,39.75,1.75,1984530.0,,747.0,2016-12-23,0.465455
19,Bleed for This,0.25,31.0,68.5,4.75,0.0,27.75,1.0,2366810.0,6000000.0,1549.0,2016-11-18,0.452555


In [16]:
# Fit a k-nearest-neighbours regressor that predicts opening-weekend gross
# from four predictor columns, then print fit metrics.
# Work on a copy so the loaded frame is not modified by the modelling columns.
df = nrg_bom_df.copy()
df['def/pos_sq'] = df['def/pos'] * df['def/pos']

predictor_columns = [
    'unaided_intent',
    'first_choice',
    'def/pos',
    'def/pos_sq'
]
target_column = 'opening_weekend_gross'

X = df.loc[:, predictor_columns].values
y = df.loc[:, target_column].values

# Fix the seed: without it the train/test split -- and therefore every metric
# printed below -- changes on each run, so the results cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)

# NOTE(review): the predictors are passed to the regressor unscaled although
# their ranges differ; consider standardising them first -- confirm intent.
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# Predictions for every row (training rows included) are stored on the frame;
# predictions for the held-out rows are kept separately.
df['prediction'] = knn.predict(X)
test_predictions = knn.predict(X_test)

# Aliases kept so any other cell that refers to y1 / y2 keeps working.
y1 = y
y2 = df.prediction.values

# The first two metrics are computed over ALL rows, including the rows the
# model was fitted on; only the last one uses the held-out 25%.
print('R^2: {}'.format(knn.score(X, y)))
print('RMSE: {}'.format(np.sqrt(mse(y1, y2))))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.8472007150985956
RMSE: 12594130.853059301
Test RMSE: 12876123.811197562


In [17]:
# Residual = actual opening-weekend gross minus the model's prediction, so a
# negative value means the model predicted more than the film actually made.
df['residual'] = df['opening_weekend_gross'].sub(df['prediction'])

# Ascending sort: the most over-predicted titles are listed first.
df.sort_values(by='residual')

Unnamed: 0,movie,unaided_aw,definite_int,positive_int,definite_not_int,unaided_intent,total_aw,first_choice,opening_weekend_gross,production_budget,num_of_theaters_opening_weekend,release_date,def/pos,def/pos_sq,prediction,residual
79,Pirates of the Caribbean Dead,18.666667,56.0,79.666667,3.333333,5.0,94.333333,15.333333,62983253.0,230000000.0,4276.0,2017-05-26,0.702929,0.494109,97815613.6,-34832360.6
138,X-Men Apocalypse,13.0,62.25,83.25,3.5,6.5,86.5,15.75,65769562.0,178000000.0,4150.0,2016-05-27,0.747748,0.559127,97815613.6,-32046051.6
14,Bad Santa 2,4.75,41.0,69.0,8.5,1.75,76.25,7.25,6176680.0,26000000.0,2920.0,2016-11-23,0.594203,0.353077,37613514.8,-31436834.8
56,King Arthur Legend of the Swor,9.333333,42.666667,71.666667,3.0,2.0,73.0,7.0,15371270.0,175000000.0,3702.0,2017-05-12,0.595349,0.35444,42582883.8,-27211613.8
10,Assassins Creed,7.0,47.0,75.0,5.0,2.0,78.0,9.0,10278225.0,125000000.0,2970.0,2016-12-21,0.626667,0.392711,33767733.0,-23489508.0
90,Sing,9.0,48.0,74.0,5.0,3.0,63.0,7.0,35258145.0,75000000.0,4022.0,2016-12-21,0.648649,0.420745,57312006.0,-22053861.0
76,Passengers,8.0,46.0,79.0,3.0,3.0,68.0,6.0,14869736.0,110000000.0,3478.0,2016-12-21,0.582278,0.339048,33660126.6,-18790390.6
41,Girls Trip,10.24598,32.928113,57.072471,10.346604,2.631495,64.275872,6.530148,31201920.0,19000000.0,2591.0,2017-07-21,0.576953,0.332874,48193032.6,-16991112.6
15,Baywatch,18.5,32.75,59.25,10.25,3.0,87.75,6.0,18503871.0,69000000.0,3647.0,2017-05-25,0.552743,0.305524,33660126.6,-15156255.6
128,Transformers The Last Knight,20.032918,44.857529,68.265368,8.055388,3.076106,89.566543,7.320225,44680073.0,217000000.0,4069.0,2017-06-21,0.657105,0.431787,58591249.6,-13911176.6
