In [1]:
from datetime import datetime as dt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelEncoder, 
    OneHotEncoder, 
    StandardScaler
)

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
nrg_bom_df = pd.read_csv('./nrg_bom_df.p')
nrg_bom_df['def/pos'] = nrg_bom_df.definite_int / nrg_bom_df.positive_int

One-hot encode release month.

In [3]:
def get_month(date_str):
    date = dt.strptime(date_str, '%Y-%m-%d')
    month = date.month
    return month

nrg_bom_df['month'] = nrg_bom_df.release_date.apply(get_month)

months = nrg_bom_df.month

le = LabelEncoder()
le.fit(months)
month_labels = le.transform(months)

enc = OneHotEncoder()
enc.fit(month_labels.reshape(-1, 1))

month_ohe = enc.transform(month_labels.reshape(-1, 1)).toarray()

month_columns = ['month_' + str(x) for x in le.classes_]
nrg_bom_df[month_columns] = pd.DataFrame(month_ohe)

In [19]:
# IY. Get rid of this cell after!
nrg_bom_df

Unnamed: 0,movie,unaided_aw,definite_int,positive_int,definite_not_int,unaided_intent,total_aw,first_choice,opening_weekend_gross,production_budget,num_of_theaters_opening_weekend,release_date,def/pos,month,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_10,month_11,month_12
0,47 Meters Down,2.509812,31.604176,61.029671,4.572116,0.414572,39.184463,1.595005,11205561.0,,2270.0,2017-06-16,0.517849,6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,A Cure For Wellness,1.000000,33.000000,63.750000,5.750000,0.000000,39.000000,1.750000,4356941.0,40000000.0,2704.0,2017-02-17,0.517647,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A Dogs Purpose,4.250000,46.750000,74.250000,5.000000,1.750000,61.750000,7.500000,18222810.0,22000000.0,3059.0,2017-01-27,0.629630,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alice Through the Looking Glas,6.750000,45.000000,73.250000,4.500000,2.250000,81.250000,9.500000,26858726.0,170000000.0,3763.0,2016-05-27,0.614334,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alien Covenant,15.333333,45.000000,70.333333,6.666667,3.333333,75.000000,10.000000,36160621.0,97000000.0,3761.0,2017-05-19,0.639810,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,All Eyez On Me,5.286682,38.017271,63.906721,5.305977,1.565210,39.791328,4.292060,26435354.0,40000000.0,2471.0,2017-06-16,0.594887,6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,Allied,2.250000,34.500000,68.250000,3.750000,0.250000,50.750000,2.250000,12701743.0,85000000.0,3160.0,2016-11-23,0.505495,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,Almost Christmas,1.750000,43.500000,71.250000,5.000000,0.000000,55.000000,3.500000,15134235.0,17000000.0,2376.0,2016-11-11,0.610526,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,Annabelle Creation,9.663329,38.404342,61.092788,12.620311,2.273302,58.637532,5.495515,35006404.0,15000000.0,3502.0,2017-08-11,0.628623,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,Arrival,2.250000,40.000000,74.750000,3.000000,0.750000,49.750000,2.750000,24074047.0,47000000.0,2317.0,2016-11-11,0.535117,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
predictor_columns = ['unaided_intent', 'first_choice', 'def/pos']
predictor_columns += ['month_{}'.format(n) for n in range(1, 13)]
predictor_columns.remove('month_9')

target_column = 'opening_weekend_gross'

X = nrg_bom_df.loc[:, predictor_columns].values
y = nrg_bom_df.loc[:, target_column].values

test_size = .25

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

rfr = RandomForestRegressor(n_estimators=300)
rfr.fit(X_train, y_train)

print(rfr.score(X, y))
print(rfr.score(X_test, y_test))

test_predictions = rfr.predict(X_test)

print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

0.957745068834
0.88341869799
Test RMSE: 9524717.695552114


In [6]:
features_df = pd.DataFrame(list(zip(predictor_columns, rfr.feature_importances_)))
features_df.sort_values(1, ascending=False)

Unnamed: 0,0,1
0,unaided_intent,0.506562
2,def/pos,0.268357
1,first_choice,0.192004
9,month_7,0.006889
5,month_3,0.006575
7,month_5,0.005354
8,month_6,0.004286
6,month_4,0.002573
13,month_12,0.002534
12,month_11,0.001806


#### Look at peak vs non-peak months.

In [6]:
non_peak_months = [1, 2, 3, 4, 8, 9, 10]

non_peak_df = nrg_bom_df.loc[
    nrg_bom_df.month.isin(non_peak_months)
]
peak_df = nrg_bom_df.loc[
    ~nrg_bom_df.month.isin(non_peak_months)
]

X_non_peak = non_peak_df.loc[:, predictor_columns]
y_non_peak = non_peak_df.loc[:, target_column]
X_peak = peak_df.loc[:, predictor_columns]
y_peak = peak_df.loc[:, target_column]

non_peak_predictions = rfr.predict(X_non_peak)
peak_predictions = rfr.predict(X_peak)

non_peak_rmse = np.sqrt(mse(y_non_peak, non_peak_predictions))
print(non_peak_rmse)

peak_rmse = np.sqrt(mse(y_peak, peak_predictions))
print(peak_rmse)

7473954.96187
7966209.95591
