In [1]:
%matplotlib inline

from random import sample
import re

from matplotlib import pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 200)

#### Get the train/test data.

For linear regression.

In [2]:
nrg_bom_df = pd.read_csv('./nrg_bom_df.p')

# Get rid of bad rows.
nrg_bom_df = nrg_bom_df.loc[
    nrg_bom_df.num_of_theaters_opening_weekend > 1
]

nrg_bom_df['def/pos'] = nrg_bom_df.definite_int / nrg_bom_df.positive_int

For random forest.

In [3]:
metrics_df = pd.read_csv('./rf_data_1.csv')

# Get rid of bad row.
metrics_df = metrics_df.loc[
    ~((metrics_df.movie == 'Beauty And The Beast') &
    (metrics_df.opening_weekend_gross < 10000.))
]

Merge the data sets.

In [4]:
metrics_df = nrg_bom_df.merge(metrics_df)

# IY. Perhaps scaling is not necessary.
# Scale the BO opens. 
scaler = StandardScaler()
bo_opens = metrics_df.opening_weekend_gross.values.reshape(-1, 1)
scaler.fit(bo_opens)

bo_opens_scaled = scaler.transform(bo_opens)
bo_opens_scaled = bo_opens_scaled.reshape(metrics_df.index.size)

metrics_df['bo_open_scaled'] = bo_opens_scaled

# Get def/pos squared.
metrics_df['def/pos_sq'] = metrics_df['def/pos'] * metrics_df['def/pos']

Get the predictor columns for lr and rf.

In [5]:
lr_predictor_columns = [
    'unaided_intent',
    'first_choice',
    'def/pos',
    'def/pos_sq'
]

rf_predictor_columns = []
for column in metrics_df.columns:
    pattern = '_\d+'
    match = re.search(pattern, column)
    if match:
        rf_predictor_columns.append(column)

target_column = 'opening_weekend_gross'

Get train/test sets for lr and rf.

In [6]:
N = metrics_df.index.size
TEST_SIZE = round(N * .25)

test_ixs = sample(list(range(N)), TEST_SIZE)
train_ixs = list(set(range(N)) - set(test_ixs))

train_df = metrics_df.iloc[train_ixs]
test_df = metrics_df.iloc[test_ixs]

X_train_lr = train_df.loc[:, lr_predictor_columns].values
X_test_lr = test_df.loc[:, lr_predictor_columns].values

X_train_rf = train_df.loc[:, rf_predictor_columns].values
X_test_rf = test_df.loc[:, rf_predictor_columns].values

y_train = train_df.loc[:, target_column].values
y_test = test_df.loc[:, target_column].values

X_lr = np.concatenate((X_train_lr, X_test_lr))
X_rf = np.concatenate((X_train_rf, X_test_rf))
y = np.concatenate((y_train, y_test))

#### Train linear regression.

In [24]:
lr = LinearRegression()
lr.fit(X_train_lr, y_train)

test_predictions = lr.predict(X_test_lr)

print('R^2: {}'.format(lr.score(X_lr, y)))
print('Test R^2: {}'.format(lr.score(X_test_lr, y_test)))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.8844650456693388
Test R^2: 0.8836578388710788
Test RMSE: 12963230.935446262


#### Train random forest.

In [55]:
rfr = RandomForestRegressor(n_estimators=250)
rfr.fit(X_train_rf, y_train)

test_predictions = rfr.predict(X_test_rf)

print('R^2: {}'.format(rfr.score(X_rf, y)))
print('Test R^2: {}'.format(rfr.score(X_test_rf, y_test)))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.9247450297720723
Test R^2: 0.7799856416820695
Test RMSE: 14487147.805829288


#### Train knn regressor.

Use lr predictors.

In [42]:
knn_lr = KNeighborsRegressor()
knn_lr.fit(X_train_lr, y_train)

test_predictions = knn_lr.predict(X_test_lr)

print('R^2: {}'.format(knn_lr.score(X_lr, y)))
print('Test R^2: {}'.format(knn_lr.score(X_test_lr, y_test)))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.828809235973071
Test R^2: 0.731763620854236
Test RMSE: 15996177.624984656


In [60]:
pickle.dump(knn_lr, open('./knn_lr.p', 'wb'))

Use rf predictors.

In [27]:
knn_rf = KNeighborsRegressor()
knn_rf.fit(X_train_rf, y_train)

test_predictions = knn_rf.predict(X_test_rf)

print('R^2: {}'.format(knn_rf.score(X_rf, y)))
print('Test R^2: {}'.format(knn_rf.score(X_test_rf, y_test)))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.7680382136904574
Test R^2: 0.7561771050821549
Test RMSE: 15250870.053728005


Use all predictors.

In [28]:
X_train = np.concatenate((X_train_lr, X_train_rf), axis=1)
X_test = np.concatenate((X_test_lr, X_test_rf), axis=1)
X = np.concatenate((X_train, X_test))

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

test_predictions = knn.predict(X_test)

print('R^2: {}'.format(knn.score(X, y)))
print('Test R^2: {}'.format(knn.score(X_test, y_test)))
print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

R^2: 0.7667185563811508
Test R^2: 0.7504300381383696
Test RMSE: 15429559.776140347


#### Average all predictions.

In [29]:
test_predictions_lr = lr.predict(X_test_lr)
test_predictions_rfr = rfr.predict(X_test_rf)
test_predictions_knn = knn_lr.predict(X_test_lr)

In [33]:
test_predictions = (
    (test_predictions_lr +
     test_predictions_rfr +
     test_predictions_knn) / 3.
)

print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

Test RMSE: 12963230.935446262


In [35]:
test_predictions = (
    (test_predictions_lr +
     test_predictions_rfr) / 2.
)

print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

Test RMSE: 12010291.156839198


In [36]:
test_predictions = (
    (test_predictions_lr +
     test_predictions_knn) / 2.
)

print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

Test RMSE: 12698290.061664687


In [37]:
test_predictions = (
     (test_predictions_rfr +
     test_predictions_knn) / 3.
)

print('Test RMSE: {}'.format(np.sqrt(mse(y_test, test_predictions))))

Test RMSE: 21273023.6766243
