In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
metrics_df = pd.read_csv('./rf_data_1.csv')

In [3]:
# Scale the BO opens. 
scaler = StandardScaler()
bo_opens = metrics_df.opening_weekend_gross.values.reshape(-1, 1)
scaler.fit(bo_opens)

bo_opens_scaled = scaler.transform(bo_opens)
bo_opens_scaled = bo_opens_scaled.reshape(metrics_df.index.size)

metrics_df['bo_open_scaled'] = bo_opens_scaled

In [4]:
predictor_columns = list(metrics_df.columns[1:-2])
target_column = 'bo_open_scaled'

X = metrics_df.loc[:, predictor_columns].values
y = metrics_df.loc[:, target_column].values

#### Train random forest regressor.

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [27]:
rfr = RandomForestRegressor(n_estimators=400)
rfr.fit(X_train, y_train)

print(rfr.score(X, y))
print(rfr.score(X_test, y_test))

actual = scaler.inverse_transform(y_test)
test_predictions = rfr.predict(X_test)
test_predictions = scaler.inverse_transform(test_predictions)

print('Test RMSE: {}'.format(np.sqrt(mse(actual, test_predictions))))

0.831094871701
0.685346093564
Test RMSE: 11874096.921333175


#### Feature importances.

In [26]:
features_df = pd.DataFrame(list(zip(predictor_columns, rfr.feature_importances_)))
features_df.sort_values(1, ascending=False)

Unnamed: 0,0,1
23,def/pos_11,0.173472
12,def/pos_0,0.101578
70,definite_interest_10,0.075145
8,unaided_intent_8,0.046848
82,positive_interest_10,0.037042
62,definite_interest_2,0.032992
11,unaided_intent_11,0.028683
22,def/pos_10,0.026532
40,unaided_awareness_4,0.025653
10,unaided_intent_10,0.02456
