In [38]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
# Render floats without decimals in every DataFrame/Series display below.
pd.set_option('display.float_format', '{:.0f}'.format)

In [39]:
# Feature table produced by the data-cleaning step, addressed relative to this notebook.
# NOTE: the preview of this frame shows leftover 'Unnamed: 0' / 'Unnamed: 0.1'
# index columns from earlier saves; they are not selected as model features.
features_path = "../data_cleaning/features.csv"
data = pd.read_csv(features_path)

In [40]:
# Preview the loaded frame; pandas truncates the display to the first and last rows/columns.
data

Unnamed: 0.1,Unnamed: 0,index,id,release_date,runtime,release_month,adj_revenue,Documentary,week_num,adj_revenue_millions,...,December,February,January,July,June,March,May,November,October,September
0,0,0,27205,2010-07-15,148,July,1165491209,0,28,1165,...,False,False,False,True,False,False,False,False,False,False
1,1,1,157336,2014-11-05,169,November,914603498,0,45,915,...,False,False,False,False,False,False,False,True,False,False
2,2,2,155,2008-07-16,152,July,1405648374,0,29,1406,...,False,False,False,True,False,False,False,False,False,False
3,5,5,293660,2016-02-09,108,February,1016526293,0,6,1017,...,False,True,False,False,False,False,False,False,False,False
4,7,7,550,1999-10-15,139,October,184552175,0,41,185,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11554,11568,14364,1079863,2007-10-12,105,October,4419377,0,41,4,...,False,False,False,False,False,False,False,False,True,False
11555,11569,14369,1105011,1977-03-26,87,March,1034585,0,12,1,...,False,False,False,False,False,True,False,False,False,False
11556,11570,14375,953480,1988-09-10,86,September,841500,0,36,1,...,False,False,False,False,False,False,False,False,False,True
11557,11571,14376,721807,2006-11-24,135,November,1481664,0,47,1,...,False,False,False,False,False,False,False,True,False,False


In [41]:
# Summary statistics of the prediction target (inflation-adjusted revenue, in millions).
data.loc[:, 'adj_revenue_millions'].describe()

count   11559
mean       95
std       198
min         1
25%         4
50%        21
75%        90
max      1995
Name: adj_revenue_millions, dtype: float64

Here I was going to attempt to use a time series model for prediction, but I couldn't figure out how to use it properly. The autocorrelation seems to be insignificant, so I'm not sure that the predictions would have any validity anyway.

In [42]:
# Disabled time-series experiment, kept for reference only.
# NOTE(review): the date format originally read "%Y-%M-%d". "%M" is the strptime
# code for minutes, not months ("%m"), so the month was never read from the
# string and the monthly series built below would have been wrong. The format
# is corrected here in case this cell is revived.
# test = data[['release_date','adj_revenue_millions']]
# test['release_date'] = pd.to_datetime(test['release_date'], format="%Y-%m-%d")
# test  = test.set_index('release_date')
# test.index = pd.DatetimeIndex(test.index).to_period('M')
# #test.index = test.index.to_timestamp()
# test = test.sort_index()
# autocorrelation_plot(test)
# plt.show()

In [43]:
# Disabled ARIMA experiment, kept for reference only. It fits an ARIMA model of
# order (10, 1, 0) on the `test` series built in the disabled cell above, so that
# cell must be re-enabled first if this one is revived.
# from statsmodels.tsa.arima.model import ARIMA
#
# model = ARIMA(test, order=(10,1,0))
# model_fit = model.fit()

In [44]:
# Disabled: would print the summary of the ARIMA fit from the disabled cell above.
# print(model_fit.summary())

I am going to use a random forest regressor to try to predict revenue based on genre, runtime, release month, and the number of common words in the synopsis. I will then save the model, and in the website I will run the model 12 times, once for each release month, keeping the other parameters fixed to the user's input, and return the release month for which the model predicted the highest revenue.

In [45]:
# Feature columns, grouped by kind. The concatenation order below is the column
# order the model is trained on, so any later caller of the model has to
# supply its inputs in this same order.
genre_columns = [
    'Documentary',
    'action_adv_war_west',
    'horror_thriller',
    'family_animate',
    'scifi_fantasy',
    'hist_drama',
    'crime_mystery',
    'comedy_romance_music',
]
month_columns = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
feature_columns = ['runtime', *genre_columns, 'common_word_count', *month_columns]

X = data[feature_columns]
# Target: the adjusted-revenue column expressed in millions.
y = data['adj_revenue_millions']

In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every metric reported below, reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

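The distribution of the revenue data is heavily right-skewed (median of 21 million versus a mean of 95 million) and has some extreme outliers (the maximum is 1,995 million). Random forest regression is relatively tolerant of outliers, though, so we don't have to worry about them here.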

In [47]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a shallow forest (100 trees, depth 2) fitted on the training
# split. random_state is fixed so the bootstrap samples and feature draws, and
# hence the baseline metrics, are the same on every run.
model = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=42)

model.fit(X_train, y_train)

# Predictions on the held-out rows, consumed by the metrics cell that follows.
y_pred = model.predict(X_test)

In [48]:
from sklearn import metrics

# Error metrics for the baseline predictions on the held-out split.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

# MAPE as a fraction: mean of |error| / |actual| over the test rows.
relative_error = np.abs((y_test - y_pred) / np.abs(y_test))
mape = np.mean(relative_error)
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
# "Accuracy" here is simply 100 * (1 - MAPE); it goes negative once MAPE exceeds 100%.
print('Accuracy:', round(100*(1 - mape), 2))

Mean Absolute Error (MAE): 102.73150427839656
Mean Squared Error (MSE): 34668.878373157015
Root Mean Squared Error (RMSE): 186.19580654020385
Mean Absolute Percentage Error (MAPE): 1483.39
Accuracy: -1383.39


The MAPE of this model is extraordinarily high, which is likely because some of the predictions are overshooting or undershooting by such a high degree that the average is thrown off completely. This is particularly true for low-revenue films, where even a modest absolute error is a very large percentage of the true value. The RMSE is still quite high, but perhaps slightly more acceptable than the MAPE would indicate. This particular research question is not well answered by this data. There are many factors that contribute to the success of a film, and it is not surprising that it is difficult to tell in advance how a film will perform, especially absent contextual information about its cultural significance (e.g. superhero movies perform better than other action movies, franchise movies perform differently than ones with unknown characters and stories, etc.). This model is not strong at predicting revenue, but I will still use it in searching for the best month in which to release a film. The exact revenue number is unreliable, but perhaps a small trend will still be useful. Next, I am going to perform a randomized search and then a grid search to tune the hyperparameters.

In [49]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the randomized hyperparameter search.
# Ten evenly spaced forest sizes from 200 to 1000 (truncated to integers).
n_estimators = np.linspace(start=200, stop=1000, num=10).astype(int).tolist()
# NOTE(review): the integer 1 means "consider a single feature per split";
# 1.0 would mean "all features" - confirm which was intended.
max_features = [1, 'sqrt']
# Depths 10, 20, ..., 110, plus None for unlimited depth.
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
min_samples_split = [2, 5, 8, 10, 12]
min_samples_leaf = [1, 2, 3, 4, 5]
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [50]:
# Randomized search: 50 candidates sampled from random_grid, each scored with
# 3-fold cross-validation on the training split. Both the estimator and the
# search are seeded so the sampled candidates and the selected best_params_
# are the same on every run.
rf = RandomForestRegressor(random_state=42)

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=288; total time=   1.6s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=288; total time=   1.5s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=288; total time=   1.4s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=822; total time=   5.7s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=822; total time=   5.7s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=822; total time=   5.8s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimato

In [51]:
# Best hyperparameter combination found by the randomized search.
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [52]:
from sklearn.model_selection import GridSearchCV

# Narrow, exhaustive grid around the randomized-search winner
# (n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_depth=70,
# max_features='sqrt', bootstrap=False).
n_estimators = [175, 200, 225, 250]
max_features = ['sqrt']
max_depth = [60, 70, 80]
min_samples_split = [4, 5, 6]
min_samples_leaf = [2, 3, 4]
bootstrap = [False]

second_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)


In [53]:
# Exhaustive search over second_grid with 3-fold cross-validation on the
# training split. The estimator is seeded so that differences between
# candidates reflect the hyperparameters rather than run-to-run forest
# randomness, and so the selected best_params_ is reproducible.
rf = RandomForestRegressor(random_state=42)

rf_grid = GridSearchCV(estimator=rf, param_grid=second_grid, cv=3, verbose=2)

rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=175; total time=   1.4s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=175; total time=   1.4s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=175; total time=   1.4s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   1.5s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   1.5s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time=   1.5s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=22

In [54]:
# Best hyperparameter combination found by the grid search.
rf_grid.best_params_

{'bootstrap': False,
 'max_depth': 70,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 6,
 'n_estimators': 175}

In [55]:
# Final model: built directly from the grid-search winner instead of a
# hand-copied snapshot of its values, so it cannot drift out of sync with
# rf_grid when the search is re-run. random_state is fixed so the fitted
# forest (and the pickled artifact) is reproducible.
model_best = RandomForestRegressor(**rf_grid.best_params_, random_state=42)

model_best.fit(X_train, y_train)

# Predictions on the held-out rows, consumed by the metrics cell that follows.
y_pred = model_best.predict(X_test)

In [56]:
# Same error report as for the baseline, now for the tuned model's predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
error_report = [
    ('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred)),
    ('Mean Squared Error (MSE):', mse),
    ('Root Mean Squared Error (RMSE):', np.sqrt(mse)),
]
for label, value in error_report:
    print(label, value)

# MAPE as a fraction: mean of |error| / |actual| over the test rows.
mape = np.mean(np.abs((y_test - y_pred) / np.abs(y_test)))
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
# "Accuracy" here is simply 100 * (1 - MAPE); it goes negative once MAPE exceeds 100%.
print('Accuracy:', round(100*(1 - mape), 2))

Mean Absolute Error (MAE): 93.62528709787244
Mean Squared Error (MSE): 29500.2380863798
Root Mean Squared Error (RMSE): 171.75633346802616
Mean Absolute Percentage Error (MAPE): 1223.68
Accuracy: -1123.68


This is the best I'm going to do with this model setup, so I'm going to pickle the model so that it can be used in the website later on.

In [57]:
# Serialize the tuned model next to this notebook so it can be loaded elsewhere.
# Only unpickle this file from a trusted location: pickle.load can run arbitrary code.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model_best, model_file)