In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
pd.options.display.float_format = '{:.0f}'.format

In [2]:
data = pd.read_csv("../data_cleaning/features.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,index,id,release_date,runtime,release_month,adj_revenue,Documentary,week_num,adj_revenue_millions,...,December,February,January,July,June,March,May,November,October,September
0,0,0,27205,2010-07-15,148,July,1165491209,0,28,1165,...,False,False,False,True,False,False,False,False,False,False
1,1,1,157336,2014-11-05,169,November,914603498,0,45,915,...,False,False,False,False,False,False,False,True,False,False
2,2,2,155,2008-07-16,152,July,1405648374,0,29,1406,...,False,False,False,True,False,False,False,False,False,False
3,3,3,19995,2009-12-15,162,December,4167116097,0,51,4167,...,True,False,False,False,False,False,False,False,False,False
4,4,4,24428,2012-04-25,143,April,2031747869,0,17,2032,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11568,11568,14364,1079863,2007-10-12,105,October,4419377,0,41,4,...,False,False,False,False,False,False,False,False,True,False
11569,11569,14369,1105011,1977-03-26,87,March,1034585,0,12,1,...,False,False,False,False,False,True,False,False,False,False
11570,11570,14375,953480,1988-09-10,86,September,841500,0,36,1,...,False,False,False,False,False,False,False,False,False,True
11571,11571,14376,721807,2006-11-24,135,November,1481664,0,47,1,...,False,False,False,False,False,False,False,True,False,False


In [4]:
data['adj_revenue_millions'].describe()

count   11573
mean       98
std       222
min         1
25%         4
50%        21
75%        90
max      4315
Name: adj_revenue_millions, dtype: float64

Here i was going to attempt to use a time series model for prediction, but I couldn't figure out how to use it properly. The autocorrelation seems to be insignificant, so I'm not sure that predictions would have any validity anyway. When I attempted to run these models it ran for a very long time and seemed to throw errors that I was unable to troubleshoot.

In [5]:
# test = data[['release_date','adj_revenue_millions']]
# test['release_date'] = pd.to_datetime(test['release_date'], format="%Y-%M-%d")
# test  = test.set_index('release_date')
# test.index = pd.DatetimeIndex(test.index).to_period('M')
# #test.index = test.index.to_timestamp()
# test = test.sort_index()
# autocorrelation_plot(test)
# plt.show()

In [6]:
# from statsmodels.tsa.arima.model import ARIMA
#
# model = ARIMA(test, order=(10,1,0))
# model_fit = model.fit()

In [7]:
# print(model_fit.summary())

I am going to use a random forest regressor to try to predict revenue based on genre, runtime, release month, and the number of common words in the synopsis. I will then save the model, and in the website I will run the model 12 times with a different release month each time, use the other parameters based on the user input and return the model from the release month that predicted the highest revenue.

In [8]:
X = data[['runtime','Documentary','action_adv_war_west','horror_thriller','family_animate','scifi_fantasy','hist_drama','crime_mystery','comedy_romance_music','common_word_count','January','February','March','April','May','June','July','August','September','October','November','December']]
y = data['adj_revenue_millions']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

the distribution of the revenue data is wildly imbalanced and has some extreme outliers. random forest regression is tolerant to outliers though so we don't have to worry about it.

In [10]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, max_depth=2)

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

In [11]:
from sklearn import metrics

print('mean abs error:', metrics.mean_absolute_error(y_test, y_pred))
print('mean sq error:', metrics.mean_squared_error(y_test, y_pred))
print('rmse:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
mape = np.mean(np.abs((y_test - y_pred) / np.abs(y_test)))
print('mean abs percent error:', round(mape * 100, 2))
print('accuracy:', round(100*(1 - mape), 2))

Mean Absolute Error (MAE): 105.03220595818999
Mean Squared Error (MSE): 36277.90332564094
Root Mean Squared Error (RMSE): 190.46759127379372
Mean Absolute Percentage Error (MAPE): 1604.18
Accuracy: -1504.18


The MAPE of this model is extraordinarily high, which is likely because some of the predictions are overshooting or undershooting by such a high degree that the average is thrown off completely. The RMSE is still quite high, but perhaps slightly more acceptable than the MAPE would indicate. This particular research question is not well answered by this data. There are many factors that contribute to the success of a film, and it is not surprising that it is difficult to tell in advance how a film will perform, especially absent the contextual information about the cultural significance (e.g. superhero movies perform better than other action movies, franchise movies perform differently than ones with unknown characters and stories, etc etc). This model is not strong at predicting revenue, but I will still use it in searching for the best month in which to release a film. The exact revenue number is unreliable, but perhaps a small trend will still be useful. Going to perform a randomized search and then a grid search to tune hyperparameters.

In [12]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=10)]
max_features = [ 1,'sqrt']
max_depth = [int(x) for x in np.linspace(10,110, num=11)]
max_depth.append(None)
min_samples_split = [2,5,8,10,12]
min_samples_leaf = [1, 2, 3, 4, 5]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [13]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions=random_grid, n_iter=50, cv=3, verbose=2)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   1.3s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.9s
[CV] END bootstrap=True, max_depth=20, max_features=1, min_samples_leaf=3, min_samples_split=5, n_estimators=644; total time=   2.2s
[CV] END bootstrap=True, max_depth=20, max_features=1, min_samples_leaf=3, min_samples_split=5, n_estimators=644; total time=   2.4s
[CV] END bootstrap=True, max_depth=20, max_features=1, min_samples_leaf=3, min_samples_split=5, n_estimators=644; total time=   2.3s
[CV] END bootstrap=True, max_depth=100, max_features=1, min_samples_leaf=3, min_samples_split=5, n_estimators=466; total time

In [14]:
rf_random.best_params_

{'n_estimators': 644,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': True}

In [17]:
from sklearn.model_selection import GridSearchCV
n_estimators = [600, 625, 650, 675, 700]
max_features = ['sqrt']
max_depth = [30, 40, 50]
min_samples_split = [9, 10, 11]
min_samples_leaf = [1,2,3]
bootstrap = [True]

second_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [18]:
rf = RandomForestRegressor()

rf_grid = GridSearchCV(estimator = rf, param_grid=second_grid, cv=3, verbose=2)

rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 135 candidates, totalling 405 fits
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=600; total time=   3.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=600; total time=   3.2s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=600; total time=   3.2s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=625; total time=   3.4s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=625; total time=   3.4s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=625; total time=   3.3s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=9, n_estimators=650; tota

In [19]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 600}

In [20]:
model_best = RandomForestRegressor(n_estimators=600,
                              min_samples_split=10,
                              min_samples_leaf=2,
                              max_features='sqrt',
                              max_depth=30,
                              bootstrap=True)

model_best.fit(X_train,y_train)

y_pred = model_best.predict(X_test)

In [23]:
print('mean abs error:', metrics.mean_absolute_error(y_test, y_pred))
print('mean sq error:', metrics.mean_squared_error(y_test, y_pred))
print('rmse:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
mape = np.mean(np.abs((y_test - y_pred) / np.abs(y_test)))
print('mean abs percent error:', round(mape * 100, 2))
print('accuracy:', round(100*(1 - mape), 2))
print('r squared:', metrics.r2_score(y_test, y_pred))

mean abs error: 96.00272271535073
mean sq error: 29848.231916169254
rmse: 172.76640852946284
mean abs percent error: 1333.99
accuracy: -1233.99
r squared: 0.27490526409367466


this is the best I'm going to do with this model setup, so I'm going to pickle it so that it can be used in my stuff later on.

In [22]:
with open('model.pkl', 'wb') as f_pickle:
    pickle.dump(model_best, f_pickle)