In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle

## Save RandomForest Model with pickle
def save_data(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

## Load pickle file data 
def load_data(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [5]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)

# create data frame for model relevant columns
df = pd.read_csv('../Data/all_movie_data_dropna.csv')
# display(df)
df_model = df[['Release_month', 'Budget', 'Box_office', 'Box_office_opening', 'Rating']]

#get dummy variables for movie rating
df_dum = pd.get_dummies(df_model)
display(df_dum)

x = df_dum.drop('Box_office', axis=1)
y = df_dum.Box_office.values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Unnamed: 0,Release_month,Budget,Box_office,Box_office_opening,Rating_G,Rating_NC-17,Rating_PG,Rating_PG-13,Rating_R,Rating_Unknown
0,7.0,20000000.0,21400000.0,6374391.0,0,0,0,0,0,1
1,5.0,6000000.0,2700000.0,244919.0,0,0,0,0,0,1
2,8.0,30000000.0,57700000.0,8064480.0,0,0,0,0,0,1
3,6.0,50000000.0,153500000.0,19475559.0,0,0,0,0,0,1
4,12.0,12000000.0,78900000.0,4369868.0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4242,12.0,80000000.0,75500000.0,6619870.0,0,0,1,0,0,0
4243,12.0,100000000.0,171600000.0,13354798.0,0,0,1,0,0,0
4244,12.0,40000000.0,218900000.0,16755310.0,0,0,1,0,0,0
4245,12.0,95000000.0,384900000.0,576216.0,0,0,0,0,1,0


In [4]:


rf = RandomForestRegressor()

## Cross Validation
np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=3))

# tuning
parameters = {'n_estimators': range(10,300,10), 'criterion':('mse','mae'), 'max_features':('auto', 'sqrt', 'log2')}
gs = GridSearchCV(rf, parameters, scoring='neg_mean_absolute_error', cv=3)
gs.fit(X_train, y_train)


GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'criterion': ('mse', 'mae'),
                         'max_features': ('auto', 'sqrt', 'log2'),
                         'n_estimators': range(10, 300, 10)},
             scoring='neg_mean_absolute_error')

In [6]:
model = load_data('RandomForestRegressionModel.pickle')
# gs.best_score_
predict =  model.predict(X_test)
mean_absolute_error(y_test, predict)

45438170.6718552

In [6]:
   
        
save_data('RandomForestRegressionModel.pickle', gs.best_estimator_)

