In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


### Read the csv

In [3]:
df = pd.read_csv("../data/movies_cleaned.csv", encoding='utf-8')
df.head()

Unnamed: 0.1,Unnamed: 0,profit,budget,company,country,director,genre,gross,name,rating,released,runtime,star,writer,year,ROI_pct,release_dt
0,0,44287414,8000000,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414,Stand by Me,R,8/22/1986,89,Wil Wheaton,Stephen King,1986,553.592675,1986-08-22
1,1,64136369,6000000,Paramount Pictures,USA,John Hughes,Comedy,70136369,Ferris Bueller's Day Off,PG-13,6/11/1986,103,Matthew Broderick,John Hughes,1986,1068.939483,1986-06-11
2,2,164800601,15000000,Paramount Pictures,USA,Tony Scott,Action,179800601,Top Gun,PG,5/16/1986,110,Tom Cruise,Jim Cash,1986,1098.670673,1986-05-16
3,3,66660248,18500000,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248,Aliens,R,7/18/1986,137,Sigourney Weaver,James Cameron,1986,360.325665,1986-07-18
4,4,9564613,9000000,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613,Flight of the Navigator,PG,8/1/1986,90,Joey Cramer,Mark H. Baker,1986,106.273478,1986-08-01


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4638 non-null   int64  
 1   profit      4638 non-null   int64  
 2   budget      4638 non-null   int64  
 3   company     4638 non-null   object 
 4   country     4638 non-null   object 
 5   director    4638 non-null   object 
 6   genre       4638 non-null   object 
 7   gross       4638 non-null   int64  
 8   name        4638 non-null   object 
 9   rating      4638 non-null   object 
 10  released    4638 non-null   object 
 11  runtime     4638 non-null   int64  
 12  star        4638 non-null   object 
 13  writer      4638 non-null   object 
 14  year        4638 non-null   int64  
 15  ROI_pct     4638 non-null   float64
 16  release_dt  4638 non-null   object 
dtypes: float64(1), int64(6), object(10)
memory usage: 616.1+ KB


In [5]:
# drop 'Unnamed' column
df=df.drop(columns='Unnamed: 0', axis=1)
df.head()

Unnamed: 0,profit,budget,company,country,director,genre,gross,name,rating,released,runtime,star,writer,year,ROI_pct,release_dt
0,44287414,8000000,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414,Stand by Me,R,8/22/1986,89,Wil Wheaton,Stephen King,1986,553.592675,1986-08-22
1,64136369,6000000,Paramount Pictures,USA,John Hughes,Comedy,70136369,Ferris Bueller's Day Off,PG-13,6/11/1986,103,Matthew Broderick,John Hughes,1986,1068.939483,1986-06-11
2,164800601,15000000,Paramount Pictures,USA,Tony Scott,Action,179800601,Top Gun,PG,5/16/1986,110,Tom Cruise,Jim Cash,1986,1098.670673,1986-05-16
3,66660248,18500000,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248,Aliens,R,7/18/1986,137,Sigourney Weaver,James Cameron,1986,360.325665,1986-07-18
4,9564613,9000000,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613,Flight of the Navigator,PG,8/1/1986,90,Joey Cramer,Mark H. Baker,1986,106.273478,1986-08-01


In [6]:
# --> drop temp
df.drop(['profit','gross','name','star','writer','release_dt'], axis=1, inplace=True) 

df.head()

Unnamed: 0,budget,company,country,director,genre,rating,released,runtime,year,ROI_pct
0,8000000,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,R,8/22/1986,89,1986,553.592675
1,6000000,Paramount Pictures,USA,John Hughes,Comedy,PG-13,6/11/1986,103,1986,1068.939483
2,15000000,Paramount Pictures,USA,Tony Scott,Action,PG,5/16/1986,110,1986,1098.670673
3,18500000,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,R,7/18/1986,137,1986,360.325665
4,9000000,Walt Disney Pictures,USA,Randal Kleiser,Adventure,PG,8/1/1986,90,1986,106.273478


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   budget    4638 non-null   int64  
 1   company   4638 non-null   object 
 2   country   4638 non-null   object 
 3   director  4638 non-null   object 
 4   genre     4638 non-null   object 
 5   rating    4638 non-null   object 
 6   released  4638 non-null   object 
 7   runtime   4638 non-null   int64  
 8   year      4638 non-null   int64  
 9   ROI_pct   4638 non-null   float64
dtypes: float64(1), int64(3), object(6)
memory usage: 362.5+ KB


In [8]:
# check 'rating' and replace some values
df['rating'].unique()

array(['R', 'PG-13', 'PG', 'UNRATED', 'G', 'NC-17', 'NOT RATED',
       'Not specified'], dtype=object)

In [9]:
df['rating'].value_counts()

R                2247
PG-13            1561
PG                659
G                 100
NOT RATED          38
UNRATED            20
NC-17               9
Not specified       4
Name: rating, dtype: int64

In [10]:
df.replace(to_replace= ['NOT RATED', 'Not specified'], value='UNRATED', inplace=True)
df.head()

Unnamed: 0,budget,company,country,director,genre,rating,released,runtime,year,ROI_pct
0,8000000,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,R,8/22/1986,89,1986,553.592675
1,6000000,Paramount Pictures,USA,John Hughes,Comedy,PG-13,6/11/1986,103,1986,1068.939483
2,15000000,Paramount Pictures,USA,Tony Scott,Action,PG,5/16/1986,110,1986,1098.670673
3,18500000,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,R,7/18/1986,137,1986,360.325665
4,9000000,Walt Disney Pictures,USA,Randal Kleiser,Adventure,PG,8/1/1986,90,1986,106.273478


In [136]:
#df['rating'].value_counts()

In [11]:
# convert 'released' column to datetime
df['released'] = pd.to_datetime(df['released'])

# drop year and date but leave month only
df['month'] = df['released'].dt.month.astype('object')
df.drop('released', axis=1, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   budget    4638 non-null   int64  
 1   company   4638 non-null   object 
 2   country   4638 non-null   object 
 3   director  4638 non-null   object 
 4   genre     4638 non-null   object 
 5   rating    4638 non-null   object 
 6   runtime   4638 non-null   int64  
 7   year      4638 non-null   int64  
 8   ROI_pct   4638 non-null   float64
 9   month     4638 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 362.5+ KB


### Preprocessing

In [13]:
# how many unique values
{column: len(df[column].unique()) for column in df.columns}


{'budget': 350,
 'company': 1340,
 'country': 45,
 'director': 1892,
 'genre': 16,
 'rating': 6,
 'runtime': 123,
 'year': 31,
 'ROI_pct': 4636,
 'month': 12}

In [14]:
# convert 'ROI' to int
df['ROI_pct'] = df['ROI_pct'].astype(int)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4638 entries, 0 to 4637
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   budget    4638 non-null   int64 
 1   company   4638 non-null   object
 2   country   4638 non-null   object
 3   director  4638 non-null   object
 4   genre     4638 non-null   object
 5   rating    4638 non-null   object
 6   runtime   4638 non-null   int64 
 7   year      4638 non-null   int64 
 8   ROI_pct   4638 non-null   int32 
 9   month     4638 non-null   object
dtypes: int32(1), int64(3), object(6)
memory usage: 344.4+ KB


In [16]:
# create function to preprocess data
def preprocess_data(df):
    data = df.copy()
    
    # Convert categorical values to numerical       
    for column in ['company','country','director','genre','rating','month']:    
        dumm_col = pd.get_dummies(data[column],prefix=column)
        data = pd.concat([data, dumm_col], axis=1)
        data = data.drop(column, axis=1)
        
    # Split data into X (data) and y (target)
    y = data['ROI_pct']#.values.reshape(-1,1)
    X = data.drop('ROI_pct', axis=1)
    
    
    # Split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
        
        
#     # Scale X
#     scaler = StandardScaler()
#     scaler.fit(X_train)
#     X_train = pd.DataFrame(scaler.transform(X_train),index=X_train.index, columns=X_train.columns) 
#     X_test = pd.DataFrame(scaler.transform(X_test),index=X_test.index, columns=X_test.columns)
    
    
    
    # return data
    return X_train, X_test, y_train, y_test

In [17]:
X_train, X_test, y_train, y_test = preprocess_data(df)

In [18]:
X_train

Unnamed: 0,budget,runtime,year,"company_""DIA"" Productions GmbH & Co. KG",company_10th Hole Productions,company_1492 Pictures,company_1821 Pictures,company_19 Entertainment,company_1984 Private Defense Contractors,company_2 Loop Films,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
912,25000000,108,1994,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3726,27000000,100,2011,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4188,20000000,148,2014,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2876,9000000,85,2006,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3918,1000000,95,2012,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,8500000,95,2015,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
466,22000000,85,1990,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3092,35000000,107,2007,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3772,14000000,150,2011,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
y_train

912      28
3726     42
4188    -59
2876    156
3918    500
       ... 
4426    -97
466     -32
3092    -68
3772    -99
860     183
Name: ROI_pct, Length: 3478, dtype: int32

### Create Model

In [20]:
print(X_train.shape, X_test.shape)

(3478, 3314) (1160, 3314)


In [21]:
print(y_train.shape, y_test.shape)

(3478,) (1160,)


#### Create a Decision Tree Classifier

In [22]:
#from sklearn import tree

# Create and score a decision tree classifier
dtc = tree.DecisionTreeClassifier() # clf
dtc.fit(X_train, y_train)
dtc_sc = dtc.score(X_test, y_test)
dtc_sc

0.009482758620689655

#### Create a Desision Tree Regressor

In [23]:
#from sklearn import tree

dtr = tree.DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_sc = dtr.score(X_test, y_test)
dtr_sc

-0.3407491504233624

#### Create a Random Forest Classifier

In [24]:
#from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=200, criterion='entropy')
rfc.fit(X_train, y_train) 
rfc_sc = rfc.score(X_test, y_test)
rfc_sc

0.011206896551724138

#### Create Random Forest Regression

In [25]:
#from sklearn.ensemble import RandomForestRegressor

rfreg = RandomForestRegressor(n_estimators=200, criterion='mse')
rfreg.fit(X_train, y_train) 
rfreg_sc = rfreg.score(X_test, y_test)
rfreg_sc

-0.6418948175047916

In [27]:
print('Test Acc for dtc: %.3f' % dtc_sc)
print('Test Acc for dtr: %.3f' % dtr_sc)
print('Test Acc for rfc: %.3f' % rfc_sc)
print('Test Acc for rfreg: %.3f' % rfreg_sc)

Test Acc for dtc: 0.009
Test Acc for dtr: -0.341
Test Acc for rfc: 0.011
Test Acc for rfreg: -0.642


##### Feature importances

In [28]:
# Set features
feature_names = X_train.columns
feature_names

Index(['budget', 'runtime', 'year', 'company_"DIA" Productions GmbH & Co. KG',
       'company_10th Hole Productions', 'company_1492 Pictures',
       'company_1821 Pictures', 'company_19 Entertainment',
       'company_1984 Private Defense Contractors', 'company_2 Loop Films',
       ...
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'month_12'],
      dtype='object', length=3314)

In [29]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rfreg.feature_importances_
importances

array([5.69776468e-02, 5.40877356e-03, 4.19306082e-03, ...,
       2.40207728e-05, 8.98885074e-05, 2.37838773e-05])

In [30]:
# We can sort the features by their importance
sorted(zip(rfreg.feature_importances_, feature_names), reverse=True)

[(0.2929341260649603, 'director_Oren Peli'),
 (0.2761907734325065, 'company_Solana Films'),
 (0.11802768302108144, 'company_Haxan Films'),
 (0.1137044032766325, 'director_Daniel Myrick'),
 (0.05697764680323649, 'budget'),
 (0.03093206288759251, 'company_Brothers McMullen Productions'),
 (0.023597275311749568, 'director_Edward Burns'),
 (0.023238941453412196, 'company_Plunge Pictures LLC'),
 (0.006098348953934884, 'director_Chris Kentis'),
 (0.005890026134971359, 'director_Alex Kendrick'),
 (0.005778477645731063, 'director_Robert Rodriguez'),
 (0.005408773561370206, 'runtime'),
 (0.004193060817982437, 'year'),
 (0.004069904036390563, 'director_Jared Hess'),
 (0.003842757861710544, 'company_Carmel Entertainment'),
 (0.003273942561985092, 'company_Columbia Pictures Corporation'),
 (0.0025344792730529193, 'company_Prototype'),
 (0.001973398323288097, 'month_2'),
 (0.001103245080274135, 'genre_Action'),
 (0.000951446642372968, 'company_Can I Watch'),
 (0.0009240996992335451, 'company_Too As

### Training and Tuning

In [None]:
# model = rfreg
# model

In [168]:
# Create the GridSearchCV model
#from sklearn.model_selection import GridSearchCV

param_grid = {
    #'bootstrap': [True],
    #'max_depth': 
    #'max_features': #[2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Base model
model = RandomForestRegressor()

#grid_m = GridSearchCV(model, param_grid, verbose=3)

grid_m = GridSearchCV(model, param_grid, cv = 10, n_jobs = -1, verbose = 2)
                            


In [169]:
# Train the model with GridSearch
grid_m.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [170]:
# List the best parameters for this dataset
print(grid_m.best_params_)

{'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 1000}


In [171]:
# # List the best parameters for this dataset
print(grid_m.best_params_)

{'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 1000}


In [172]:
# List the best score
print(grid_m.best_score_)

0.42329424402545124


In [174]:
print('Test Acc: %.3f' % grid_m.score(X_test, y_test))

Test Acc: 0.459


#### Evaluating the model

In [184]:
grid_m
#rfreg

GridSearchCV(cv=10, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [173]:
# Make predictions with the hypertuned model
predictions = grid_m.predict(X_test)
predictions

array([ 9684763.34973476, 14106950.37440659, 16909616.94443956, ...,
       19430649.99185421, 38834282.1206519 , 11474949.08201418])

In [31]:
predictions2 = rfreg.predict(X_test)
predictions2

array([ 12.135, -53.265,  13.73 , ...,  -4.3  , -13.27 ,  68.49 ])

In [32]:
result = pd.DataFrame({'Test': y_test, 'Predictions': predictions2})

result.head()

Unnamed: 0,Test,Predictions
354,23,12.135
1864,-66,-53.265
416,71,13.73
4517,-57,-22.3
1897,-89,50.58


### Saving a Trained Model

In [33]:
# Save the model using pickle
import pickle

output_file = open('../model/model_trained_pkl.pkl', 'wb')
                        

# dump the model into that file
pickle.dump(grid_m, output_file)

### Loading a model

### Test