In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, log_loss, make_scorer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import date
import datetime

%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_csv('data/train.csv')

In [17]:
df['runtime'].fillna((df['runtime'].mean()), inplace=True)
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_date'] = datetime.date.today().year

In [4]:
df.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2019,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,2019,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,2019,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,2019,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2019,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


In [5]:
# for i in range(len(temp)):
#     print(temp[i]['name'])


In [6]:
y = df.revenue
X = df[['budget', 'popularity', 'runtime','release_date']]
#'production_countries'

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
budget          3000 non-null int64
popularity      3000 non-null float64
runtime         3000 non-null float64
release_date    3000 non-null int64
dtypes: float64(2), int64(2)
memory usage: 93.8 KB


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
rf = RandomForestRegressor(n_estimators=100,
                            n_jobs=-1,
                            random_state=1)

In [10]:
random_forest_grid = {'max_depth': [10],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [2,4],
                      'min_samples_leaf': [1, 2, 4],
                      'bootstrap': [True, False],
                      'n_estimators': [10,20,40,80,160],
                      'random_state': [1]}

rf_gridsearch = GridSearchCV(RandomForestRegressor(),
                             random_forest_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='neg_mean_squared_error')

rf_gridsearch.fit(X_train, y_train)

print( "best parameters:", rf_gridsearch.best_params_ )

best_rf_model = rf_gridsearch.best_estimator_

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   17.2s finished


best parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 160, 'random_state': 1}


In [11]:
best_rf_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [12]:
best_rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [13]:
y_hat = best_rf_model.predict(X_test)

In [14]:
# model = best_rf_model.fit(X_train, y_train)
# acc, pre, rec, f1s = [], [], [], []
# for i in range(100):
#     y_hat = model.predict(X_test) > (i/100)
#     acc.append(accuracy_score(y_test, y_hat));
#     #pre.append(precision_score(y_test, y_hat));
#     #rec.append(recall_score(y_test, y_hat));
#     #f1s.append(f1_score(y_test, y_hat));

# plt.figure(figsize = (15,8))
# plt.plot(range(100), acc, label='accuracy', alpha=0.5)
# #plt.plot(range(100), pre, label='precision', alpha=0.5)
# #plt.plot(range(100), rec, label='recall', alpha=0.5)
# #plt.plot(range(100), f1s, label='f1 score', alpha=0.5)
# plt.axhline(0.50, color='k', alpha=0.5, linestyle='--', dashes=(7,10), label='50%')
# plt.axhline(0.75, color='k', alpha=0.5, linestyle='--', dashes=(7,10), label='75%')
# plt.axvline(42, color='#ff33cc', alpha=0.8, linestyle='-.', label='chosen')
# plt.title('All Metrics')
# plt.xlabel('Threshold, P')
# plt.ylabel('Score')
# plt.legend()
# plt.show()

In [15]:
accuracy_score(y_test, y_hat)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [16]:
mean_squared_error(y_test,y_hat)

5342674425231271.0