In [50]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, log_loss, make_scorer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import date

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
df = pd.read_csv('data/train.csv')

In [52]:
df['runtime'].fillna((df['runtime'].mean()), inplace=True)
df['release_date'] = pd.to_datetime(df['release_date'])

In [59]:
y = df.revenue
X = df[['budget', 'popularity', 'runtime','release_date']]
#'production_countries'

In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
budget          3000 non-null int64
popularity      3000 non-null float64
runtime         3000 non-null float64
release_date    3000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 93.8 KB


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [62]:
rf = RandomForestRegressor(n_estimators=100,
                            n_jobs=-1,
                            random_state=1)

In [63]:
random_forest_grid = {'max_depth': [10],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [2,4],
                      'min_samples_leaf': [1, 2, 4],
                      'bootstrap': [True, False],
                      'n_estimators': [10,20,40,80,160],
                      'random_state': [1]}

rf_gridsearch = GridSearchCV(RandomForestRegressor(),
                             random_forest_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='neg_mean_squared_error')

rf_gridsearch.fit(X_train, y_train)

print( "best parameters:", rf_gridsearch.best_params_ )

best_rf_model = rf_gridsearch.best_estimator_

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
best_rf_model

In [None]:
best_rf_model.fit(X_train, y_train)

In [None]:
y_hat = best_rf_model.predict(X_test)

In [None]:
model = best_rf_model.fit(X_train, y_train)
acc, pre, rec, f1s = [], [], [], []
for i in range(100):
    y_hat = model.predict(X_test) > (i/100)
    acc.append(accuracy_score(y_test, y_hat));
    #pre.append(precision_score(y_test, y_hat));
    #rec.append(recall_score(y_test, y_hat));
    #f1s.append(f1_score(y_test, y_hat));

plt.figure(figsize = (15,8))
plt.plot(range(100), acc, label='accuracy', alpha=0.5)
#plt.plot(range(100), pre, label='precision', alpha=0.5)
#plt.plot(range(100), rec, label='recall', alpha=0.5)
#plt.plot(range(100), f1s, label='f1 score', alpha=0.5)
plt.axhline(0.50, color='k', alpha=0.5, linestyle='--', dashes=(7,10), label='50%')
plt.axhline(0.75, color='k', alpha=0.5, linestyle='--', dashes=(7,10), label='75%')
plt.axvline(42, color='#ff33cc', alpha=0.8, linestyle='-.', label='chosen')
plt.title('All Metrics')
plt.xlabel('Threshold, P')
plt.ylabel('Score')
plt.legend()
plt.show()

In [None]:
accuracy_score(y_test, y_hat)

In [64]:
mean_squared_error(y_test,y_hat)

2.068695053415216e+16