In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
X = pd.read_csv('./intermediate_results/X_PLUS.csv') # using the data created last notebook
y = X['worldwide_gross']
X = X.drop('worldwide_gross',axis=1).drop('Unnamed: 0', axis=1)

# Applying Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_validate

In [20]:
forest = RFR(200)
results = cross_validate(forest,X,y,cv=5,scoring='r2',return_train_score=True)

In [21]:
test_scores = results['test_score']
train_scores = results['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores)) # Random has good results but they can be even better

0.966124674331985
0.7511354321253354


# Applying Gradient Boosting Tree

In [22]:
from sklearn.ensemble import GradientBoostingRegressor as GBR

ensemble = GBR() # one the most powerfull method 

In [23]:
results = cross_validate(ensemble,X,y,cv=5,scoring='r2',return_train_score=True)

In [24]:
test_scores = results['test_score']
train_scores = results['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores)) # it has good result, but les optimize its parameters

0.9226960450130575
0.7447140813109439


# Optimazing Hiperparameters

In [25]:
from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators': range(20,501,20)} # parameter to change

In [26]:
estimator =GBR(learning_rate = 0.1,
              min_samples_split=500,
              min_samples_leaf=50,
              max_depth=8,
              max_features='sqrt',
              subsample=0.8,
              random_state=10) # Best configuration of parameter for this problem

In [27]:
gsearch1 = GridSearchCV(estimator,
                       param_grid = param_test1,
                       scoring='r2',
                       cv=5) # ensembling the searcher for the best parameter

In [28]:
gsearch1.fit(X_train,y_train) # applying the searcher

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=8,
                                                 max_features='sqrt',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=50,
                                                 min_samples_split=500,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
           

In [29]:
gsearch1.cv_results_ ,gsearch1.best_params_,gsearch1.best_score_ # the best parameter is 220 for number of estimators

({'mean_fit_time': array([0.02688909, 0.04561648, 0.0676096 , 0.08964581, 0.11042566,
         0.13307352, 0.15445266, 0.19933877, 0.25923696, 0.25033464,
         0.26515322, 0.33765407, 0.30567536, 0.39145484, 0.36665106,
         0.36425552, 0.43579473, 0.41354012, 0.4399024 , 0.47031817,
         0.59667292, 0.53775158, 0.51537786, 0.57125645, 0.58378201]),
  'std_fit_time': array([0.0037622 , 0.00010856, 0.00081749, 0.00285226, 0.00101993,
         0.00138933, 0.00240895, 0.02639117, 0.03124356, 0.02129826,
         0.02834715, 0.00890734, 0.02706268, 0.06060587, 0.03134385,
         0.0118647 , 0.02273173, 0.0181373 , 0.01113533, 0.01821028,
         0.05519223, 0.03614403, 0.01281186, 0.03008992, 0.01463472]),
  'mean_score_time': array([0.00202451, 0.00219421, 0.00245242, 0.00259228, 0.00276699,
         0.00301538, 0.00316353, 0.00340958, 0.0039268 , 0.00427794,
         0.00428729, 0.00468807, 0.00452285, 0.00577946, 0.00473971,
         0.00495496, 0.00566115, 0.00525637, 0.

In [30]:
final_result = cross_validate(gsearch1.best_estimator_,X_train,y_train,return_train_score=True)
#cross validating

In [31]:
test_scores = final_result['test_score']
train_scores = final_result['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores)) # they seems to be not good enough but let's try with the test values

0.7889689900229396
0.6990678884540423


In [32]:
estimator =GBR(learning_rate = 0.1,
              min_samples_split=500,
              min_samples_leaf=50,
              max_depth=8,
              max_features='sqrt',
              subsample=0.8,
              random_state=10,
              n_estimators = 220)

In [33]:
estimator.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=8,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=50, min_samples_split=500,
                          min_weight_fraction_leaf=0.0, n_estimators=220,
                          n_iter_no_change=None, presort='auto',
                          random_state=10, subsample=0.8, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
estimator.score(X_test,y_test) # Gradient Boosting Tree has the best correlation for this problem

0.8064928781402682