In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

### Decision Trees

In [4]:
from sklearn.tree import DecisionTreeRegressor as DTG
model = DTG(max_depth=2)

In [5]:
X = pd.read_csv('..\..\..\DataSets\Resultados intermedios peliculas DS\X_opening.csv')
y = X['worldwide_gross']
X = X.drop('worldwide_gross', axis=1)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [7]:
model.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [8]:
model.score(X_test,y_test)

0.6966161697635206

In [9]:
import graphviz

In [10]:
from sklearn.tree import export_graphviz

treedot = export_graphviz(model,
                         out_file=None,
                         feature_names=X.columns)

In [11]:
treedot

'digraph Tree {\nnode [shape=box] ;\n0 [label="opening_gross <= 41613376.0\\nmse = 4.491994363696598e+16\\nsamples = 1665\\nvalue = 141540319.054"] ;\n1 [label="opening_gross <= 22074047.0\\nmse = 1.3333822193127824e+16\\nsamples = 1506\\nvalue = 92999937.199"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="mse = 4923666241203546.0\\nsamples = 1257\\nvalue = 64781848.271"] ;\n1 -> 2 ;\n3 [label="mse = 3.147813101998764e+16\\nsamples = 249\\nvalue = 235450289.735"] ;\n1 -> 3 ;\n4 [label="opening_gross <= 70351576.0\\nmse = 1.10398118715936e+17\\nsamples = 159\\nvalue = 601300162.289"] ;\n0 -> 4 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;\n5 [label="mse = 4.067538845924493e+16\\nsamples = 92\\nvalue = 440868287.554"] ;\n4 -> 5 ;\n6 [label="mse = 1.2226485798747571e+17\\nsamples = 67\\nvalue = 821594676.851"] ;\n4 -> 6 ;\n}'

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

In [13]:
forest = RandomForestRegressor(200)
results = cross_validate(forest, X, y,return_train_score=True, cv=5, scoring='r2')

In [14]:
results

{'fit_time': array([1.75591898, 1.75592017, 1.76191211, 1.75292253, 1.76691198]),
 'score_time': array([0.02398562, 0.02798104, 0.02798557, 0.02898216, 0.02698302]),
 'test_score': array([0.46288398, 0.68984111, 0.57450978, 0.42459556, 0.44330136]),
 'train_score': array([0.96146691, 0.96859762, 0.96396188, 0.96559008, 0.96659333])}

In [15]:
test_scores = results['test_score']
train_scores = results['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores))

0.9652419647495718
0.5190263588083983


In [16]:
#Buenos resultados de entrenamiento pero tiene overfitting

## Gradiente Boosting trees

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

ensemble = GradientBoostingRegressor()
results = cross_validate(ensemble, X, y, return_train_score=True, cv=5, scoring='r2')

In [18]:
test_scores = results['test_score']
train_scores = results['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores))

0.9151392143549633
0.5248041593477997


## Hiperparámetros

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [20]:
from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators' : range(20, 501, 20)}

In [21]:
param_test1

{'n_estimators': range(20, 501, 20)}

In [26]:
list(param_test1['n_estimators'])

[20,
 40,
 60,
 80,
 100,
 120,
 140,
 160,
 180,
 200,
 220,
 240,
 260,
 280,
 300,
 320,
 340,
 360,
 380,
 400,
 420,
 440,
 460,
 480,
 500]

In [27]:
estimator = GradientBoostingRegressor(learning_rate=0.1,
                                      min_samples_split=500,
                                      min_samples_leaf=50,
                                      max_depth=8,
                                      max_features='sqrt',
                                      subsample=0.8,
                                      random_state=10)

In [28]:
gsearch1 = GridSearchCV(estimator,
                       param_grid = param_test1,
                       scoring='r2',
                       cv=5)

In [29]:
gsearch1.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=8,
                                                 max_features='sqrt',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=50,
                                                 min_samples_split=500,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
            

In [31]:
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.03138151, 0.05056834, 0.07795162, 0.11093173, 0.12652197,
         0.15050755, 0.17789063, 0.20347085, 0.25684152, 0.26703644,
         0.31680427, 0.32020631, 0.37301073, 0.38394032, 0.38786449,
         0.39188781, 0.39955802, 0.42271285, 0.44568396, 0.46427574,
         0.49309659, 0.5144824 , 0.54266562, 0.55885634, 0.58104162]),
  'std_fit_time': array([0.00988534, 0.00272536, 0.00178814, 0.00791931, 0.00149545,
         0.00224426, 0.00244756, 0.00688388, 0.01956397, 0.00318673,
         0.02119911, 0.01303835, 0.01337193, 0.02172286, 0.02299109,
         0.00646991, 0.00490314, 0.00588467, 0.00400661, 0.0021812 ,
         0.01509777, 0.00710558, 0.0060633 , 0.01084476, 0.00708409]),
  'mean_score_time': array([0.00139894, 0.00119958, 0.0019989 , 0.00199866, 0.00199871,
         0.00240006, 0.00239849, 0.00259876, 0.00279818, 0.00299711,
         0.00299854, 0.00306907, 0.00359817, 0.00399709, 0.00399718,
         0.0033978 , 0.02014313, 0.00359759, 0.

In [39]:
final_results = cross_validate(gsearch1.best_estimator_,X_train,y_train,return_train_score=True)

In [40]:
final_results

{'fit_time': array([0.28682232, 0.26683545, 0.27083325, 0.29381871, 0.29082274]),
 'score_time': array([0.00299811, 0.00299788, 0.00299811, 0.00300074, 0.00299764]),
 'test_score': array([0.70503564, 0.66126336, 0.84330169, 0.79674271, 0.77018645]),
 'train_score': array([0.84023253, 0.84280927, 0.80824339, 0.82833448, 0.82368469])}

In [41]:
test_scores = final_results['test_score']
train_scores = final_results['train_score']
print(np.mean(train_scores))
print(np.mean(test_scores))

0.8286608694466742
0.7553059694284988


In [42]:
estimator = GradientBoostingRegressor(learning_rate=0.1,
                                     min_samples_split=500,
                                     min_samples_leaf=50,
                                     max_depth=8,
                                     max_features='sqrt',
                                     subsample=0.8,
                                     random_state=10,
                                     n_estimators=240)

In [43]:
estimator.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=8,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=50, min_samples_split=500,
                          min_weight_fraction_leaf=0.0, n_estimators=240,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=10, subsample=0.8, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [45]:
estimator.score(X_test,y_test)

0.8092888852563106