Creamos los modelos por tipo de cliente

In [28]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor 

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once') 

In [3]:
# Colab-only step: opens the interactive file-upload widget. The saved output
# of this cell shows casual_encoded.csv being stored in the working directory,
# which is the file the next cell reads.
from google.colab import files
uploaded = files.upload() 

Saving casual_encoded.csv to casual_encoded.csv


In [29]:
# Load the encoded casual-user dataset; the first CSV column becomes the row index.
df_casual = pd.read_csv(
    filepath_or_buffer="casual_encoded.csv",
    index_col=0,
)
df_casual.head()

Unnamed: 0,dteday,temp_es,windspeed_es,casual_es,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,-0.827613,-0.387833,-0.755455,4,1,0,2,1,1,0
1,02-01-2018,-0.722069,0.748899,-1.046996,3,0,1,2,1,1,0
2,03-01-2018,-1.635432,0.745931,-1.063031,1,0,1,3,1,1,0
3,04-01-2018,-1.61556,-0.389769,-1.080523,1,0,1,3,1,1,0
4,05-01-2018,-1.468226,-0.046477,-1.118424,0,0,1,3,1,1,0


In [5]:
# Colab-only step: second interactive upload. The saved output of this cell
# shows registered_encoded.csv being stored in the working directory, which is
# the file the next cell reads.
from google.colab import files
uploaded = files.upload() 

Saving registered_encoded.csv to registered_encoded.csv


In [30]:
# Load the encoded registered-user dataset; the first CSV column becomes the row index.
df_registered = pd.read_csv(
    filepath_or_buffer="registered_encoded.csv",
    index_col=0,
)
df_registered.head()

Unnamed: 0,dteday,temp_es,windspeed_es,registered_es,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,01-01-2018,-0.827613,-0.387833,-1.927745,1,1,0,2,1,1,0
1,02-01-2018,-0.722069,0.748899,-1.91748,1,0,1,2,1,1,0
2,03-01-2018,-1.635432,0.745931,-1.558846,2,0,1,3,1,1,0
3,04-01-2018,-1.61556,-0.389769,-1.414494,3,0,1,3,1,1,0
4,05-01-2018,-1.468226,-0.046477,-1.373434,3,0,1,3,1,1,0


In [31]:
# Predictors: every column except the target and the date column.
X = df_casual.drop(columns=["casual_es", "dteday"])
# Target: the `casual_es` column.
y = df_casual.loc[:, "casual_es"]

In [32]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Baseline decision tree: default hyperparameters (no depth limit), fixed seed.
arbol = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
# Echo the fitted estimator as the cell output.
arbol

DecisionTreeRegressor(random_state=0)

In [11]:
# Plotting code for the baseline tree, currently commented out.
# NOTE(review): presumably disabled because the unrestricted tree is too deep
# to render legibly — confirm, then delete this cell or plot a pruned tree.
#fig = plt.figure(figsize = (10,6))
#tree.plot_tree(arbol, feature_names = x_train.columns, filled = True)
#plt.show()

In [12]:
# Upper bound to try for `max_features`: square root of the number of predictor columns.
n_predictoras = x_train.shape[1]
max_features = np.sqrt(n_predictoras)
max_features

3.0

In [13]:
# Depth reached by the unrestricted baseline tree.
print(arbol.get_depth())

19


In [9]:
# Hyperparameter grid passed as `param_grid` to the grid searches in this notebook.
param = {
    # The unrestricted tree was judged to overfit (its printed depth was 19),
    # so only clearly shallower trees are tried.
    "max_depth": [*range(4, 11), 12, 13, 14, 15],
    # Capped at 3, the square root of the number of predictors computed earlier.
    "max_features": [1, 2, 3],
    # These two are harder to choose; the values below are commonly used ones.
    "min_samples_split": [10, 50, 100],
    "min_samples_leaf": [10, 50, 100],
    # Single fixed seed so every candidate is repeatable.
    "random_state": [0],
}

In [35]:
# Exhaustive cross-validated search over `param` for a single decision tree.
gs = GridSearchCV(
    estimator=DecisionTreeRegressor(),  # model family being tuned
    param_grid=param,                   # hyperparameter combinations to evaluate
    cv=10,                              # 10-fold cross-validation
    # Keep the search silent. 0 is the quiet level; the previous value (-1) is
    # rejected by scikit-learn releases that validate constructor parameters,
    # where `verbose` must be a non-negative integer.
    verbose=0,
    return_train_score=True,            # also record scores on the training folds
    scoring="neg_mean_squared_error",   # candidates are ranked by negated MSE
)

In [36]:
# Run the cross-validated grid search for the decision tree on the training split.
gs.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [4, 5, 6, 7, 8, 9, 10, 12],
                         'max_features': [1, 2, 3],
                         'min_samples_leaf': [10, 50, 100],
                         'min_samples_split': [10, 50, 100],
                         'random_state': [0]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [37]:
# Best decision tree found by the search (scored with negated MSE), shown as cell output.
mejor_modelo = gs.best_estimator_
mejor_modelo

DecisionTreeRegressor(max_depth=5, max_features=3, min_samples_leaf=10,
                      min_samples_split=10, random_state=0)

In [38]:
# Predict with the tuned tree on both splits so train and test errors can be compared.
y_pred_train_dt2 = mejor_modelo.predict(X=x_train)
y_pred_test_dt2 = mejor_modelo.predict(X=x_test)

In [33]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics for the test and train splits.

    Parameters
    ----------
    y_test, y_train
        True target values of the test and training splits.
    y_test_pred, y_train_pred
        Model predictions for the test and training splits.
    tipo_modelo
        Label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Row 0 describes the test split and row 1 the training split, with
        columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, reales, predichos in particiones:
        mse = mean_squared_error(reales, predichos)
        filas.append(
            {
                "MAE": mean_absolute_error(reales, predichos),
                "MSE": mse,
                "RMSE": np.sqrt(mse),  # root of the MSE computed just above
                "R2": r2_score(reales, predichos),
                "set": nombre,
            }
        )

    tabla = pd.DataFrame(filas)
    tabla["modelo"] = tipo_modelo
    return tabla

In [40]:
# Train/test metrics of the tuned decision tree.
dt_results2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt2,
    y_train_pred=y_pred_train_dt2,
    tipo_modelo="Decision tree II",
)
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.380847,0.326443,0.571352,0.680226,test,Decision tree II
1,0.345174,0.269338,0.518978,0.72899,train,Decision tree II


In [41]:
# Hyperparameters of the best decision tree reported by the grid search,
# kept as single values (not lists).
param2 = dict(
    max_depth=5,
    max_features=3,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)

In [None]:
# NOTE(review): this cell was not valid Python (bare key/value pairs after
# `forest =`) and carries no execution count. It presumably meant to build a
# random forest with the hyperparameters that won the decision-tree search —
# confirm the intent. `forest` is not used by any later cell shown here.
forest = RandomForestRegressor(
    max_depth=5,
    max_features=3,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)

In [34]:
# Exhaustive cross-validated search over `param` for a random forest.
gs_rf = GridSearchCV(
    estimator=RandomForestRegressor(),  # model family being tuned
    param_grid=param,                   # hyperparameter combinations to evaluate
    cv=10,                              # 10-fold cross-validation
    # Keep the search silent. 0 is the quiet level; the previous value (-1) is
    # rejected by scikit-learn releases that validate constructor parameters,
    # where `verbose` must be a non-negative integer.
    verbose=0,
    return_train_score=True,            # also record scores on the training folds
    scoring="neg_mean_squared_error",   # candidates are ranked by negated MSE
)

In [12]:
# Run the cross-validated grid search for the random forest on the training split.
gs_rf.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15],
                         'max_features': [1, 2, 3],
                         'min_samples_leaf': [10, 50, 100],
                         'min_samples_split': [10, 50, 100],
                         'random_state': [0]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [13]:
# Best random forest found by the search (scored with negated MSE), shown as cell output.
bosque = gs_rf.best_estimator_
bosque

RandomForestRegressor(max_depth=6, max_features=3, min_samples_leaf=10,
                      min_samples_split=10, random_state=0)

In [14]:
# Larger max_depth values were added to the grid, but the search picked
# max_depth = 6 again and the resulting metrics were the same.
y_pred_train_rf = bosque.predict(X=x_train)
y_pred_test_rf = bosque.predict(X=x_test)

In [15]:
# Train/test metrics of the first random forest.
dt_results3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest I",
)
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.324401,0.216808,0.465626,0.787621,test,Random Forest I
1,0.33323,0.239996,0.489894,0.758515,train,Random Forest I


Como al ampliar el rango de max_depth nos salieron el mismo best estimator y las mismas métricas, vamos a probar a retirar alguna variable del modelo.

In [None]:
# Vimos en los boxplots que en los usuarios casuales no había mucha diferencia entre las medianas según si el día era laborable o no.

In [54]:
# Show the current predictor columns before any of them is removed.
X.head()

Unnamed: 0,temp_es,windspeed_es,dia_sem_encoded,laborables_0,laborables_1,weathersit_encoded,month_encoded,year_0,year_1
0,-0.827613,-0.387833,4,1,0,2,1,1,0
1,-0.722069,0.748899,3,0,1,2,1,1,0
2,-1.635432,0.745931,1,0,1,3,1,1,0
3,-1.61556,-0.389769,1,0,1,3,1,1,0
4,-1.468226,-0.046477,0,0,1,3,1,1,0


In [35]:
# Remove the working-day indicator columns from the test predictors.
# `errors="ignore"` keeps the cell idempotent: because it reassigns `x_test`,
# a second run would otherwise raise KeyError once the columns are gone.
x_test = x_test.drop(columns=['laborables_0', 'laborables_1'], errors="ignore")
x_test.head(2)

Unnamed: 0,temp_es,windspeed_es,dia_sem_encoded,weathersit_encoded,month_encoded,year_0,year_1
468,-0.003234,0.005358,2,3,5,0,1
148,0.939567,0.302416,3,3,6,1,0


In [36]:
# Remove the working-day indicator columns from the training predictors.
# `errors="ignore"` keeps the cell idempotent: because it reassigns `x_train`,
# a second run would otherwise raise KeyError once the columns are gone.
x_train = x_train.drop(columns=['laborables_0', 'laborables_1'], errors="ignore")
x_train.head(2)

Unnamed: 0,temp_es,windspeed_es,dia_sem_encoded,weathersit_encoded,month_encoded,year_0,year_1
247,0.971447,0.286389,1,2,7,1,0
335,-0.991579,-1.158085,2,3,3,1,0


In [None]:
# Vamos a por el modelo de Random Forest con las nuevas predictoras


In [37]:
# Exhaustive cross-validated search over `param` for a random forest, to be
# fitted on the reduced predictor set.
gs_rf2 = GridSearchCV(
    estimator=RandomForestRegressor(),  # model family being tuned
    param_grid=param,                   # hyperparameter combinations to evaluate
    cv=10,                              # 10-fold cross-validation
    # Keep the search silent. 0 is the quiet level; the previous value (-1) is
    # rejected by scikit-learn releases that validate constructor parameters,
    # where `verbose` must be a non-negative integer.
    verbose=0,
    return_train_score=True,            # also record scores on the training folds
    scoring="neg_mean_squared_error",   # candidates are ranked by negated MSE
)

In [38]:
# Run the second random-forest grid search on the current training predictors.
gs_rf2.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15],
                         'max_features': [1, 2, 3],
                         'min_samples_leaf': [10, 50, 100],
                         'min_samples_split': [10, 50, 100],
                         'random_state': [0]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [39]:
# Best random forest found by the second search, shown as cell output.
bosque2 = gs_rf2.best_estimator_
bosque2

RandomForestRegressor(max_depth=8, max_features=3, min_samples_leaf=10,
                      min_samples_split=10, random_state=0)

In [40]:
# Predict with the second forest on both splits so train and test errors can be compared.
y_pred_train_rf2 = bosque2.predict(X=x_train)
y_pred_test_rf2 = bosque2.predict(X=x_test)

In [41]:
# Train/test metrics of the second random forest.
dt_results4 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf2,
    y_train_pred=y_pred_train_rf2,
    tipo_modelo="Random Forest II",
)
dt_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.339536,0.234219,0.483962,0.770566,test,Random Forest II
1,0.348676,0.264978,0.51476,0.733377,train,Random Forest II
