# Bagging y Random Forest

![](Bagging.jpg)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import  StandardScaler
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder, OrdinalEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

set_config(display = 'diagram')



# Generando un modelo Predictivo en el Titanic

In [3]:
df = pd.read_csv('titanic.csv', index_col = 0)
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Signing_date
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1911-05-17
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1911-07-23
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1911-09-08
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1911-06-26
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1911-10-25
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1911-08-17
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1911-08-07
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,1912-01-30
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1911-08-08


In [4]:
df = df.drop(columns = ['Signing_date','Cabin','Ticket','Name'])
df[['Pclass','Sex','Embarked']] = df[['Pclass','Sex','Embarked']].astype('category')
df.dtypes

Survived       int64
Pclass      category
Sex         category
Age          float64
SibSp          int64
Parch          int64
Fare         float64
Embarked    category
dtype: object

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = 'Survived'), df.Survived, test_size = 0.3, random_state = 123)
is_cat = X_train.dtypes == 'category'

## Logistic Regression / Baseline

In [6]:
%%time
cat = Pipeline(steps = [
    ('imp_cat', SimpleImputer(strategy = 'most_frequent')), 
    ('enc', OneHotEncoder())
])

num = Pipeline(steps = [
    ('imp_num', SimpleImputer(strategy = 'mean')), 
    ('sc', StandardScaler())
])

prep = ColumnTransformer(transformers = [
    ('cat', cat, is_cat), 
    ('num', num, ~is_cat)
])

pipe = Pipeline(steps = [
    ('prep', prep), 
    ('rf', LogisticRegression(random_state = 123))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))


### Ojo que logramos un acuracy muy parecido a lo que logramos con el Decision Tree pero sin sobreajustar


              precision    recall  f1-score   support

           0      0.827     0.847     0.837       379
           1      0.753     0.725     0.739       244

    accuracy                          0.799       623
   macro avg      0.790     0.786     0.788       623
weighted avg      0.798     0.799     0.799       623

              precision    recall  f1-score   support

           0      0.843     0.824     0.833       170
           1      0.706     0.735     0.720        98

    accuracy                          0.791       268
   macro avg      0.775     0.779     0.777       268
weighted avg      0.793     0.791     0.792       268

Wall time: 113 ms


  elif pd.api.types.is_categorical(cols):


## Bagging

Se realiza un ensamble de Regresiones Lineales en la cual se utilizarán 20 regresiones combinadas con Bagging a modo de regularizador.

In [101]:
%%time
cat = Pipeline(steps = [
    ('imp_cat', SimpleImputer(strategy = 'most_frequent')), 
    ('enc', OneHotEncoder())
])

num = Pipeline(steps = [
    ('imp_num', SimpleImputer(strategy = 'mean')), 
    ('sc', StandardScaler())
])

prep = ColumnTransformer(transformers = [
    ('cat', cat, is_cat), 
    ('num', num, ~is_cat)
])

pipe = Pipeline(steps = [
    ('prep', prep), 
    ('rf', BaggingClassifier(base_estimator = LogisticRegression(), n_estimators = 20, random_state = 123, max_samples = 0.8, n_jobs = -1, max_features = 0.6))
])


pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))

              precision    recall  f1-score   support

           0      0.822     0.863     0.842       379
           1      0.769     0.709     0.738       244

    accuracy                          0.803       623
   macro avg      0.795     0.786     0.790       623
weighted avg      0.801     0.803     0.801       623

              precision    recall  f1-score   support

           0      0.838     0.853     0.845       170
           1      0.737     0.714     0.725        98

    accuracy                          0.802       268
   macro avg      0.787     0.784     0.785       268
weighted avg      0.801     0.802     0.802       268

Wall time: 213 ms


## Random Forest 

Implementación básica de un Random Forest

In [103]:
cat = Pipeline(steps = [
    ('imp_cat', SimpleImputer(strategy = 'most_frequent')), 
    ('OneHotEncoder', OrdinalEncoder())
])

num = Pipeline(steps = [
    ('imp_num', SimpleImputer(strategy = 'mean')), 
    ('sc', StandardScaler())
])

prep = ColumnTransformer(transformers = [
    ('cat', cat, is_cat), 
    ('num', num, ~is_cat)
])

pipe = Pipeline(steps = [
    ('prep', prep), 
    ('rf', RandomForestClassifier(random_state = 123, n_jobs = -1))
])


pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

# Overfitted

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       379
           1       0.98      0.99      0.98       244

    accuracy                           0.99       623
   macro avg       0.99      0.99      0.99       623
weighted avg       0.99      0.99      0.99       623

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       170
           1       0.73      0.73      0.73        98

    accuracy                           0.80       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



## Implementación RF avanzado

Se implementa un modelo de Random Forest más avanzado en el cual se realiza un GridSearch para encontrar parámetros óptimos.

Se presentan varios resultados mostrando el efecto de diversa combinación de hiperparámetros.

In [168]:
%%time
cat = Pipeline(steps = [
    ('imp_cat', SimpleImputer(strategy = 'most_frequent')), 
    ('enc', OrdinalEncoder())
])

num = Pipeline(steps = [
    ('imp_num', SimpleImputer(strategy = 'mean')), 
    ('sc', StandardScaler())
])

prep = ColumnTransformer(transformers = [
    ('cat', cat, is_cat), 
    ('num', num, ~is_cat)
])

pipe = Pipeline(steps = [
    ('prep', prep), 
    ('rf', RandomForestClassifier(random_state = 123, n_jobs = -1, oob_score = True))
])

params = {#'prep__num__sc': [StandardScaler(), 'passthorugh'],
        #'rf__criterion': ['gini', 'entropy'],
        #'prep__cat__enc': [OneHotEncoder(use_cat_names = True), OrdinalEncoder(), TargetEncoder()],
        'rf__ccp_alpha': [0.001, 0.01,0.1],#[0.1, 0.3, 0.5], #[0.001, 0.01,0.1]
        'rf__max_depth': [1, 5, 10],
        'rf__n_estimators': [300, 500],
        #'rf__min_samples_split': [0.01, 0.1]
         }

#params1 = {}

search = GridSearchCV(pipe, params, cv = 5, scoring = 'f1', n_jobs = -1)

#probar sólo con Scaling y sin escaling
# probar distintos encoders
# probar con distintos alpha mayores a 0.1
# acortar la complejidad del arbol con max_depth sin utilizar ccp_alpha, luego dejar con sólo valores menores a 1
# agrandar el bagging usando más estimadores
# probar si es que min_samples_split ayuda un poco más

Wall time: 1.01 ms


In [143]:
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))

#probar sólo con Scaling y sin escaling, overfitted

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       379
           1       0.98      0.99      0.98       244

    accuracy                           0.99       623
   macro avg       0.99      0.99      0.99       623
weighted avg       0.99      0.99      0.99       623

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       170
           1       0.73      0.73      0.73        98

    accuracy                           0.80       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [145]:
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))

# posiblemente óptimo o aun underfitted

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       379
           1       0.74      0.68      0.71       244

    accuracy                           0.78       623
   macro avg       0.77      0.76      0.77       623
weighted avg       0.78      0.78      0.78       623

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       170
           1       0.74      0.69      0.72        98

    accuracy                           0.80       268
   macro avg       0.78      0.78      0.78       268
weighted avg       0.80      0.80      0.80       268



In [167]:
%%time
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))
# distintos encoders

              precision    recall  f1-score   support

           0      0.992     0.987     0.989       379
           1      0.980     0.988     0.984       244

    accuracy                          0.987       623
   macro avg      0.986     0.987     0.987       623
weighted avg      0.987     0.987     0.987       623

              precision    recall  f1-score   support

           0      0.840     0.865     0.852       170
           1      0.753     0.714     0.733        98

    accuracy                          0.810       268
   macro avg      0.796     0.789     0.793       268
weighted avg      0.808     0.810     0.809       268

Wall time: 2.58 s


In [158]:
%%time
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))

# posiblemente otro óptimo o aun underfitted, es posible quizás sacarle más

              precision    recall  f1-score   support

           0      0.812     0.947     0.875       379
           1      0.890     0.660     0.758       244

    accuracy                          0.835       623
   macro avg      0.851     0.804     0.816       623
weighted avg      0.842     0.835     0.829       623

              precision    recall  f1-score   support

           0      0.839     0.947     0.890       170
           1      0.882     0.684     0.770        98

    accuracy                          0.851       268
   macro avg      0.860     0.815     0.830       268
weighted avg      0.854     0.851     0.846       268

Wall time: 5.75 s


In [160]:
%%time
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))

#subir n_estimators

              precision    recall  f1-score   support

           0      0.835     0.950     0.889       379
           1      0.901     0.709     0.794       244

    accuracy                          0.856       623
   macro avg      0.868     0.829     0.841       623
weighted avg      0.861     0.856     0.852       623

              precision    recall  f1-score   support

           0      0.834     0.947     0.887       170
           1      0.880     0.673     0.763        98

    accuracy                          0.847       268
   macro avg      0.857     0.810     0.825       268
weighted avg      0.851     0.847     0.842       268

Wall time: 35.2 s


In [169]:
%%time
search.fit(X_train, y_train)
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print(classification_report(y_train, y_pred_train, digits = 3))
print(classification_report(y_test, y_pred, digits = 3))
# modelo final

              precision    recall  f1-score   support

           0      0.835     0.950     0.889       379
           1      0.901     0.709     0.794       244

    accuracy                          0.856       623
   macro avg      0.868     0.829     0.841       623
weighted avg      0.861     0.856     0.852       623

              precision    recall  f1-score   support

           0      0.834     0.947     0.887       170
           1      0.880     0.673     0.763        98

    accuracy                          0.847       268
   macro avg      0.857     0.810     0.825       268
weighted avg      0.851     0.847     0.842       268

Wall time: 37.9 s


In [170]:
pd.DataFrame(search.cv_results_).sort_values(by = 'rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__ccp_alpha,param_rf__max_depth,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,3.1614,0.247645,0.2874,0.026874,0.001,5,500,"{'rf__ccp_alpha': 0.001, 'rf__max_depth': 5, '...",0.777778,0.717391,0.744186,0.776471,0.758621,0.754889,0.022469,1
2,1.975599,0.141807,0.2042,0.029526,0.001,5,300,"{'rf__ccp_alpha': 0.001, 'rf__max_depth': 5, '...",0.772727,0.717391,0.744186,0.767442,0.744186,0.749187,0.01974,2
9,3.692174,0.275891,0.3616,0.054481,0.01,5,500,"{'rf__ccp_alpha': 0.01, 'rf__max_depth': 5, 'r...",0.795455,0.709677,0.729412,0.761905,0.738095,0.746909,0.029501,3
8,2.209053,0.140062,0.392806,0.030942,0.01,5,300,"{'rf__ccp_alpha': 0.01, 'rf__max_depth': 5, 'r...",0.795455,0.709677,0.729412,0.761905,0.731707,0.745631,0.029989,4
11,3.377199,0.187441,0.281001,0.011611,0.01,10,500,"{'rf__ccp_alpha': 0.01, 'rf__max_depth': 10, '...",0.786517,0.702128,0.729412,0.761905,0.738095,0.743611,0.028735,5


In [171]:
best_model.named_steps.rf.oob_score_

0.8170144462279294

### Feature Importance

Se muestra el mecanismo de rescate de Feature Importance para un Random Forest embebido en un Pipeline

In [184]:
pd.Series(best_model.named_steps.rf.feature_importances_, X_train.columns).sort_values(ascending = False)

Sex         0.424795
Embarked    0.185981
SibSp       0.132018
Pclass      0.129398
Parch       0.054344
Fare        0.038931
Age         0.034533
dtype: float64

### Ingresar al Pipeline y extraer valores de Imputación Numérica

In [189]:
best_model.named_steps.prep.named_transformers_.num.named_steps.imp_num.statistics_

array([29.88569106,  0.51685393,  0.36597111, 33.32687159])