# Tema 11 - Ejercicio 2
## Mejorando un modelo de machine learning

2. Usando los diferentes métodos de meta-aprendizaje propuestos en el
capítulo 14 (bagging, boosting, random forests), elabore modelos con el
dataset de la prueba de evaluación del tema 5.


Importamos dependencias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import seaborn as sb

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
#in the doc: "Able to handle both numerical and categorical data. However, the scikit-learn implementation does not support categorical variables for now."
from sklearn.preprocessing import OneHotEncoder  

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

## Paso 1: importar datos

Importamos el fichero (lo exportamos previamente en RStudio)

In [3]:
carseats_0 = pd.read_csv(r"carseats.csv",sep=',')

## Paso 2: procesar datos

In [4]:
carseats_0.describe()

Unnamed: 0.1,Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,115.614301,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,1.0,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,100.75,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,200.5,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,300.25,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,400.0,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


Vemos que tenemos un campo "unnamed", y que las variables categóricas no aparecen. <br>
Para mostrar algunas estadísticas sobre ellas:

In [5]:
# We have some categorical variables. To show them
carseats_0.describe(include='object')

Unnamed: 0,ShelveLoc,Urban,US
count,400,400,400
unique,3,2,2
top,Medium,Yes,Yes
freq,219,282,258


Lo primero que hay que hacer es eliminar la primera columna, que no sirve para nada:

In [6]:
#remove first column
carseats = carseats_0.drop(carseats_0.columns[0], axis=1)

In [7]:
carseats.head(5)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


Debido a que la implementación de sckit-learn no puede tratar variables categóricas, hay que transformar las 3 variables categóricas mediante **"one hot encoding"**

In [8]:
carseats_cat = carseats.select_dtypes(include='object')

In [9]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='error')

In [10]:
carseats_cat_encoded = encoder.fit_transform(carseats_cat)

In [11]:
#categorical columns
#carseats_cat.columns

In [12]:
#Categories the encoder found:
for cat in encoder.categories_:
    print(cat)

['Bad' 'Good' 'Medium']
['No' 'Yes']
['No' 'Yes']


In [13]:
#more pythonic style:
categorical_columns = [f"{col}_{cat}" for i, col in enumerate(carseats_cat.columns) for cat in encoder.categories_[i]]
#categorical_columns

In [14]:
#put the one-hot encoded features into their own dataframe
one_hot_features = pd.DataFrame(carseats_cat_encoded, columns=categorical_columns)
#one_hot_features.head(5)

In [15]:
#dataframe with only numerical features
carseats = carseats.select_dtypes(exclude='object')
#carseats.head(5)

In [16]:
#Putting all together
carseats =  carseats.join(one_hot_features)
#carseats.head(5)

In [17]:
#carseats.describe()

Finalmente, queda tranformar la variable numérica "Sales" en una variable categórica tipo "Sí"/"No". Al igual que en R, consideramos que si el valor es superior a la media, el valor será "Sí".

In [18]:
sales_mean = np.mean(carseats["Sales"])
sales_mean

7.496325000000001

In [19]:
sales_max = np.max(carseats["Sales"])
sales_max

16.27

In [20]:
#use pd.cut
carseats["SalesCat"] = pd.cut(x=carseats["Sales"], bins=[0.0, sales_mean, sales_max], labels=["No", "Yes"])
carseats["SalesCat"].head(175)

0      Yes
1      Yes
2      Yes
3       No
4       No
      ... 
170    Yes
171    Yes
172    Yes
173     No
174    NaN
Name: SalesCat, Length: 175, dtype: category
Categories (2, object): ['No' < 'Yes']

Por alguna extraña razón, en la fila 174, en lugar de transformar 0.0 en "No", se obtiene un Nan:

In [21]:
#find nan in SalesCat 
carseats.loc[pd.isna(carseats["SalesCat"]), :].index  

Index([174], dtype='int64')

Como es evidente que 0.0 es menor que la media, asignamos "No" de manera manual

In [22]:
#For a reason I cannot get, pd.cut above transforms the value in row 174 from 0.0 into "NaN":
# ..
# 173     No
# 174    NaN
# As the original value was 0.0 < sales_mean, we assign "No" manually
carseats["SalesCat"][174] = "No"
carseats["SalesCat"][174]

'No'

In [23]:
carseats.head(175)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes,SalesCat
0,9.50,138,73,11,276,120,42,17,1.0,0.0,0.0,0.0,1.0,0.0,1.0,Yes
1,11.22,111,48,16,260,83,65,10,0.0,1.0,0.0,0.0,1.0,0.0,1.0,Yes
2,10.06,113,35,10,269,80,59,12,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Yes
3,7.40,117,100,4,466,97,55,14,0.0,0.0,1.0,0.0,1.0,0.0,1.0,No
4,4.15,141,64,3,340,128,38,13,1.0,0.0,0.0,0.0,1.0,1.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,8.01,128,39,12,356,118,71,10,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Yes
171,12.49,93,106,12,416,55,75,15,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Yes
172,9.03,104,102,13,123,110,35,16,0.0,1.0,0.0,0.0,1.0,0.0,1.0,Yes
173,6.38,135,91,5,207,128,66,18,0.0,0.0,1.0,0.0,1.0,0.0,1.0,No


(Todo ok ahora)

In [24]:
# features and target
y = carseats["SalesCat"]
X = carseats.drop("Sales",axis=1).drop("SalesCat",axis=1)


In [25]:
#to check if there are nan
#y.isnull().any().any()

Ahora ya podemos separar los datos en dos conjuntos diferentes (entrenamiento y test)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, shuffle=True, stratify=y)
print(f"x_train.shape: {x_train.shape}, x_test.shape: {x_test.shape}, y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}") 

x_train.shape: (300, 14), x_test.shape: (100, 14), y_train.shape: (300,), y_test.shape: (100,)


## Aplicación de técnicas "ensemble"

In [27]:
# aux. function not really needed now
def classifier_testing(clf, X_train, X_test, y_train, y_test):
    # Training
    clf.fit(X_train, y_train)
  
    #Predictions
    y_pred = clf.predict(X_test)

    #Accuracy
    clf_accuracy_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score:\n", clf_accuracy_score, "\n")

    #Classification Report
    class_rep = classification_report(y_test, y_pred)
    print("Classification Report:\n", class_rep, "\n")

    #Confusion Matrix
    conf_mtx = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_mtx, "\n")

### Bagging

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier

In [28]:
from sklearn.ensemble import BaggingClassifier

In [29]:
weak_learner = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=25)

bagging = BaggingClassifier(weak_learner,                      # If None, then the base estimator is a DecisionTreeClassifier. (but anyway, we explicitly show it)
                            n_estimators = 200,                # The number of base estimators in the ensemble. default= 10 .. increase to 20, 50 .. it improves result :)
                            max_samples = 0.5,                 # If float, then draw max_samples * X.shape[0] samples.
                            max_features = 0.5,                # The number of features to draw from X to train each base estimator
                            random_state = 123
                            )               

In [30]:
bagging.fit(x_train, y_train)
len(bagging.estimators_)

200

In [31]:
#prediction
y_pred = bagging.predict(x_test)

In [32]:
#Confusion Matrix
conf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mtx, "\n")

Confusion Matrix:
 [[42  8]
 [18 32]] 



In [33]:
#Accuracy
clf_accuracy_score = accuracy_score(y_test, y_pred)
print("Accuracy Score:\n", clf_accuracy_score, "\n")


Accuracy Score:
 0.74 



Mejor resultado:  max_leaf_nodes=25, n_estimators = 200, max_samples = 0.5, max_features = 0.5

Accuracy Score: </br>
 0.78  (algunas veces 0.76, otras 0.77 .. otras 0.72 ... antes de añadir random_state = 123)

Confusion Matrix: </br>
 [[45   5] </br>
 [17 33]] 

</br>

**Resultado previo sin bagging:**

Accuracy Score:
 0.72 

Classification Report:
               precision    recall  f1-score   support

          No       0.70      0.76      0.73        50
         Yes       0.74      0.68      0.71        50

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100
 

Confusion Matrix:</br>
 [[38 12]</br>
 [16 34]] 

El resultado es muy parecido al obtenido en R (con pequeñas diferencias según el valor de max_leaf_nodes), pero ligeramente inferior.

En R:
        Reference
  Prediction No Yes
      No   41 16
      Yes   9 33

    Accuracy : 0.7475

En R, con bagging, se obtuvo 0.78 también.

### Boosting

**Adaboost**, como en R:

https://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_multiclass.html#sphx-glr-auto-examples-ensemble-plot-adaboost-multiclass-py  <br>

In [34]:
from sklearn.ensemble import AdaBoostClassifier

In [35]:
weak_learner = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=25)
n_estimators = 200

In [36]:
adaboost_clf = AdaBoostClassifier(
    estimator = weak_learner,
    n_estimators = n_estimators,
    random_state = 123,
)

In [37]:
adaboost_clf.fit(x_train, y_train)
len(adaboost_clf.estimators_)

200

In [38]:
#prediction
y_pred = adaboost_clf.predict(x_test)

In [39]:
#Confusion Matrix
conf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mtx, "\n")

Confusion Matrix:
 [[43  7]
 [11 39]] 



In [40]:
#Accuracy
clf_accuracy_score = accuracy_score(y_test, y_pred)
print("Accuracy Score:\n", clf_accuracy_score, "\n")

Accuracy Score:
 0.82 



Con boosting el resultado es ligeramente mejor que en R (eligiendo max_leaf_nodes=8, y también con 25)

En R:<br>
&emsp; &emsp; &emsp; &emsp; Reference<br>
&emsp; Prediction  No Yes<br>
&emsp; &emsp; &emsp; No &emsp; 43  15<br>
&emsp; &emsp; &emsp; Yes &emsp;7  34<br>
                                        
&emsp; &emsp; Accuracy : 0.7778<br>

### Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf = RandomForestClassifier(n_estimators = 200,   # The number of trees in the forest. default 100
                            #max_depth=50,         # The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
                            random_state = 123)

rf.fit(x_train, y_train)
len(rf.estimators_)

200

In [43]:
#prediction
y_pred = rf.predict(x_test)

In [44]:
#Confusion Matrix
conf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mtx, "\n")

Confusion Matrix:
 [[42  8]
 [18 32]] 



In [45]:
#Accuracy
clf_accuracy_score = accuracy_score(y_test, y_pred)
print("Accuracy Score:\n", clf_accuracy_score, "\n")

Accuracy Score:
 0.74 



TBD:

usar GridSearchCV
