# Desafío - Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid, RepeatedKFold, GridSearchCV, train_test_split
from warnings import WarningMessage
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, scale
from sklearn.svm import SVC
import multiprocessing

WarningMessage = False
plt.rcParams["figure.dpi"] = 150
plt.rcParams["font.family"] = "Fira Sans Extra Condensed"

In [2]:
# Datos de entrenamiento
data_web = pd.read_csv("data/usuarios_win_mac_lin_train.csv")
data_web.describe()

# Datos de validacion
data_validation = pd.read_csv("data/data_validation_without_class.csv")

## Analisis de los datos

In [None]:
hist = data_web.hist(bins=10)

In [None]:
corr = data_web.corr()
sns.heatmap(corr, annot = True, square = True);

In [None]:
sns.pairplot(data_web, hue="clase", height = 2, palette = 'colorblind')

## Revision de outliers

## Análisis de componentes principales (PCA)

In [None]:
# media de cada variable
print('Media de las variables')
print(data_web.mean(axis=0))
# Varianza
print('Varianza de las variables')
print(data_web.var(axis=0))

Hay que estandarizar los datos para que variables con alta media y varianza no dominen el PCA.

In [None]:
# Entrenamiento modelo PCA con escalado de los datos
# ==============================================================================
pca_pipe = make_pipeline(StandardScaler(), PCA()) # Estandar Scaler estandariza, PCA obtiene componentes
pca_pipe.fit(data_web)

# Se extrae el modelo entrenado del pipeline
modelo_pca = pca_pipe.named_steps['pca']

In [None]:
modelo_pca.get_params()

Convierto los componentes del modelo para analizar sus combinaciones lineales e importancia

In [None]:
# Se combierte el array a dataframe para añadir nombres a los ejes.
pd.DataFrame(
    data    = modelo_pca.components_,
    columns = data_web.columns,
    index   = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
)

Analisis visual

In [None]:
# Heatmap componentes
# ==============================================================================
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 2))
componentes = modelo_pca.components_
plt.imshow(componentes.T, cmap='viridis', aspect='auto')
plt.yticks(range(len(data_web.columns)), data_web.columns)
plt.xticks(range(len(data_web.columns)), np.arange(modelo_pca.n_components_) + 1)
plt.grid(False)
plt.colorbar()

In [None]:
# A partir de los autovalores, calculamos la 
# varianza explicada (% representatividad)

var_exp = modelo_pca.explained_variance_ratio_ *100  # ratio de varianza explicada por 100 (para que quede en %)
cum_var_exp = np.cumsum(var_exp) # varianza acumulada por componente 

n_components = len(modelo_pca.components_)
# Representamos en un diagrama de barras la varianza explicada por cada autovalor, y la acumulada
# with plt.style.context('seaborn-pastel'):
plt.figure(figsize=(4, 3))

plt.bar(range(n_components), var_exp, alpha=0.5, align='center',
        label='Varianza individual explicada', color='g')
plt.step(range(n_components), cum_var_exp, where='mid', linestyle='--', label='Varianza explicada acumulada')
plt.ylabel('Ratio de Varianza Explicada')
plt.xlabel('Componentes Principales')
plt.legend(loc='best')
plt.tight_layout()
## Pendiente -> anotar labels 

# Datos de entrenamiento

In [3]:
X=data_web[['duracion', 'paginas', 'acciones', 'valor']]  # Features
y=data_web['clase']  # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y) 

Creamos dict con pesos de cada clase

In [4]:
pesos = {
    0:0.5116279,
    1:0.2558135,
    2:0.23255813
}

# Modelo 1 -  DecisionTree

Clasificador DecisionTree


In [None]:
# Con menos componentes
X=data_web[['acciones','valor']]  # Features
y=data_web['clase']  # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y) 

In [None]:
# Árbol de desición
modelo_tree=DecisionTreeClassifier(max_depth = 3, random_state = 1, )
# fit
modelo_tree.fit(X_train,y_train)

y_pred=modelo_tree.predict(X_test)

In [None]:
modelo_tree.feature_importances_

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
#fn = ['duracion', 'paginas', 'acciones', 'valor']
fn = ['duracion', 'valor']
cn = ["0", "1", "2"]

plt.figure(figsize = (5,4))
plot_tree(modelo_tree, feature_names = fn, class_names = cn, filled = True)
plt.tight_layout(h_pad=0.5, w_pad=0.5)

# Si el gini tiende a cero, el error tiende a cero

In [None]:
predicciones = modelo_tree.predict(X = X_test)
print(metrics.classification_report(y_true = y_test,
y_pred = predicciones
))

# Modelo 2 - Random Forest



In [5]:
modelo_rf =  RandomForestClassifier(
            n_estimators = 100,
            n_jobs       = -1,
            max_depth= 6,
            random_state = 1
         )
      
modelo_rf.fit(X_train.values, y_train.values)

RandomForestClassifier(max_depth=6, n_jobs=-1, random_state=1)

In [6]:
# Accuracy
# ==============================================================================
y_pred = modelo_rf.predict(X_test.values)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6744186046511628


In [7]:
predicciones = modelo_rf.predict(X = X_test.values)
print(metrics.classification_report(y_true = y_test,
y_pred = predicciones
))

              precision    recall  f1-score   support

           0       0.65      0.77      0.71        22
           1       0.50      0.20      0.29        10
           2       0.77      0.91      0.83        11

    accuracy                           0.67        43
   macro avg       0.64      0.63      0.61        43
weighted avg       0.65      0.67      0.64        43



In [8]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = ParameterGrid(
                {'n_estimators': [100,150],
                 'max_features': [1,2],
                 'max_depth'   : [None,3, 6, 10],
                 'criterion'   : ['gini', 'entropy']
                }
            )

# Loop para ajustar un modelo con cada combinación de hiperparámetros
# ==============================================================================
resultados = {'params': [], 'oob_accuracy': []}

for params in param_grid:
    
    modelo = RandomForestClassifier(
                oob_score    = True,
                n_jobs       = -1,
                random_state = 123,
                ** params
             )
    
    modelo.fit(X_train.values, y_train)
    
    resultados['params'].append(params)
    resultados['oob_accuracy'].append(modelo.oob_score_)
    print(f"Modelo: {params} \u2713")

# Resultados
# ==============================================================================
resultados = pd.DataFrame(resultados)
resultados = pd.concat([resultados, resultados['params'].apply(pd.Series)], axis=1)
resultados = resultados.sort_values('oob_accuracy', ascending=False)
resultados = resultados.drop(columns = 'params')
resultados.head(4)

Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 1, 'n_estimators': 100} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 1, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 2, 'n_estimators': 100} ✓
Modelo: {'criterion': 'gini', 'max_depth': None, 'max_features': 2, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 1, 'n_estimators': 100} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 1, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 2, 'n_estimators': 100} ✓
Modelo: {'criterion': 'gini', 'max_depth': 3, 'max_features': 2, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': 6, 'max_features': 1, 'n_estimators': 100} ✓
Modelo: {'criterion': 'gini', 'max_depth': 6, 'max_features': 1, 'n_estimators': 150} ✓
Modelo: {'criterion': 'gini', 'max_depth': 6, 'max_features': 2, 'n_estimators': 100} ✓
Modelo: {'criterion'

Unnamed: 0,oob_accuracy,criterion,max_depth,max_features,n_estimators
16,0.732283,entropy,,1,100
28,0.724409,entropy,10.0,1,100
12,0.716535,gini,10.0,1,100
8,0.708661,gini,6.0,1,100


In [None]:
# Error de test del modelo final
# ==============================================================================
modelo_rf =  RandomForestClassifier(
            n_estimators = 150,
            n_jobs       = -1,
            max_depth= 6,
            random_state = 1,
            max_features = 2,
            criterion='entropy'
         )

modelo_rf.fit(X_train.values, y_train)        

predicciones = modelo_rf.predict(X = X_test.values)
mat_confusion = metrics.confusion_matrix(
                    y_true    = y_test,
                    y_pred    = predicciones
                )

accuracy = metrics.accuracy_score(
            y_true    = y_test,
            y_pred    = predicciones,
            normalize = True
           )

print("Matriz de confusión")
print("-------------------")
print(mat_confusion)
print("")
print(f"El accuracy de test es: {100 * accuracy} %")

# Modelo 3 - RandomForest con PCA

In [9]:
# Entrenamiento modelo PCA con escalado de los datos
# ==============================================================================
# N components = 4 por el análisis anterior
pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=4), RandomForestClassifier(random_state=1, class_weight=pesos) ) # Estandar Scaler estandariza, PCA obtiene componentes
pca_pipe.fit(X_train.values, y_train.values)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=4)),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.5116279,
                                                      1: 0.2558135,
                                                      2: 0.23255813},
                                        random_state=1))])

Reviso  metricas del modelo

In [10]:
predicciones = pca_pipe.predict(X = X_test.values)
print(metrics.classification_report(y_true = y_test,
y_pred = predicciones
))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.75      0.60      0.67        10
           2       0.83      0.91      0.87        11

    accuracy                           0.79        43
   macro avg       0.79      0.78      0.78        43
weighted avg       0.79      0.79      0.79        43



In [11]:
pca_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'pca', 'randomforestclassifier', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'randomforestclassifier__bootstrap', 'randomforestclassifier__ccp_alpha', 'randomforestclassifier__class_weight', 'randomforestclassifier__criterion', 'randomforestclassifier__max_depth', 'randomforestclassifier__max_features', 'randomforestclassifier__max_leaf_nodes', 'randomforestclassifier__max_samples', 'randomforestclassifier__min_impurity_decrease', 'randomforestclassifier__min_samples_leaf', 'randomforestclassifier__min_samples_split', 'randomforestclassifier__min_weight_fraction_leaf', 'randomforestclassifier__n_estimators', 'randomforestclassifier__n_jobs', 'randomforestclassifier__oob_score', 'randomforestclassifier__random_state', 'randomforestclassifier__verbose', 'randomforestclass

In [14]:
param_dict = {
            "randomforestclassifier__criterion": ['gini', 'entropy'],
            "randomforestclassifier__n_estimators":[100,150,200,500,1000],
            'randomforestclassifier__criterion': ['gini', 'entropy'],
            'randomforestclassifier__max_features':   [1,2,3,4],
            

              }

estimator = GridSearchCV(pca_pipe,
                         param_dict,
                         verbose=2)


In [15]:
estimator.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=100; total time=   0.2s
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=100; total time=   0.2s
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=100; total time=   0.2s
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=100; total time=   0.1s
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=100; total time=   0.2s
[CV] END randomforestclassifier__criterion=gini, randomforestclassifier__max_features=1, randomforestclassifier__n_estimators=150; total time=   0.3s
[CV] END randomforestclassifier__crite

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('pca', PCA(n_components=4)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(class_weight={0: 0.5116279,
                                                                             1: 0.2558135,
                                                                             2: 0.23255813},
                                                               random_state=1))]),
             param_grid={'randomforestclassifier__criterion': ['gini',
                                                               'entropy'],
                         'randomforestclassifier__max_features': [1, 2, 3, 4],
                         'randomforestclassifier__n_estimators': [100, 150, 200,
                                                                  500, 1000]},
             verbose=2)

In [16]:
modelo_final = estimator.best_estimator_

In [17]:
modelo_final

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=4)),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.5116279,
                                                      1: 0.2558135,
                                                      2: 0.23255813},
                                        max_features=4, n_estimators=500,
                                        random_state=1))])

In [18]:
#  test del modelo final
# ==============================================================================
predicciones = modelo_final.predict(X = X_test)

In [19]:
print(
    metrics.classification_report(
        y_true = y_test,
        y_pred = predicciones
    )
)

              precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.75      0.60      0.67        10
           2       0.83      0.91      0.87        11

    accuracy                           0.79        43
   macro avg       0.79      0.78      0.78        43
weighted avg       0.79      0.79      0.79        43



## ADA Boost

https://machinelearningmastery.com/adaboost-ensemble-in-python/  
https://www.cienciadedatos.net/documentos/py09_gradient_boosting_python.html

In [66]:
Rfc = RandomForestClassifier(
                                        max_features=4, n_estimators=500,
                                        random_state=1)

adb_pipe = make_pipeline(StandardScaler(), PCA(n_components=4), AdaBoostClassifier(random_state=1,base_estimator=Rfc, learning_rate=0.1) ) # Estandar Scaler estandariza, PCA obtiene componentes
adb_pipe.fit(X_train.values, y_train.values)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=4)),
                ('adaboostclassifier',
                 AdaBoostClassifier(base_estimator=RandomForestClassifier(max_features=4,
                                                                          n_estimators=500,
                                                                          random_state=1),
                                    learning_rate=0.1, random_state=1))])

In [39]:
adb_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'pca', 'adaboostclassifier', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'adaboostclassifier__algorithm', 'adaboostclassifier__base_estimator', 'adaboostclassifier__learning_rate', 'adaboostclassifier__n_estimators', 'adaboostclassifier__random_state'])

In [67]:
predicciones = adb_pipe.predict(X = X_test)
print(metrics.classification_report(y_true = y_test,
y_pred = predicciones
))



              precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.75      0.60      0.67        10
           2       0.83      0.91      0.87        11

    accuracy                           0.79        43
   macro avg       0.79      0.78      0.78        43
weighted avg       0.79      0.79      0.79        43



In [68]:
predicciones

array([0, 2, 1, 0, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 1, 0, 2, 0, 1, 2,
       0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 0],
      dtype=int64)