## Entregable Grupo 13: Aprendizaje supervisado

## Paquetes requeridos

In [16]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
import sklearn.linear_model ##SVM
import sklearn.neural_network ##SVM
from sklearn import neural_network
from sklearn import ensemble ##Random Forest
import xgboost as xgb ##Boosting

## Datos

In [17]:
train_df = pd.read_csv("../data/travel_insurance_prediction_train.csv")
test_df = pd.read_csv("../data/travel_insurance_prediction_test.csv")

Se realizaron distintas transformaciones de los atributos que implicaron la discretización de variables, la eliminación de aquellas poco relevantes y la conversión de atributos categóricos a variables dummy.

Para cada una de los datasets preprocesados se aplicaron técnicas de Validación Cruzada y Gridsearch para optimizar los hiperparámetros según la métrica elegica (f1_score).
A continuación se presentan los Modelos con valores de F1 score macro average más altos

#### Modelo 1: Algoritmo:  XGBOOST
#### Preprocesado: Discretización de las features `Age` y `AnnualIncome`

In [18]:
transformer = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="onehot-dense", strategy="quantile"), ["Age", "AnnualIncome"]),
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer", "EverTravelledAbroad"]),
    remainder="passthrough")

In [19]:
X_train = transformer.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance"]))
y_train = train_df["TravelInsurance"].values

In [20]:
X_test=transformer.transform(test_df.drop(columns=["Customer"]))

#### División de los datos en train y validation

In [21]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.20, random_state=42)

#### Modelado

In [22]:
best_xg_reg=xgb.XGBClassifier(alpha=0.0001, colsample_bytree=1,
              learning_rate=0.4, max_depth=7,           
              n_estimators=5, objetive="binary:logistic")
best_xg_reg.fit(X_train, y_train)

Parameters: { objetive } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.4, max_delta_step=0,
              max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5, n_jobs=12,
              num_parallel_tree=1, objetive='binary:logistic', random_state=0,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [23]:
y_train_pred_best= best_xg_reg.predict(X_train)
y_val_pred_best= best_xg_reg.predict(X_val)

print("ENTRENAMIENTO")
print(classification_report(y_train, y_train_pred_best))

print("TESTEO")
print(classification_report(y_val, y_val_pred_best))

ENTRENAMIENTO
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       751
           1       0.89      0.62      0.73       441

    accuracy                           0.83      1192
   macro avg       0.85      0.79      0.80      1192
weighted avg       0.84      0.83      0.82      1192

TESTEO
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       207
           1       0.91      0.67      0.77        91

    accuracy                           0.88       298
   macro avg       0.89      0.82      0.84       298
weighted avg       0.88      0.88      0.87       298



#### Modelo 2: Algoritmo:  RandomForestClassifier
#### Preprocesado: Discretización de las features `Age`,  `AnnualIncome` y `FamilyMembers` 

In [24]:
transformer = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="onehot-dense", strategy="quantile"), ["Age", "AnnualIncome"]),(KBinsDiscretizer(n_bins=4, encode="onehot-dense", strategy="quantile"), ["FamilyMembers"]),
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FrequentFlyer", "EverTravelledAbroad"]),
    remainder="passthrough")

In [25]:
X_train = transformer.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance"]))
y_train = train_df["TravelInsurance"].values

In [26]:
X_test=transformer.transform(test_df.drop(columns=["Customer"]))

#### División de los datos en train y validation

In [27]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.20, random_state=42)

#### Modelado

In [28]:
best_RM=ensemble.RandomForestClassifier(max_depth=10, min_samples_leaf=4, n_estimators=20,
                       random_state=42)
best_RM.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=4, n_estimators=20,
                       random_state=42)

In [29]:
y_train_pred_best= best_RM.predict(X_train)
y_val_pred_best= best_RM.predict(X_val)

print("ENTRENAMIENTO")
print(classification_report(y_train, y_train_pred_best))

print("TESTEO")
print(classification_report(y_val, y_val_pred_best))

ENTRENAMIENTO
              precision    recall  f1-score   support

           0       0.80      0.96      0.87       751
           1       0.89      0.59      0.71       441

    accuracy                           0.82      1192
   macro avg       0.85      0.78      0.79      1192
weighted avg       0.83      0.82      0.81      1192

TESTEO
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       207
           1       0.92      0.66      0.77        91

    accuracy                           0.88       298
   macro avg       0.90      0.82      0.84       298
weighted avg       0.88      0.88      0.87       298



#### Modelo 3: Algoritmo:  SVM
#### Preprocesado: Discretización de las features `Age`,  `AnnualIncome` y `FamilyMembers` (utilizado en paso anterior) 

In [30]:
best_svm=sklearn.svm.SVC(C=1)
best_svm.fit(X_train, y_train)

SVC(C=1)

In [31]:
y_train_pred_best= best_svm.predict(X_train)
y_val_pred_best= best_svm.predict(X_val)

print("ENTRENAMIENTO")
print(classification_report(y_train, y_train_pred_best))

print("TESTEO")
print(classification_report(y_val, y_val_pred_best))

ENTRENAMIENTO
              precision    recall  f1-score   support

           0       0.80      0.96      0.87       751
           1       0.90      0.58      0.71       441

    accuracy                           0.82      1192
   macro avg       0.85      0.77      0.79      1192
weighted avg       0.83      0.82      0.81      1192

TESTEO
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       207
           1       0.92      0.66      0.77        91

    accuracy                           0.88       298
   macro avg       0.90      0.82      0.84       298
weighted avg       0.88      0.88      0.87       298



#### Modelo 4: Algoritmo:  XGBOOST
#### Preprocesado:  Modificación de la cantidad de bins de las variables a discretizar y eliminación de la columna no relevante `ChronicDiseases`

In [32]:
transformer = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="onehot-dense", strategy="quantile"), ["Age", "FamilyMembers"]), (KBinsDiscretizer(n_bins=6, encode="onehot-dense", strategy="quantile"), ["AnnualIncome"]),
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "FrequentFlyer", "EverTravelledAbroad", "GraduateOrNot"]),
    remainder="passthrough")

In [33]:
X_train = transformer.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance", "ChronicDiseases"]))
y_train = train_df["TravelInsurance"].values

In [34]:
X_test = transformer.transform(test_df.drop(columns=["Customer", "ChronicDiseases"]))

#### División de los datos en train y validation

In [35]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.20, random_state=42)

#### Modelado

In [36]:
best_xg_reg=xgb.XGBClassifier(alpha=0.0001, colsample_bytree=1, 
              learning_rate=0.2, max_depth=4,           
              n_estimators=1, objetive="binary:logistic")
best_xg_reg.fit(X_train, y_train)

Parameters: { objetive } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






XGBClassifier(alpha=0.0001, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1, n_jobs=12,
              num_parallel_tree=1, objetive='binary:logistic', random_state=0,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [37]:
y_train_pred_best= best_xg_reg.predict(X_train)
y_val_pred_best= best_xg_reg.predict(X_val)

print("ENTRENAMIENTO")
print(classification_report(y_train, y_train_pred_best))

print("TESTEO")
print(classification_report(y_val, y_val_pred_best))

ENTRENAMIENTO
              precision    recall  f1-score   support

           0       0.79      0.97      0.87       751
           1       0.91      0.58      0.70       441

    accuracy                           0.82      1192
   macro avg       0.85      0.77      0.79      1192
weighted avg       0.84      0.82      0.81      1192

TESTEO
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       207
           1       0.94      0.66      0.77        91

    accuracy                           0.88       298
   macro avg       0.90      0.82      0.85       298
weighted avg       0.89      0.88      0.88       298

