In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

In [2]:
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# DATA

In [3]:
# Load the labelled training split and the unlabelled test split, in that order.
train_df, test_df = map(
    pd.read_csv,
    (
        "../data/travel_insurance_prediction_train.csv",
        "../data/travel_insurance_prediction_test.csv",
    ),
)

In [19]:
# Preprocessing: bin the two continuous features into 5 equal-width ordinal
# buckets and one-hot encode the categorical ones.
transformer = make_column_transformer(
    # TODO: experiment with other binning strategies besides "uniform".
    (KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform"), ["Age", "AnnualIncome"]),
    # handle_unknown="ignore": categories not seen during fit are ignored at transform time.
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer", "EverTravelledAbroad"]),
    # remainder="passthrough": every column not listed above is forwarded untouched.
    remainder="passthrough")

# The data for training the model
X_train = transformer.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance"]))
y_train = train_df["TravelInsurance"].values

# The test data is only for generating the submission
X_test = transformer.transform(test_df.drop(columns=["Customer"]))

# Hyper-parameter grid explored for the decision tree.
search_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 5],
    'max_depth': [3, 6, 10]
}
tree = DecisionTreeClassifier(random_state=42)
# Try different seeds to check that the model behaves consistently.
tree_clf = GridSearchCV(tree, search_params, cv=5, scoring='f1', n_jobs=-1)
tree_clf.fit(X_train, y_train)

best_tree_clf = tree_clf.best_estimator_

# In-sample report: the model is scored on the same rows it was fitted on,
# so these figures are optimistic.
print(classification_report(y_train, best_tree_clf.predict(X_train)))

# Held-out estimate: mean F1 over the 5 CV folds for the winning parameter
# combination, which is the figure to use when comparing configurations.
print("Best params:", tree_clf.best_params_)
print("Mean cross-validated F1:", round(tree_clf.best_score_, 3))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       958
           1       0.86      0.63      0.73       532

    accuracy                           0.83      1490
   macro avg       0.84      0.79      0.80      1490
weighted avg       0.84      0.83      0.83      1490



---

In [20]:
# Same preprocessing layout as the uniform-binning transformer, but the bin
# edges for the continuous features are chosen with the "kmeans" strategy.
transformer_kmeans = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="kmeans"), ["Age", "AnnualIncome"]),
    # handle_unknown="ignore": categories not seen during fit are ignored at transform time.
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer", "EverTravelledAbroad"]),
    # remainder="passthrough": every column not listed above is forwarded untouched.
    remainder="passthrough")

# NOTE: this cell rebinds X_train, X_test, tree_clf and best_tree_clf, so any
# values left in the kernel by an earlier cell are replaced.

# The data for training the model
X_train = transformer_kmeans.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance"]))
y_train = train_df["TravelInsurance"].values

# The test data is only for generating the submission
X_test = transformer_kmeans.transform(test_df.drop(columns=["Customer"]))

# Hyper-parameter grid explored for the decision tree.
search_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 5],
    'max_depth': [3, 6, 10]
}
tree = DecisionTreeClassifier(random_state=42)
# Try different seeds to check that the model behaves consistently.
tree_clf = GridSearchCV(tree, search_params, cv=5, scoring='f1', n_jobs=-1)
tree_clf.fit(X_train, y_train)

best_tree_clf = tree_clf.best_estimator_

# In-sample report: the model is scored on the same rows it was fitted on,
# so these figures are optimistic.
print(classification_report(y_train, best_tree_clf.predict(X_train)))

# Held-out estimate: mean F1 over the 5 CV folds for the winning parameter
# combination, which is the figure to use when comparing configurations.
print("Best params:", tree_clf.best_params_)
print("Mean cross-validated F1:", round(tree_clf.best_score_, 3))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       958
           1       0.88      0.60      0.71       532

    accuracy                           0.83      1490
   macro avg       0.85      0.78      0.79      1490
weighted avg       0.84      0.83      0.82      1490



---

In [17]:
from sklearn.model_selection import KFold

In [18]:
# Shuffled 4-fold splitter with a fixed seed so the fold assignment is repeatable.
kf = KFold(
    n_splits=4,
    shuffle=True,
    random_state=0,
)

# OUTPUT

In [13]:
# Predicted labels for the transformed test features, from the currently selected tree.
test_pred = best_tree_clf.predict(X_test)
# Customer identifiers, kept aside to label each prediction in the submission.
test_id = test_df["Customer"]

In [14]:
# Pair every customer id with its predicted label, one (id, label) tuple per row.
submission_rows = [(customer, label) for customer, label in zip(test_id, test_pred)]
submission = pd.DataFrame(
    submission_rows,
    columns=["Customer", "TravelInsurance"],
)

In [15]:
# Preview the first five rows of the submission frame.
submission.head(n=5)

Unnamed: 0,Customer,TravelInsurance
0,1491,0
1,1492,0
2,1493,1
3,1494,1
4,1495,1


In [16]:
# Write the submission with a header row and without the DataFrame index column.
submission.to_csv(
    "../data/travel_insurance_submission-try-2.csv",
    index=False,
    header=True,
)