In [9]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [47]:
train_df = pd.read_csv("../data/travel_insurance_prediction_train.csv")
test_df = pd.read_csv("../data/travel_insurance_prediction_test.csv")

In [11]:
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

### Transform the columns into features

First we need to transform the columns into features. The type of features we use will have a direct impact on the final result. In this example we decided to discretize some numeric features and make a one hot encoding of others. The number of bins, what we use as a one hot encoding, etc, is all up to you to try it out.

In [48]:
transformer = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"), ["Age"]),
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer", "EverTravelledAbroad"]),
    (MinMaxScaler(),["AnnualIncome"]),
    remainder="passthrough")

In [61]:
X = transformer.fit_transform(train_df.drop(columns=["Customer", "TravelInsurance"]))
y = train_df["TravelInsurance"].values
X_test_transform = transformer.transform(test_df.drop(columns=["Customer"]))

In [14]:
def model_fit(model,X_train,y_train):
    d=model
    d.fit(X_train,y_train)
    y_pred=d.predict(X_trian)
    print(classification_report(y_train,y_pred))
    print(f1_score(y_train,y_pred))
    return y_pred

In [15]:
def split(X,y):
    return train_test_split(X,y,test_size=0.2,stratify=y)

In [16]:
def balancer(X,y):
    over = SMOTE()
    return over.fit_resample(X, y)

In [50]:
X_train,X_test,y_train,y_test = split(X,y)

In [51]:
X_bal,y_bal = balancer(X_train,y_train)

In [52]:
X_bal_all, y_bal_all = balancer(X,y)

In [53]:
# KNeighborsClassifier con X, y
params = {
    'n_neighbors': [*range(1,20,1)],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [*range(1,60,5)],
    'p': [1,2]
}
cv = RandomizedSearchCV(
    KNeighborsClassifier(),
    params,
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42)
cv.fit(X, y)
display(cv.best_estimator_)
print(classification_report(y, cv.best_estimator_.predict(X)))

KNeighborsClassifier(leaf_size=21, n_neighbors=7)

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       958
           1       0.86      0.63      0.73       532

    accuracy                           0.83      1490
   macro avg       0.84      0.79      0.80      1490
weighted avg       0.83      0.83      0.82      1490



In [54]:
# KNeighborsClassifier con X_bal, y
params = {
    'n_estimators': [*range(1,500,5)],
    'criterion': ['gini','entropy']
}
cv = RandomizedSearchCV(
    RandomForestClassifier(),
    params,
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42)
cv.fit(X, y)
display(cv.best_estimator_)
print(classification_report(y, cv.best_estimator_.predict(X)))

RandomForestClassifier(criterion='entropy', n_estimators=371)

              precision    recall  f1-score   support

           0       0.92      0.97      0.94       958
           1       0.94      0.84      0.89       532

    accuracy                           0.93      1490
   macro avg       0.93      0.91      0.92      1490
weighted avg       0.93      0.93      0.92      1490



In [None]:
# DecisionTreeClassifier con X_bal, y
params = {
    'n_estimators': [*range(1,500,5)],
    'criterion': ['gini','entropy']
}
cv = RandomizedSearchCV(
    DecisionTreeClassifier(),
    params,
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42)
cv.fit(X, y)
display(cv.best_estimator_)
print(classification_report(y, cv.best_estimator_.predict(X)))

In [62]:
test_id = test_df["Customer"]
test_pred = cv.best_estimator_.predict(X_test_transform)

submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])
submission.to_csv("../data/travel_insurance_submission.csv", header=True, index=False)

In [63]:
submission

Unnamed: 0,Customer,TravelInsurance
0,1491,0
1,1492,0
2,1493,1
3,1494,1
4,1495,1
...,...,...
492,1983,1
493,1984,0
494,1985,0
495,1986,1


In [None]:
search_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 5],
    'max_depth': [3, 6, 10]
}
tree = DecisionTreeClassifier(random_state=42)
tree_clf = GridSearchCV(tree, search_params, cv=5, scoring='f1', n_jobs=-1)
tree_clf.fit(X_train, y_train)

best_tree_clf = tree_clf.best_estimator_

In [None]:
print(classification_report(y_train, best_tree_clf.predict(X_train)))

## Generate the output

The last thing we do is generating a file that should be *submitted* on kaggle

In [None]:
test_id = test_df["Customer"]
test_pred = best_tree_clf.predict(X_test)

submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])
submission.to_csv("../data/travel_insurance_submission.csv", header=True, index=False)