# Diplodatos Kaggle Competition

## Try 2

Dividimos en train y validation. Se utiliza el algoritmo GradientBoostingClassifier con los hiperparámetros por defecto. Se realiza una transformación de las variables categóricas booleanas a 0 y 1. Borramos la columna Employment Type porque tiene poca correlación con el target. (Nota: en el código de este notebook la columna `Employment Type` no se elimina, sino que se codifica con `pd.get_dummies`.)

Realizamos oversampling usando SMOTE para tener más entradas con la clase positiva.

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier

## Read Data

In [2]:
# Load the training split and the test split that predictions are made for.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/travel_insurance_prediction_train.csv",
        "../data/travel_insurance_prediction_test.csv",
    )
)

In [3]:
# Hard-encode Yes/No features as 1/0.
def encoding(df, feature):
    """Replace 'Yes'/'No' in ``df[feature]`` with 1/0, modifying ``df`` in place.

    Values other than 'Yes' and 'No' (including missing values) are left
    untouched, so repeating the call on an already-encoded column is harmless.

    ``Series.map`` is used instead of ``Series.replace`` because ``replace``
    relied on silently downcasting the object column to integers, which pandas
    deprecated in 2.2.

    Args:
        df: DataFrame to modify.
        feature: Name of the column to encode.

    Returns:
        None. ``df`` is modified in place.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    # Fall back to the original value so unknown entries are never turned into NaN.
    df[feature] = df[feature].map(lambda value: yes_no_codes.get(value, value))

In [4]:
# Apply the Yes/No -> 1/0 encoding to the same three columns of both splits
# (all training columns first, then all test columns).
for frame in (train_df, test_df):
    for column in ('GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad'):
        encoding(frame, column)

In [5]:
# Display the training frame after the Yes/No columns have been encoded.
train_df

Unnamed: 0,Customer,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,1,33,Private Sector/Self Employed,1,550000,6,0,0,0,1
1,2,28,Private Sector/Self Employed,1,800000,7,0,1,0,0
2,3,31,Private Sector/Self Employed,1,1250000,4,0,0,0,0
3,4,31,Government Sector,0,300000,7,0,0,0,0
4,5,28,Private Sector/Self Employed,1,1250000,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1485,1486,31,Government Sector,0,300000,5,0,0,0,0
1486,1487,31,Private Sector/Self Employed,1,950000,3,0,1,0,0
1487,1488,28,Private Sector/Self Employed,1,1250000,5,0,0,0,0
1488,1489,31,Government Sector,1,1300000,5,0,0,0,0


In [6]:
# One-hot encode 'Employment Type'. The category levels are taken from the
# training split and applied to BOTH frames, so train and test always end up
# with identical dummy columns even if a level is absent from one of them.
# Encoding each frame independently would yield mismatched feature columns.
employment_levels = sorted(train_df['Employment Type'].dropna().unique())
train_df['Employment Type'] = pd.Categorical(
    train_df['Employment Type'], categories=employment_levels
)
# Test values never seen in training become missing and get all-zero dummies.
test_df['Employment Type'] = pd.Categorical(
    test_df['Employment Type'], categories=employment_levels
)
train_df = pd.get_dummies(train_df, columns=['Employment Type'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Employment Type'], drop_first=True)

In [7]:
# Separate the target from the predictors.
# NOTE(review): the 'Customer' identifier stays in X and is therefore used as a
# feature (and resampled by SMOTE) -- confirm this is intended.
target_column = 'TravelInsurance'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

In [8]:
# Display the training frame after 'Employment Type' has been one-hot encoded.
train_df

Unnamed: 0,Customer,Age,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance,Employment Type_Private Sector/Self Employed
0,1,33,1,550000,6,0,0,0,1,1
1,2,28,1,800000,7,0,1,0,0,1
2,3,31,1,1250000,4,0,0,0,0,1
3,4,31,0,300000,7,0,0,0,0,0
4,5,28,1,1250000,3,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1485,1486,31,0,300000,5,0,0,0,0,0
1486,1487,31,1,950000,3,0,1,0,0,1
1487,1488,28,1,1250000,5,0,0,0,0,1
1488,1489,31,1,1300000,5,0,0,0,0,0


In [9]:
# Show the number of rows per target class to see how imbalanced the training data is.
np.unique(train_df['TravelInsurance'],return_counts=True)

(array([0, 1]), array([958, 532]))

In [10]:
# Use SMOTE to handle the class imbalance by oversampling the minority class.
# A fixed seed makes the synthetic samples (and anything trained on them)
# reproducible between runs.
over = SMOTE(random_state=42)
# Resample the training data only; the test data is left unsampled.
X_bal, y_bal = over.fit_resample(X, y)

In [11]:
# Show the number of rows per target class after resampling.
np.unique(y_bal,return_counts=True)

(array([0, 1]), array([958, 958]))

In [12]:
# Function to fit baseline models.
def model_fit(model, X_train=None, y_train=None, X_test=None):
    """Fit ``model`` and return its predictions for the test features.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features. Defaults to the notebook-level ``X_bal``.
        y_train: Training labels. Defaults to the notebook-level ``y_bal``.
        X_test: Features to predict on. Defaults to the notebook-level
            ``test_df``.

    Returns:
        Whatever ``model.predict`` returns for ``X_test``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X_train is None:
        X_train = X_bal
    if y_train is None:
        y_train = y_bal
    if X_test is None:
        X_test = test_df
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [13]:
# Fit a GradientBoostingClassifier with default hyperparameters. Note that
# `gbc` holds the predictions returned by model_fit, not the fitted model.
# The seed is fixed so the model is deterministic given its training data.
gbc = model_fit(GradientBoostingClassifier(random_state=42))


In [14]:
# Build the submission: one row per test customer with its predicted label.
test_id = test_df["Customer"]
test_pred = gbc

# Construct the frame column-wise. Unlike list(zip(...)), this raises if the
# ids and predictions ever differ in length instead of silently truncating.
submission = pd.DataFrame({
    "Customer": test_id.to_numpy(),
    "TravelInsurance": np.asarray(test_pred),
})
submission.to_csv("../data/travel_insurance_submission.csv", header=True, index=False)