In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

# DATA

In [2]:
train_df = pd.read_csv("../data/travel_insurance_prediction_train.csv")
test_df = pd.read_csv("../data/travel_insurance_prediction_test.csv")

In [3]:
X_cat = train_df.drop(["Customer","TravelInsurance"],axis=1).select_dtypes(include='object').columns
X_num = train_df.drop(["Customer","TravelInsurance"],axis=1).select_dtypes(include='int64').columns
X_cat.tolist(), X_num.tolist()

(['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad'],
 ['Age', 'AnnualIncome', 'FamilyMembers', 'ChronicDiseases'])

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
encoder = OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore")
transformer = make_column_transformer((encoder, X_cat),remainder="passthrough")
train, validation = train_test_split(train_df)
X = transformer.fit_transform(train.drop(columns=["Customer", "TravelInsurance"]))
y = train["TravelInsurance"].values
X_test = transformer.transform(test_df.drop(columns=["Customer"]))

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=2, stratify=y)

In [9]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf')
svc.fit(X_train,y_train)

SVC()

In [10]:
y_pred=svc.predict(X_test)

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_train, svc.predict(X_train)))
print(classification_report(y_val, svc.predict(X_val)))

              precision    recall  f1-score   support

           0       0.78      0.93      0.85       538
           1       0.81      0.52      0.64       299

    accuracy                           0.79       837
   macro avg       0.80      0.73      0.74       837
weighted avg       0.79      0.79      0.77       837

              precision    recall  f1-score   support

           0       0.77      0.93      0.84       180
           1       0.79      0.50      0.61       100

    accuracy                           0.78       280
   macro avg       0.78      0.71      0.73       280
weighted avg       0.78      0.78      0.76       280



In [14]:
from sklearn.model_selection import GridSearchCV
param_grid={
    'C': np.arange(1e-05, 3, 0.1),
    'gamma': ['scale', 'auto']
}
grid=GridSearchCV(SVC(), param_grid=param_grid, cv=10)

In [15]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': array([1.00000e-05, 1.00010e-01, 2.00010e-01, 3.00010e-01, 4.00010e-01,
       5.00010e-01, 6.00010e-01, 7.00010e-01, 8.00010e-01, 9.00010e-01,
       1.00001e+00, 1.10001e+00, 1.20001e+00, 1.30001e+00, 1.40001e+00,
       1.50001e+00, 1.60001e+00, 1.70001e+00, 1.80001e+00, 1.90001e+00,
       2.00001e+00, 2.10001e+00, 2.20001e+00, 2.30001e+00, 2.40001e+00,
       2.50001e+00, 2.60001e+00, 2.70001e+00, 2.80001e+00, 2.90001e+00]),
                         'gamma': ['scale', 'auto']})

In [16]:
print('Best Parameters : ',grid.best_params_)
print('Best score Through Grid Search : %.3f'%grid.best_score_)

Best Parameters :  {'C': 0.50001, 'gamma': 'auto'}
Best score Through Grid Search : 0.804


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_train, grid.predict(X_train)))
print(classification_report(y_val, grid.predict(X_val)))

              precision    recall  f1-score   support

           0       0.78      0.98      0.87       538
           1       0.94      0.52      0.67       299

    accuracy                           0.82       837
   macro avg       0.86      0.75      0.77       837
weighted avg       0.84      0.82      0.80       837

              precision    recall  f1-score   support

           0       0.78      0.98      0.87       180
           1       0.94      0.49      0.64       100

    accuracy                           0.81       280
   macro avg       0.86      0.74      0.76       280
weighted avg       0.84      0.81      0.79       280



# OUTPUT

In [None]:
test_id = test_df["Customer"]
test_pred = best_tree_clf.predict(X_test)

In [None]:
submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])

In [None]:
submission.head()

In [None]:
submission.to_csv("../data/travel_insurance_submission-try-2.csv", header=True, index=False)