In [71]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

# DATA

Split df

In [72]:
train_df = pd.read_csv("../data/travel_insurance_prediction_train.csv")
test_df = pd.read_csv("../data/travel_insurance_prediction_test.csv")

In [73]:
from sklearn.model_selection import train_test_split
train,validation = train_test_split(train_df)
X_train, y_train = train.drop(["Customer","TravelInsurance"],axis=1),train["TravelInsurance"]
X_test = test_df.drop(["Customer"],axis=1)
X_val, y_val = validation.drop(["Customer","TravelInsurance"],axis=1),validation["TravelInsurance"]

In [74]:
X_cat = X_train.select_dtypes(include='object').columns
X_num = X_train.select_dtypes(include='int64').columns

In [102]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, Normalizer, QuantileTransformer, RobustScaler,\
PowerTransformer, KBinsDiscretizer, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [76]:
transformer = make_column_transformer(
    (KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile"), ["Age", "AnnualIncome"]),
    (OneHotEncoder(categories="auto", dtype="int", handle_unknown="ignore"),
     ["Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer", "EverTravelledAbroad"]),
    remainder="passthrough")

In [77]:
X_train_transform = transformer.fit_transform(X_train)
X_test_transform = transformer.transform(X_test)
X_val_transform = transformer.transform(X_val)

In [112]:
X_test_transform

array([[2., 3., 0., ..., 1., 0., 0.],
       [2., 1., 0., ..., 1., 0., 1.],
       [3., 4., 1., ..., 0., 1., 0.],
       ...,
       [2., 2., 0., ..., 1., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 2., 1., ..., 1., 0., 0.]])

In [78]:
search_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 5],
    'max_depth': [3, 6, 10]
}
tree = DecisionTreeClassifier()
# probar con diferentes semillas para verificar que el modelo funciona bien
tree_clf = GridSearchCV(tree, search_params, cv=5, scoring='f1', n_jobs=-1)
tree_clf.fit(X_train_transform, y_train)

best_tree_clf = tree_clf.best_estimator_
print(classification_report(y_train, best_tree_clf.predict(X_train_transform)))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89       720
           1       0.87      0.66      0.75       397

    accuracy                           0.85      1117
   macro avg       0.85      0.80      0.82      1117
weighted avg       0.85      0.85      0.84      1117



In [81]:
model = LinearRegression()
model.fit(X_train_transform,y_train)

LinearRegression()

In [99]:
y_val_pred = model.predict(X_val_transform)

In [100]:
print("Mean squared error: %.2f" % mean_squared_error(y_val, y_val_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_val, y_val_pred))

Mean squared error: 0.17
Variance score: 0.26


In [109]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf')
svc.fit(X_train_transform,y_train)
y_val_pred=svc.predict(X_val)

ValueError: could not convert string to float: 'Private Sector/Self Employed'

# OUTPUT

In [None]:
test_id = test_df["Customer"]
test_pred = best_tree_clf.predict(X_test)

In [None]:
submission = pd.DataFrame(list(zip(test_id, test_pred)), columns=["Customer", "TravelInsurance"])

In [None]:
submission.head()

In [None]:
submission.to_csv("../data/travel_insurance_submission-try-2.csv", header=True, index=False)