In [15]:
import os
from IPython.display import Image

import pandas as pd
import numpy as np
import itertools
import timeit
import time

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

from sklearn import tree
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Display all columns
pd.set_option('display.max_columns', None)
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

from collections import Counter
from numpy import where
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from numpy import mean
from sklearn.decomposition import FastICA, PCA
from scipy.stats import kurtosis 
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from sklearn.metrics import v_measure_score, adjusted_mutual_info_score, homogeneity_score


In [2]:
df_insurance = pd.read_csv('data/insurance/train.csv', nrows=100000)

assert df_insurance.shape[0] > 0
assert df_insurance.shape[1] > 0

print(f"Data has {df_insurance.shape[0]} rows and {df_insurance.shape[1]} columns.")

def encode_labels(df):
    for column in df.columns:
        if df[column].dtype=='object':
            label = LabelEncoder()
            label.fit(list(df[column].values))
            df[column] = label.transform(list(df[column].values))
    return df

df_insurance = df_insurance.drop(['QuoteNumber'], axis=1)

# Now convert the date to day, month and week and drop the date
df_insurance['Date'] = pd.to_datetime(pd.Series(df_insurance['Original_Quote_Date']))
df_insurance = df_insurance.drop('Original_Quote_Date', axis=1)
df_insurance['Year'] = df_insurance['Date'].apply(lambda x: int(str(x)[:4]))
df_insurance['Month'] = df_insurance['Date'].apply(lambda x: int(str(x)[5:7]))
df_insurance['weekday'] = df_insurance['Date'].dt.dayofweek
df_insurance = df_insurance.drop('Date', axis=1)
df_insurance = encode_labels(df_insurance)
df_insurance = df_insurance.fillna(-1)

insurance_X = df_insurance.loc[:, df_insurance.columns != 'QuoteConversion_Flag']
insurance_Y = df_insurance['QuoteConversion_Flag']

Data has 100000 rows and 299 columns.


In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 


over = SMOTE(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under)]

insurance_X = df_insurance.loc[:, df_insurance.columns != 'QuoteConversion_Flag']
insurance_Y = df_insurance['QuoteConversion_Flag']
insurance_X_resample, insurance_Y_resample = Pipeline(steps=steps).fit_resample(insurance_X, insurance_Y)

insurance_X_resample_scaled = StandardScaler().fit_transform(insurance_X_resample) 

insurance_X_train, insurance_X_test, insurance_y_train, insurance_y_test = train_test_split(np.array(insurance_X_resample),np.array(insurance_Y_resample), test_size=0.15)


In [12]:
# KMeans = 2
pca = PCA(n_components= 2).fit(insurance_X_resample_scaled)
pca_insurance_X_resample_scaled = pca.transform(insurance_X_resample_scaled)
start = time.time()
kmeans=KMeans(2).fit(pca_insurance_X_resample_scaled)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
kmeans.predict(pca_insurance_X_resample_scaled)
query_time = time.time() - start
print("Query time: " + str(query_time))
y_pred = kmeans.predict(pca_insurance_X_resample_scaled)

result = pd.concat([pd.DataFrame(pca_insurance_X_resample_scaled), pd.DataFrame(y_pred)], axis=1, sort=False)
result.columns = [0, 1, 2]

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(result, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))


Train time: 0.07094192504882812
Query time: 0.0024411678314208984
Cross validation score: 0.6190145641368027
Train time: 4.496754884719849
Query time: 0.003148794174194336
Test ROC AUC: 0.5


In [13]:
pca = PCA(n_components= 2).fit(insurance_X_resample_scaled)
pca_insurance_X_resample_scaled = pca.transform(insurance_X_resample_scaled)
start = time.time()
gmm=GaussianMixture(2).fit(pca_insurance_X_resample_scaled)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
gmm.predict(pca_insurance_X_resample_scaled)
query_time = time.time() - start
print("Query time: " + str(query_time))
y_pred = gmm.predict(pca_insurance_X_resample_scaled)

result = pd.concat([pd.DataFrame(pca_insurance_X_resample_scaled), pd.DataFrame(y_pred)], axis=1, sort=False)
result.columns = [0, 1, 2]

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(result, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))


Train time: 0.09236001968383789
Query time: 0.008335113525390625
Cross validation score: 0.6182566988967348
Train time: 5.659135341644287
Query time: 0.0028429031372070312
Test ROC AUC: 0.5


In [16]:
# KMeans = 2
pca = PCA(n_components= 2).fit(insurance_X_resample_scaled)
pca_insurance_X_resample_scaled = pca.transform(insurance_X_resample_scaled)
start = time.time()
kmeans=KMeans(2).fit(pca_insurance_X_resample_scaled)
labels = kmeans.predict(pca_insurance_X_resample_scaled)

print(v_measure_score(insurance_Y_resample, labels))
print(adjusted_mutual_info_score(insurance_Y_resample, labels))
print(homogeneity_score(insurance_Y_resample, labels))


0.002377346718288729
0.0023668088106171548
0.002420757622846224


In [17]:
# Gaussian = 2
pca = PCA(n_components= 2).fit(insurance_X_resample_scaled)
pca_insurance_X_resample_scaled = pca.transform(insurance_X_resample_scaled)
start = time.time()
gmm=GaussianMixture(2).fit(pca_insurance_X_resample_scaled)
labels = gmm.predict(pca_insurance_X_resample_scaled)

print(v_measure_score(insurance_Y_resample, labels))
print(adjusted_mutual_info_score(insurance_Y_resample, labels))
print(homogeneity_score(insurance_Y_resample, labels))


0.0021340408920460375
0.002123487903120226
0.0021704330106454756
