In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data. Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# The training file calls the label 'CARAVAN'; rename it to 'Target' while loading
# so the training frame uses the same label name as the target file.
df_td = pd.read_csv('tic_2000_train_data.csv').rename(columns={'CARAVAN': 'Target'})
eval1 = pd.read_csv('tic_2000_eval_data.csv')
target = pd.read_csv('tic_2000_target_data.csv')  # the label is already named 'Target' in this file

In [3]:
# Use SweetViz to get an overview of the data and decide where to investigate further.
# Open the report in a full-width window, otherwise the right-hand side is cut off.
import sweetviz as sv
# Report configuration.
# Early attempts automatically categorized MOSTYPE and PWAPART as numerical rather than categorical.
# NOTE(review): only MOSTYPE is forced to categorical below; PWAPART is not - confirm this is intended.
cfg_1 = sv.FeatureConfig(force_cat=['MOSTYPE'])

report_combined = sv.analyze([df_td, "Combined"], target_feat = "Target", feat_cfg=cfg_1)
report_combined.show_html("Report_Combined.html")

:FEATURES DONE:                    |█████████████████████| [100%]   00:14  -> (00:00 left)
:PAIRWISE DONE:                    |█████████████████████| [100%]   00:28  -> (00:00 left)


Creating Associations graph... DONE!


## First Thoughts on Data Above:
According to the insurance challenge description, the data is split into two sections: demographics (features 0-42) and policy ownership (features 43-86). The latter includes our target. Additionally, the features are based on zip codes, which are unknown.

To break down what is in the report created above:

#### 1) The target variable is heavily imbalanced: there are only 348 instances of a Caravan policy. This will need to be addressed in order to prevent the models from overfitting.
To answer the question of "Who would most likely purchase a Caravan policy?", I will create a dummy table to better isolate which features have a relationship with the target variable.

In [None]:
# One-hot encode every feature column so individual feature levels can be isolated.
# 'Target' is excluded from the encoded columns and carried through unchanged.
pre_enc = df_td.drop(columns='Target')
post_enc_df = pd.get_dummies(df_td, columns=pre_enc.columns, prefix_sep="_")

In [None]:
# Quick feature screening: fit a cross-validated Lasso on the encoded frame so that
# unimportant features can be eliminated based on their coefficients.

from sklearn.linear_model import LassoCV, Lasso

X = post_enc_df.drop(columns='Target')
y = post_enc_df['Target']

# fit() returns the fitted estimator, so construction and fitting are chained.
reg = LassoCV(max_iter=100000, tol=0.00001).fit(X, y)

# Note: the score below is evaluated on the same X, y the model was fitted on.
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" % reg.score(X, y))

# Coefficients labelled with their feature (dummy column) names.
coef = pd.Series(reg.coef_, index=X.columns)

In [None]:
# Report how many dummy features Lasso kept (non-zero coefficient) versus zeroed out.
n_kept = int((coef != 0).sum())
n_dropped = int((coef == 0).sum())
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")

# Sort so the most negative coefficients come first and the most positive last.
imp_coef = coef.sort_values()

# Pass the (very tall) size to this plot only. Setting plt.rcParams['figure.figsize']
# here would change the default for every figure drawn later in the notebook.
ax = imp_coef.plot(kind="barh", figsize=(10, 100))
ax.set_title("Feature importance using Lasso Model");

In [None]:
# Lift the row-display cap so the whole list of surviving features is shown.
pd.set_option('display.max_rows', None)
# Keep only the features whose Lasso coefficient is non-zero (still sorted by coefficient).
kept_labels = imp_coef.loc[imp_coef.ne(0)]
kept_labels

In [None]:
# Names of the features Lasso kept (the Series index holds the dummy column names).
kept_labels.index

In [None]:
# Using LassoCV, we now have our new features: every dummy column with a non-zero
# Lasso coefficient, plus the label.
# The column list is taken from `kept_labels` instead of being pasted in by hand, so it
# cannot go stale (or raise a KeyError) when the Lasso fit or the encoding changes on re-run.
selected_columns = [*kept_labels.index, 'Target']
new_df = post_enc_df[selected_columns]

In [None]:
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, plot_roc_curve, classification_report, balanced_accuracy_score, coverage_error
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer
import matplotlib.pyplot as plt


def clf_comp(df, random_state=42):
    """Fit several classifiers on ``df`` and print and plot how they compare.

    The frame is split 80/20 into a training and a test part. Each classifier
    is cross-validated on the training part (5 folds, macro F1), refitted on
    the whole training part, and then scored on the held-out test part. One
    ROC curve per classifier is drawn on a single shared set of axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Target`` column holding the label; every other
        column is used as a feature. ``f1_score`` is called with its default
        settings, so the label is assumed to be binary.
    random_state : int, optional
        Seed for the train/test split and for the classifiers that accept
        one, so repeated calls print the same numbers. Defaults to 42, the
        split seed that was previously hard-coded.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    classifiers = [
        ComplementNB(),
        MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, random_state=random_state),
        KNeighborsClassifier(n_neighbors=5),
        RandomForestClassifier(random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        GaussianProcessClassifier(random_state=random_state),
    ]

    # Separate the features from the target.
    X = df.drop('Target', axis=1)
    y = df.Target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Set the size BEFORE the figure exists so it actually applies to the ROC plot,
    # then create one figure up front that all classifiers share.
    plt.rcParams['figure.figsize'] = (10, 10)
    _, ax = plt.subplots()

    for classifier in classifiers:
        # Cross-validation clones the estimator, so it does not affect the fit below.
        scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1_macro')
        model = classifier.fit(X_train, y_train)
        predictions = model.predict(X_test)

        print(classifier)
        print('The cross-validated macro F1 Score is', scores.mean())
        # scikit-learn metrics take (y_true, y_pred), in that order.
        print('The Training F1 Score is', f1_score(y_train, model.predict(X_train)))
        print('The Testing F1 Score is', f1_score(y_test, predictions))
        print("accuracy score\n", accuracy_score(y_test, predictions))
        print("balanced_accuracy_score\n", balanced_accuracy_score(y_test, predictions))
        print("model confusion matrix\n", confusion_matrix(y_test, predictions, normalize='all'))
        print("classification_report\n", classification_report(y_test, predictions), '\n')

        # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
        # RocCurveDisplay.from_estimator; it is kept to match this notebook's imports.
        plot_roc_curve(model, X_test, y_test, ax=ax, alpha=0.9)

In [None]:
# Compare the classifiers on the full one-hot encoded frame
# (all dummy columns, not the Lasso-selected subset in new_df).
clf_comp(post_enc_df)