In [1]:
import numpy as np
import sklearn
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 100
%matplotlib inline

In [2]:
def transform(df, default_vh_value=18659):
    """Clean the raw policy table and derive the modelling features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw policy data. It is not modified; all work happens on a copy.
    default_vh_value : number, optional
        Replacement for vehicles whose ``vh_value`` is recorded as 0.
        Defaults to 18659, the constant originally chosen "from data exploration".

    Returns
    -------
    y1 : pandas.Series
        ``made_claim`` for the rows that survive the plausibility filters.
    y2 : pandas.Series
        ``claim_amount`` for the same rows.
    df2 : pandas.DataFrame
        Feature frame: identifier, raw categorical and target columns removed,
        dummy / binary / ordinal encodings added. The original row labels are
        kept, so the index has gaps wherever rows were dropped.

    Raises
    ------
    KeyError
        If an expected column is missing or ``pol_coverage`` holds a level that
        is not one of Mini / Median2 / Median1 / Maxi.
    """
    df2 = df.copy()

    # Town-level fields are missing for some policies; for now assume those
    # policies come from the most frequent town (mode imputation).
    town_columns = ['commune_code', 'canton_code', 'city_district_code',
                    'regional_department_code', 'population',
                    'town_mean_altitude', 'town_surface_area']
    for column in town_columns:
        most_frequent = df2[column].mode()
        if not most_frequent.empty:  # an all-NaN column has no mode; leave it as is
            df2[column] = df2[column].fillna(most_frequent.iloc[0])

    # A weight of 0 marks a missing value. The median is taken over the recorded
    # (non-zero) weights only, so the missing markers do not bias the imputed value.
    missing_weight = df2['vh_weight'] == 0
    recorded_weight = df2.loc[~missing_weight, 'vh_weight']
    if missing_weight.any() and recorded_weight.notna().any():
        df2.loc[missing_weight, 'vh_weight'] = recorded_weight.median()

    # A vehicle value of 0 is likewise treated as missing.
    df2.loc[df2['vh_value'] == 0, 'vh_value'] = default_vh_value

    # Drop illogical rows: licence held longer than the driver's age, or a
    # non-positive cylinder size. The explicit copy keeps later assignments
    # from writing into a slice of the previous frame.
    plausible = (df2['drv_age1'] >= df2['drv_age_lic1']) & (df2['vh_cyl'] > 0)
    df2 = df2.loc[plausible].copy()

    # 'AllTrips' is rare and, per the data description, similar to professional use.
    df2.loc[df2['pol_usage'] == 'AllTrips', 'pol_usage'] = 'Professional'

    # One-hot encode the categorical policy features (column order: coverage,
    # usage, payment frequency).
    df2 = pd.concat(
        [df2,
         pd.get_dummies(df2['pol_coverage']),
         pd.get_dummies(df2['pol_usage']),
         pd.get_dummies(df2['pol_pay_freq'])],
        axis=1,
    )

    # Binary indicators.
    df2['pol_payd'] = (df2['pol_payd'] == 'Yes') * 1   # Yes / No
    df2['drv_drv2'] = (df2['drv_drv2'] == 'Yes') * 1   # Yes / No
    df2['tourism'] = (df2['vh_type'] == 'Tourism') * 1  # tourism vs. everything else
    df2['diesel'] = (df2['vh_fuel'] == 'Diesel') * 1    # diesel vs. everything else
    df2['M'] = (df2['drv_sex1'] == 'M') * 1             # male / female
    df2['pol_bonus2'] = (df2['pol_bonus'] == 0.5) * 1   # flags a bonus of exactly 0.5

    # Ordinal encoding of the coverage level, read from the filtered frame itself.
    order = {'Mini': 1, 'Median2': 2, 'Median1': 3, 'Maxi': 4}
    df2['order_pol_coverage'] = df2['pol_coverage'].apply(lambda level: order[level])

    unwantedFeatures = (
        ['id_policy', 'pol_coverage', 'pol_pay_freq', 'pol_usage', 'pol_insee_code',
         'drv_sex1', 'drv_age2', 'drv_sex2', 'drv_age_lic2', 'vh_fuel', 'vh_make',
         'vh_model', 'vh_type']
        + ['commune_code', 'canton_code', 'city_district_code', 'regional_department_code']
        + ['WorkPrivate', 'Yearly']          # one dummy dropped per encoded group
        + ['made_claim', 'claim_amount']     # targets are returned separately
    )  # + ['drv_drv2']

    y1 = df2['made_claim']
    y2 = df2['claim_amount']
    df2 = df2.drop(unwantedFeatures, axis=1)
    return y1, y2, df2



# Read the raw training table, then split it into the two targets
# (y1 = made_claim, y2 = claim_amount) and the cleaned feature frame df2.
df = pd.read_csv(filepath_or_buffer="training_data.csv")
cleaned = transform(df)
y1, y2, df2 = cleaned
df2.head(n=5)

Unnamed: 0,pol_bonus,pol_duration,pol_sit_duration,pol_payd,drv_drv2,drv_age1,drv_age_lic1,vh_age,vh_cyl,vh_din,vh_sale_begin,vh_sale_end,vh_speed,vh_value,vh_weight,town_mean_altitude,town_surface_area,population,Maxi,Median1,Median2,Mini,Professional,Retired,Biannual,Monthly,Quarterly,tourism,diesel,M,pol_bonus2,order_pol_coverage
0,0.5,36,6,0,0,77,55,15,1598,111,16,15,185,17517,1260.0,526.0,3216.0,4.8,1,0,0,0,0,1,1,0,0,1,0,1,1,4
1,0.5,15,5,0,1,52,33,12,2184,112,12,9,180,21500,1480.0,57.0,4912.0,141.3,1,0,0,0,0,0,0,0,0,1,1,1,1,4
2,0.5,16,6,0,0,52,34,20,2496,112,32,19,130,23600,2931.0,257.0,4488.0,5.3,0,0,0,1,1,0,0,1,0,0,1,1,1,1
3,0.5,11,7,0,0,67,46,12,1149,75,14,12,170,13050,930.0,109.0,1339.0,61.2,1,0,0,0,0,0,1,0,0,1,0,1,1,4
4,0.5,16,6,0,1,60,35,23,1905,93,23,18,185,17974,1035.0,24.0,1849.0,9.7,0,0,1,0,0,0,1,0,0,1,1,1,1,2


In [4]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report, r2_score, roc_auc_score,roc_curve
from sklearn.model_selection import cross_val_score
# Indicator-style inputs. 'order_pol_coverage' is an ordinal 1-4 code rather than a
# true boolean, but it is kept in this list so that it is not z-scored.
# Left out on purpose: 'Maxi', 'Median1', 'Median2', 'Mini', 'drv_drv2', 'vh_make',
# 'vh_model', 'canton_code', 'commune_code', 'city_district_code',
# 'regional_department_code', 'Biannual', 'Monthly', 'Quarterly'.
booleanFeatures = [
    'pol_payd',
    'Professional',
    'Retired',
    'M',
    'diesel',
    'order_pol_coverage',
    'tourism',
    'pol_bonus2',
]

# Continuous inputs that get standardised downstream.
# Left out on purpose: 'order_pol_coverage', 'town_surface_area', 'town_mean_altitude'.
numericFeatures = [
    'pol_duration',
    'vh_din',
    'pol_bonus',
    'population',
    'pol_sit_duration',
    'vh_value',
    'vh_sale_begin',
    'vh_sale_end',
    'vh_cyl',
    'vh_speed',
    'drv_age1',
    'vh_weight',
    'vh_age',
    'drv_age_lic1',
]


# Preprocessing

[x] Log Transform

[x] Standard Scaling



[x] PCA or not? / Kernel PCA

Create Interactions / Polynomial Features

Quantising numerical features?

Min-Max scaling + NMF - didn't really work

LDA on boolean features - did not really work


In [5]:
#denoise vehicle age factors;
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures


def _standardize(frame, columns):
    """Return a copy of `frame` with `columns` z-scored (pandas' default sample std)."""
    scaled = frame.copy()
    block = scaled[columns]
    scaled[columns] = (block - block.mean()) / block.std()
    return scaled


# df3: numeric features standardised, ordinal coverage code rescaled by 1/4.
df3 = _standardize(df2, numericFeatures)
df3['order_pol_coverage'] = df3['order_pol_coverage'] / 4

# df4: same as df3, but selected columns are log-transformed before scaling.
logged = df2.copy()
logged['population'] = np.log(logged['population'] + 1)
# NOTE(review): plain log assumes these columns are strictly positive - TODO confirm
# for vh_din, vh_age and vh_sale_begin (transform() only repairs vh_value / vh_weight).
for column in ['vh_din', 'vh_value', 'vh_weight', 'vh_age', 'vh_sale_begin']:
    logged[column] = np.log(logged[column])
df4 = _standardize(logged, numericFeatures)
df4['order_pol_coverage'] = df4['order_pol_coverage'] / 4

# X3: boolean features plus the first 12 principal components of the scaled numerics.
pca = PCA()
numeric_components = pca.fit_transform(df3[numericFeatures])
X2 = numeric_components[:, :12]
boolean_part = df3[booleanFeatures].copy()
X3 = pd.concat([boolean_part, pd.DataFrame(X2, index=boolean_part.index)], axis=1)

# X4: scaled numeric + boolean features; X5: their degree-2 polynomial expansion (ndarray).
X4 = df3[numericFeatures + booleanFeatures].copy()
# feats = list(X4.columns)
# for i in range(len(numericFeatures)):
#     for j in range(i+1,len(numericFeatures+booleanFeatures)):
#         X4[feats[i]+"_and_"+feats[j]] = X4[feats[i]] * X4[feats[j]]
poly = PolynomialFeatures(degree=2, include_bias=False)
X5 = poly.fit_transform(X4)

In [None]:
# Severity is modelled on policies with a non-zero claim amount only. y3 is defined
# here so the cell does not depend on a later cell having been run first.
y3 = y2[y2 != 0]
# X5 is a bare ndarray. Label its rows with X4's index so that .loc picks the rows by
# policy label; a default 0..n-1 index would not match the labels in y3.index.
X6 = pd.DataFrame(X5, index=X4.index).loc[y3.index]

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
# lr = LinearRegression()
# lr.fit(X5,np.log(y2+1))
# sgd = SGDRegressor()
# sgd.fit(X5,y2)
#0.008696886053574104 - base linear regression;
#0.009767566771513803 - degree 2 features
# print(r2_score(np.log(y2+1),lr.predict(X4)))
# print(r2_score(np.log(y2+1),lr.predict(X5)))
# plt.scatter(lr.predict(X5),np.log(y2+1))
# plt.scatter(y2,np.exp(lr.predict(X4))-1)


# Fit a linear severity model on the policies with a non-zero claim amount.
y3 = y2[y2 != 0]
# Rebuild the claims-only design matrix with X4's row labels so the selected rows are
# exactly the policies in y3 (a default 0..n-1 index would not match y3.index).
X6 = pd.DataFrame(X5, index=X4.index).loc[y3.index]
lr = LinearRegression()
lr.fit(X6, y3)
# In-sample R2, scored on the same rows the model was fitted on.
print(r2_score(y3, lr.predict(X6)))

In [None]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic baseline on the scaled features; class 1 is weighted 8x.
logit_settings = dict(
    penalty='l2',
    solver='saga',
    max_iter=500,
    class_weight={0: 1, 1: 8},
    n_jobs=-1,
)
logit = LogisticRegression(**logit_settings)
logit.fit(X4, y1)


In [None]:
# In-sample evaluation: probabilities for the R2 line, 0.5-thresholded labels for the rest.
preds = logit.predict_proba(X4)[:, 1]
hard_preds = np.round(preds)
print(r2_score(y1, preds))
print(f1_score(y1, hard_preds))
print(classification_report(y1, hard_preds))
confusion_matrix(y1, hard_preds)

In [None]:
# Coefficients sorted by magnitude. The labels come from X4 because that is the frame
# the preceding cell fitted `logit` on.
coef_labels = list(X4.columns) + ['intercept']
coef_values = list(logit.coef_[0]) + list(logit.intercept_)
print(dict(sorted(zip(coef_labels, coef_values), key=lambda kv: abs(kv[1]), reverse=True)))

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Elastic-net logistic regression with 4-fold CV on F1. It is fitted on the
# log-transformed, standardised frame in numeric+boolean column order, which is
# the matrix the following evaluation cell scores.
logit = LogisticRegressionCV(
    cv=4,
    penalty='elasticnet',
    l1_ratios=[0.5],
    solver='saga',
    scoring='f1',
    max_iter=1000,
    class_weight={0: 1, 1: 8},
    n_jobs=-1,
)
logit.fit(df4[numericFeatures + booleanFeatures], y1)

In [None]:
# In-sample evaluation on the log-transformed, standardised features.
eval_X = df4[numericFeatures + booleanFeatures]
preds = logit.predict_proba(eval_X)[:, 1]
hard_preds = np.round(preds)
print(r2_score(y1, preds))
print(f1_score(y1, hard_preds))
print(classification_report(y1, hard_preds))
confusion_matrix(y1, hard_preds)

#  Logistic Regression

F1 score - 0.20665294017642946 whole dataset (logitCV); 

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score,confusion_matrix
# Elastic-net logistic regression on boolean features + 12 principal components.
logit = LogisticRegressionCV(
    cv=4,
    penalty='elasticnet',
    l1_ratios=[0.5],
    solver='saga',
    scoring='f1',
    max_iter=2000,
    class_weight={0: 1, 1: 8},
    n_jobs=-1,
)
logit.fit(X3, y1)
# print(dict(zip(list(numericFeatures)+['intercept'],list(logit.coef_[0])+list(logit.intercept_))))
train_pred = logit.predict(X3)  # in-sample labels, reused for F1 and the confusion matrix
print(f1_score(y1, train_pred))
coef_labels = list(X3.columns) + ['intercept']
coef_values = list(logit.coef_[0]) + list(logit.intercept_)
print(dict(sorted(zip(coef_labels, coef_values), key=lambda kv: abs(kv[1]), reverse=True)))
confusion_matrix(y1, train_pred)

#0.22084635378639642
# PCA - 12 components 
#booleanFeatures = ['pol_payd','Professional','Retired','M','diesel','order_pol_coverage','tourism']#+['Maxi', 'Median1', 'Median2', 'Mini'] # omitted - + ['drv_drv2','vh_make','vh_model','canton_code','commune_code',city_district_code','regional_department_code','Biannual', 'Monthly','Quarterly']
#numericFeatures = ['pol_duration', 'vh_din', 'pol_bonus', 'population', 'pol_sit_duration', 'vh_value', 'vh_sale_begin', 'vh_sale_end', 'vh_cyl', 'vh_speed', 'drv_age1', 'vh_weight', 'vh_age', 'drv_age_lic1'] #+ ['order_pol_coverage','town_surface_area','town_mean_altitude']
#{'intercept': -1.2816726233158604, 'order_pol_coverage': 0.8818957942043697, 'Professional': 0.28509350856548965, 'pol_payd': -0.199076304952475, 0: 0.1853148764034734, 'diesel': 0.17271695034768617, 10: 0.1709972763750221, 1: -0.11689737479236778, 11: 0.10776916230010722, 'tourism': 0.09403280941367201, 5: 0.0894471102207708, 'Retired': 0.08741246835566338, 9: -0.08088024876285624, 2: -0.07405054993731437, 3: 0.06053853062186241, 'M': -0.04357310622682324, 7: 0.03256260809907931, 8: -0.006274962263683822, 4: 0.005540399001789366, 6: 0.002016743325228757}
#logit = LogisticRegressionCV(cv=4,penalty='l2',solver='saga',scoring='f1',max_iter=1000,class_weight={0:1,1:9.1})

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score,confusion_matrix
# Same elastic-net model, fitted on the standardised (non-PCA) features.
scaled_X = df3[numericFeatures + booleanFeatures]
logit = LogisticRegressionCV(
    cv=4,
    penalty='elasticnet',
    l1_ratios=[0.5],
    solver='saga',
    scoring='f1',
    max_iter=2000,
    class_weight={0: 1, 1: 8},
    n_jobs=-1,
)
logit.fit(scaled_X, y1)
# print(dict(zip(list(numericFeatures)+['intercept'],list(logit.coef_[0])+list(logit.intercept_))))
train_pred = logit.predict(scaled_X)
print(f1_score(y1, train_pred))
coef_labels = list(scaled_X.columns) + ['intercept']
coef_values = list(logit.coef_[0]) + list(logit.intercept_)
print(dict(sorted(zip(coef_labels, coef_values), key=lambda kv: abs(kv[1]), reverse=True)))
confusion_matrix(y1, train_pred)

#0.22043443282381336 - No PCA{'pol_payd': -1.2824996176580108, 1: -1.2809208687711864, 2: -1.2502802183077386, 9: -1.2106389710837047, 'M': -1.2028787104330692, 'drv_drv2': -1.1717891919876045, 'tourism': -1.1717891919876045, 6: -1.1717891919876045, 8: -1.1717891919876045, 12: -1.1717891919876045, 13: -1.1717891919876045, 4: -1.1690840663049051, 'Retired': -1.1513352412308835, 7: -1.139096142954687, 11: -1.1181827981955113, 3: -1.109005694367409, 5: -1.0805319282957322, 10: -1.0399889096109443, 'diesel': -0.9989086254790716, 0: -0.9861306618692427, 'order_pol_coverage': -0.9488964084112984}
#booleanFeatures = ['pol_payd','Professional','Retired','M','diesel','order_pol_coverage','tourism']#+['Maxi', 'Median1', 'Median2', 'Mini'] # omitted - + ['drv_drv2','vh_make','vh_model','canton_code','commune_code',city_district_code','regional_department_code','Biannual', 'Monthly','Quarterly']
#numericFeatures = ['pol_duration', 'vh_din', 'pol_bonus', 'population', 'pol_sit_duration', 'vh_value', 'vh_sale_begin', 'vh_sale_end', 'vh_cyl', 'vh_speed', 'drv_age1', 'vh_weight', 'vh_age', 'drv_age_lic1'] #+ ['order_pol_coverage','town_surface_area','town_mean_altitude']

#logit = LogisticRegressionCV(cv=4,penalty='elasticnet',l1_ratios=[0.5],solver='saga',scoring='f1',max_iter=2000,class_weight={0:1,1:8},n_jobs=-1)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score,confusion_matrix
# L2 logistic regression (2-fold CV on F1) on the log-transformed, standardised features.
log_scaled_X = df4[numericFeatures + booleanFeatures]
logit = LogisticRegressionCV(
    cv=2,
    penalty='l2',
    solver='saga',
    scoring='f1',
    max_iter=3000,
    class_weight={0: 1, 1: 8},
    n_jobs=-1,
)
logit.fit(log_scaled_X, y1)
# print(dict(zip(list(numericFeatures)+['intercept'],list(logit.coef_[0])+list(logit.intercept_))))

preds = logit.predict(log_scaled_X)
print(f1_score(y1, preds))
coef_labels = list(log_scaled_X.columns) + ['intercept']
coef_values = list(logit.coef_[0]) + list(logit.intercept_)
print(dict(sorted(zip(coef_labels, coef_values), key=lambda kv: abs(kv[1]), reverse=True)), "\n")
print(classification_report(y1, preds))
confusion_matrix(y1, preds)

#0.22043443282381336 - No PCA{'pol_payd': -1.2824996176580108, 1: -1.2809208687711864, 2: -1.2502802183077386, 9: -1.2106389710837047, 'M': -1.2028787104330692, 'drv_drv2': -1.1717891919876045, 'tourism': -1.1717891919876045, 6: -1.1717891919876045, 8: -1.1717891919876045, 12: -1.1717891919876045, 13: -1.1717891919876045, 4: -1.1690840663049051, 'Retired': -1.1513352412308835, 7: -1.139096142954687, 11: -1.1181827981955113, 3: -1.109005694367409, 5: -1.0805319282957322, 10: -1.0399889096109443, 'diesel': -0.9989086254790716, 0: -0.9861306618692427, 'order_pol_coverage': -0.9488964084112984}
#booleanFeatures = ['pol_payd','Professional','Retired','M','diesel','order_pol_coverage','tourism']#+['Maxi', 'Median1', 'Median2', 'Mini'] # omitted - + ['drv_drv2','vh_make','vh_model','canton_code','commune_code',city_district_code','regional_department_code','Biannual', 'Monthly','Quarterly']
#numericFeatures = ['pol_duration', 'vh_din', 'pol_bonus', 'population', 'pol_sit_duration', 'vh_value', 'vh_sale_begin', 'vh_sale_end', 'vh_cyl', 'vh_speed', 'drv_age1', 'vh_weight', 'vh_age', 'drv_age_lic1'] #+ ['order_pol_coverage','town_surface_area','town_mean_altitude']

#logit = LogisticRegressionCV(cv=4,penalty='elasticnet',l1_ratios=[0.5],solver='saga',scoring='f1',max_iter=2000,class_weight={0:1,1:8},n_jobs=-1)

In [None]:
# Store the predicted claim probability as an extra column of X3.
# `logit` was last fitted on df4[numericFeatures+booleanFeatures], so it is scored on
# that same matrix; X3 has a different column set. df4 and X3 share row order, so the
# positional assignment lines up. Reading from df4 also keeps the cell re-runnable.
X3['preds'] = logit.predict_proba(df4[numericFeatures + booleanFeatures])[:, 1]

In [None]:
# Map each policy to 150 when the predicted probability exceeds 0.5, else 100, and summarise.
# NOTE(review): 50 and 100 look like a trial pricing rule - confirm their meaning.
flagged = X3['preds'] > 0.5
(flagged * 50 + 100).describe()

In [None]:
# Scaled features plus the predicted claim probability, for claim rows only.
# 'preds' is stored on X3 (df3 has no such column), so it is joined in by row label.
df3[numericFeatures + booleanFeatures].assign(preds=X3['preds']).loc[y3.index]

In [None]:
# Re-wrap X4 as a DataFrame whose rows carry X3's index.
# NOTE(review): this relabels rows if X4 is an ndarray at this point; if X4 is already a
# DataFrame (as built in the preprocessing cell) it reindexes to X3.index instead - confirm intent.
X4 = pd.DataFrame(X4,index=X3.index)

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, ElasticNet,Lasso, BayesianRidge

y3 = y2[y2 != 0]
# Regress log(1 + claim_amount) on X6. The target is selected by X6's row labels so
# features and target always have matching rows, whether X6 holds all policies or
# only the claim rows.
severity_target = np.log(y2.loc[X6.index] + 1)
lr = LinearRegression()
# lr.fit(X6.loc[y3.index],y3)
lr.fit(X6, severity_target)
# print(r2_score(y3,lr.predict(X6.loc[y3.index])))
print(r2_score(severity_target, lr.predict(X6)))

In [None]:
# Store the predicted (log) severity on X6. Any 'severity' column left by an earlier
# run is excluded from the model input, so the cell can be re-run without feeding the
# prediction back in as an extra feature.
X6['severity'] = lr.predict(X6.drop(columns=['severity'], errors='ignore'))

In [None]:
# Correlation between predicted severity and the claim indicator (aligned on row labels).
predicted_severity = X6['severity']
predicted_severity.corr(y1)

In [None]:
# Claim amounts restricted to policies with a non-zero amount.
has_amount = y2 != 0
y3 = y2[has_amount]

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
# Gaussian-process regression of log claim amount on the X3 rows that have a claim.
gpr = GaussianProcessRegressor(copy_X_train=False)
claim_features = X3.loc[y3.index]
log_claims = np.log(y3)
gpr.fit(claim_features, log_claims)

In [None]:
# In-sample GP predictions (log scale) for the claim rows.
claim_features = X3.loc[y3.index]
preds = gpr.predict(claim_features)

In [None]:
# Score the GP on the first 1000 policies, back-transformed from the log scale.
head_truth = y2.iloc[:1000]
head_pred = np.exp(gpr.predict(X3.iloc[:1000]))
print(r2_score(head_truth, head_pred))
plt.scatter(head_pred, head_truth)

In [None]:
# LogisticRegressionCV is the estimator actually used below, so it is imported here
# rather than relying on an earlier cell having imported it.
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.calibration import CalibratedClassifierCV

logit = LogisticRegressionCV(
    cv=2,
    penalty='l2',
    solver='saga',
    scoring='f1',
    max_iter=1000,
    class_weight={0: 1, 1: 9.1},
    n_jobs=-1,
)
# Sigmoid (Platt) calibration wrapped around the CV-tuned logistic model.
clf = CalibratedClassifierCV(logit, cv=2, method='sigmoid')
# NOTE(review): this fits on every column of df4, not just numericFeatures+booleanFeatures
# as the other models do - confirm that is intended.
clf.fit(df4, y1)

# Ensemble Methods - Gradient Boosting, Random Forest, XGBoost

In [8]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier,ExtraTreesClassifier
# Per-row weights: 1 for no-claim rows, 8 for claim rows, 0 for anything else.
claim_flags = y1.to_numpy()
sample_weights = np.where(claim_flags == 1, 8.0, np.where(claim_flags == 0, 1.0, 0.0))

gbc = GradientBoostingClassifier(n_estimators=500, max_depth=5, learning_rate=0.1)
gbc.fit(X4, y1, sample_weight=sample_weights)

# In-sample evaluation: probabilities for the R2 line, 0.5-thresholded labels for the rest.
preds = gbc.predict_proba(X4)[:, 1]
hard_preds = np.round(preds)
print(r2_score(y1, preds))
print(f1_score(y1, hard_preds))
print(classification_report(y1, hard_preds))
confusion_matrix(y1, hard_preds)


-0.737667974674193
0.428299794010458
              precision    recall  f1-score   support

           0       0.97      0.83      0.89     72125
           1       0.30      0.74      0.43      7260

    accuracy                           0.82     79385
   macro avg       0.64      0.79      0.66     79385
weighted avg       0.91      0.82      0.85     79385



array([[59547, 12578],
       [ 1854,  5406]])

In [None]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated F1 for the gradient boosting classifier (no sample weights passed).
scores = cross_val_score(estimator=gbc, X=X4, y=y1, scoring='f1', cv=3)
print(scores.mean())

In [None]:
# Feature importances of the gradient boosting classifier fitted above. Labels are X4's
# columns, the frame gbc was trained on. `gbr` is only created in a later cell and was
# never trained with a 'preds' column.
fig, ax = plt.subplots(figsize=(20, 10))
rfFeatImportances = dict(sorted(zip(X4.columns, gbc.feature_importances_), key=lambda kv: kv[1]))
ax.barh(list(rfFeatImportances.keys()), list(rfFeatImportances.values()))

In [None]:
from sklearn.tree import plot_tree
# Draw the first regression tree of the boosted ensemble on a wide canvas.
fig, ax = plt.subplots(figsize=(20, 10))
first_stage_tree = gbc.estimators_[0][0]
plot_tree(first_stage_tree, ax=ax)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting regressor on claim amount, initialised from a linear regression.
# `loss` is left at its default (least squares). The old spelling 'ls' is rejected by
# newer scikit-learn releases, while the default means the same loss in every version.
lr = LinearRegression()
gbr = GradientBoostingRegressor(n_estimators=300, max_depth=3, init=lr)
train_X = df3[numericFeatures + booleanFeatures]
gbr.fit(train_X, y2)
# Score on the same matrix the model was fitted on (in-sample R2).
preds = gbr.predict(train_X)
print("R2 score", r2_score(y2, preds))
plt.scatter(preds, y2)
#R2 score 0.5782434142409241
#gbr = GradientBoostingRegressor(n_estimators=300,max_depth=3,loss='ls',init=lr)
#numericFeatures+booleanFeatures+['preds']

In [None]:
# Plain linear regression of claim amount on the scaled features (in-sample fit).
lr = LinearRegression().fit(X4, y2)
preds = lr.predict(X4)
in_sample_r2 = r2_score(y2, preds)
print("R2 score", in_sample_r2)
plt.scatter(preds, y2)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier,BaggingClassifier,AdaBoostClassifier

In [None]:
# Extra-trees classifier: 3-fold CV F1 first, then an in-sample fit and confusion matrix.
tree_inputs = df3[numericFeatures + booleanFeatures]
hgbc = ExtraTreesClassifier(max_depth=10, n_estimators=100, class_weight={0: 1, 1: 9})

scores = cross_val_score(hgbc, tree_inputs, y1, scoring='f1', cv=3)
print(scores.mean())
hgbc.fit(tree_inputs, y1)
preds = hgbc.predict(tree_inputs)
print("F1 score", f1_score(y1, preds))
confusion_matrix(y1, preds)

In [None]:
# Horizontal bar chart of extra-trees feature importances, smallest first.
fig, ax = plt.subplots(figsize=(20, 10))
named_importances = zip(numericFeatures + booleanFeatures, hgbc.feature_importances_)
rfFeatImportances = dict(sorted(named_importances, key=lambda pair: pair[1]))
ax.barh(list(rfFeatImportances.keys()), list(rfFeatImportances.values()))

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
# XGBoost classifier on the unscaled features.
xgb_train = df2[numericFeatures + booleanFeatures]
xbgc = XGBClassifier(max_depth=8, base_score=0.9)

xbgc.fit(xgb_train, y1)

In [None]:
# In-sample class predictions from the XGBoost model.
xgb_inputs = df2[numericFeatures + booleanFeatures]
xbgc.predict(xgb_inputs)
# print(f1_score(y1,preds))
# confusion_matrix(y1,preds)

In [None]:
# The random forest uses every column of the prepared frame.
rfFeatures = df3.columns.tolist()
# rfFeatures = rfFeatures2

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix
# Class-weighted random forest. `max_features` is left at its default: the explicit
# 'auto' is rejected by newer scikit-learn releases, and for a classifier the default
# selects the same sqrt(n_features) rule in every version.
rf = RandomForestClassifier(
    n_estimators=500,
    class_weight={0: 0.1, 1: 0.9},
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=1,
)
# rfFeatures = ['town_mean_altitude', 'Retired', 'town_surface_area', 'tourism', 'WorkPrivate', 'M', 'pol_duration', 'Median1', 'Professional', 'population', 'Median2', 'pol_bonus', 'diesel', 'vh_cyl', 'vh_weight', 'vh_speed', 'vh_din', 'vh_value', 'Mini', 'Maxi', 'order_pol_coverage', 'vh_age', 'vh_sale_end', 'vh_sale_begin']

# 4-fold cross-validated F1 on the standardised features.
scores = cross_val_score(rf, df3[numericFeatures + booleanFeatures], y1, scoring='f1', cv=4)
print(scores.mean())
#0.21783976920072673 - All Features - except pol_payd,drv_drv2,  n_estimators=300,class_weight={0:0.1,1:0.9},max_depth=8

In [None]:
#F1 score 0.24166413296848077, 4769 true positives 27439 false positives
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Fit and score on the same matrix. df3 and df4 are scaled differently (df4 is
# log-transformed first), so a forest trained on df4 must also predict on df4.
rf_inputs = df4[rfFeatures]
rf.fit(rf_inputs, y1)

preds = rf.predict(rf_inputs)
print("F1 score", f1_score(y1, preds))

confusion_matrix(y1, preds)

In [None]:
# Horizontal bar chart of random-forest feature importances, smallest first.
fig, ax = plt.subplots(figsize=(20, 10))
named_importances = zip(rfFeatures, rf.feature_importances_)
rfFeatImportances = dict(sorted(named_importances, key=lambda pair: pair[1]))
ax.barh(list(rfFeatImportances.keys()), list(rfFeatImportances.values()))

# Naive bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB,ComplementNB,CategoricalNB,GaussianNB
from sklearn.metrics import confusion_matrix, f1_score
bnb = ComplementNB(class_prior=[0.5,0.5])
# bnb = GaussianNB(priors=[0.8,0.2])

# Earlier candidate list, kept for reference:
# booleanFeatures2 = ['pol_payd','Maxi', 'Median1', 'Median2', 'Mini','tourism','drv_drv2','Monthly','vh_age','vh_value']
booleanFeatures2 = ['order_pol_coverage']

# ComplementNB rejects negative inputs and X4 is z-scored, so rescale every column to
# [0, 1] first. Constant columns get a divisor of 1 to avoid dividing by zero.
column_min = X4.min()
column_range = (X4.max() - column_min).replace(0, 1)
X4_nonneg = (X4 - column_min) / column_range

bnb.fit(X4_nonneg, y1)

print(bnb.score(X4_nonneg, y1)) # accuracy - 0.6396170561189142
preds = bnb.predict_proba(X4_nonneg)[:, 1]  # probability of a claim
hard_preds = bnb.predict(X4_nonneg)
print(f1_score(y1, hard_preds)) #f1 - 0.21801284679513463 - GaussianNB

confusion_matrix(y1, hard_preds)



# Quadratic/Linear Discriminant Analysis

In [None]:
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

# Linear discriminant analysis with hand-set class priors; evaluated in-sample.
lda = LinearDiscriminantAnalysis(store_covariance=True, priors=[0.55, 0.45])
# lda = QuadraticDiscriminantAnalysis(priors=[0.75,0.25],store_covariance=True)
lda.fit(X4, y1)
print(lda.score(X4, y1))
lda_labels = lda.predict(X4)
print(f1_score(y1, lda_labels)) #f1 - 0.21100518481351396
confusion_matrix(y1, lda_labels)

#0.22177432255174842 - lda = LinearDiscriminantAnalysis(priors=[0.55,0.45],store_covariance=True)

#and 0.21121506055214373 - lda = QuadraticDiscriminantAnalysis(priors=[0.75,0.25],store_covariance=True)

## KNeighbors 

In [None]:
from sklearn.decomposition import PCA
# Standardise the numeric features and rotate them onto their principal components.
X = df2[numericFeatures]
X = ((X-X.mean())/X.std())
pca = PCA()
X2 = pca.fit_transform(X)


from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor, RadiusNeighborsRegressor#NeighborhoodComponentsAnalysis, NearestCentroid

from sklearn.model_selection import cross_val_score
# nca = NeighborhoodComponentsAnalysis(init='pca')
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X,y1,test_size=0.9)
# nc = NearestCentroid(metric='euclidean')
# nc.fit(X2,y1)
# print(f1_score(y1,nc.predict(X2)))
# confusion_matrix(y1,nc.predict(X2))

# The classifier has to be constructed before it is fitted; with this line commented
# out the cell only ran when `knn` was left over from an earlier kernel session.
knn = KNeighborsClassifier(n_neighbors=3,weights='uniform',n_jobs=-1,p=1)
# scores = cross_val_score(knn,X2,y1,scoring='f1')
# print(scores.mean())
knn.fit(X2,y1)
# plt.scatter(X2[:,0],X2[:,1],c=df2['made_claim'],alpha=0.5)

# In-sample evaluation.
preds = knn.predict(X2)
print(f1_score(y1,preds))
confusion_matrix(y1,preds)

# #0.3240575448913158, p=1 n_neighbors=3 f1_score on pcaed componentsd

### SVM

In [None]:
from sklearn.svm import LinearSVC,NuSVC,OneClassSVM
# L1-penalised linear SVM (primal form), class 1 weighted 8x.
svc = LinearSVC(
    penalty='l1',
    C=0.5,
    dual=False,
    max_iter=3000,
    class_weight={0: 1, 1: 8},
) #0.21847693375787028
# svc = NuSVC(nu=0.15,kernel='rbf',class_weight={0:1,1:1}) #0.19431386636559583
# svc = OneClassSVM(kernel='linear',cache_size=500)
svc.fit(X4, y1)

In [None]:
# In-sample evaluation of the linear SVM on its hard class predictions.
preds = svc.predict(X4)
for metric in (r2_score, f1_score, classification_report):
    print(metric(y1, preds))
confusion_matrix(y1, preds)

### Other -  SGD / Perceptron, Gaussian Mixture

In [None]:
# from sklearn.linear_model import Perceptron, SGDClassifier
# gpc = SGDClassifier(loss='log',penalty='l2',early_stopping=True,class_weight={0:1,1:9},max_iter=2000)
# gpc.fit(X3,y1)
# print(f1_score(y1,gpc.predict(X3)))
# confusion_matrix(y1,gpc.predict(X3))
# dict(sorted(zip(list(X3.columns)+['intercept'],list(gpc.coef_[0])+list(gpc.intercept_)),key=lambda k: abs(k[1]),reverse=True))


# from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

# gm = BayesianGaussianMixture(n_components=2)
# #f1 - 0.18451121455776556
# gm.fit(df2[booleanFeatures2],y1)
# print(f1_score(y1,gm.predict(df2[booleanFeatures2])))
# confusion_matrix(y1,gm.predict(df2[booleanFeatures2]))

# Model

1. Use Logistic Regression to predict probability, then use Linear Regression to predict severity including the probability
    1. Use Linear Regression to predict severity, then use Logistic Regression to predict probability including predicted severity
2. Same but Use Gradient Boosting Regression

### Possible Classifiers

[x] Logistic Regression  - 0.2 F1 so far

[x] Naive bayes - some success using ComplementNB

[x] GradientBoosting 

[x] Random Forest, ExtraTrees - similar performance

[x] XGBOOST

[x] SVC 

[x] KNeighbors - worked on numericFeatures - too large for submission

Gaussian Mixture - Data not gaussian

Gaussian Process - too slow

SGD, Perceptron - didn't really work

HistGradientBoosting - did not work

BaggingClassifier, Adaboost - slow

### Possible Metrics
Accuracy - fraction of correct predictions - with imbalanced classes it pushes the model to always predict the default class 0

Balanced Accuracy

Precision - ability to not label a negative sample as positive (few false positives)
Recall - ability to find all the positive samples (few false negatives)

*F1 score* - harmonic mean of precision and recall

ROC curve -  performance of a binary classifier system as its discrimination threshold is varied

roc_auc_curve score

brier_score_loss

hinge-loss

roc_auc_score


## Feature Engineering

PCA

Polynomial Features / Interactions