In [None]:
#The Project -- Root Insurance Challenge ########################################################

In [192]:
#Import the data
import pandas as pd
#Turn off pandas' chained-assignment warning for the whole notebook
pd.set_option("mode.chained_assignment", None)
#Load the Acme data set into a dataframe and preview the first three rows
data = pd.read_csv("Acme.csv")
print(data.head(n=3))

   impression_id  click  cost  number_of_vehicles  number_of_drivers  rank  \
0              1      1    10                   1                  1     1   
1              2      0    10                   2                  1     4   
2              3      0    10                   1                  1     2   

   policiessold  currently_insured  marital_status  
0             1                  1               1  
1             0                  1               1  
2             0                  1               2  


In [193]:
import matplotlib.pyplot as plt
import numpy as np

In [194]:
#Overall purchase rate among clicks: total policies sold divided by total clicks
totals = data[['click', 'policiessold']].sum()
sum_click = totals['click']
sum_sold = totals['policiessold']
print('The unconditional probability of purchasing after clicking is', sum_sold/sum_click)

The unconditional probability of purchasing after clicking is 0.39665211062590977


In [195]:
##############The probability of purchase conditional on click ###################################

In [196]:
#Purchase frequency among clicked impressions, for each customer type
#Take an explicit copy of the clicked rows: new columns are assigned to data2 further
#down, and writing into a filtered selection of `data` is exactly what the pandas
#chained-assignment warning is about.
data2 = data[data['click'] > 0].copy()
#Numerator: policies sold per type
grouped = data2.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status']).agg({'policiessold': 'sum'})
#Denominator: number of clicked impressions per type
grouped2 = data2.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status']).agg({'click': 'count'})
#Rename so the two frames share a column name and divide element-wise
grouped2_new = grouped2.rename(columns={'click': 'policiessold'})
freq_purch = grouped/grouped2_new
freq_purch = freq_purch.round(4)

#Output into Latex
#print(freq_purch.to_latex())

In [197]:
#Click frequency for each customer type and rank: clicks divided by impressions
click_keys = ['number_of_vehicles','number_of_drivers','currently_insured','marital_status','rank']
by_type_and_rank = data.groupby(click_keys)
grouped = by_type_and_rank.agg({'click': 'sum'})
grouped2 = by_type_and_rank.agg({'click': 'count'})
freq_click = (grouped/grouped2).round(4)

#Output into Latex
#print(freq_click.to_latex())

In [198]:
#Purchase given click, estimated with a classification tree ===========================#
from sklearn import tree
#Features and label are taken from the clicked impressions only
X = data2[['number_of_vehicles','number_of_drivers','currently_insured','marital_status']]
Y = data2[['policiessold']]
clf = tree.DecisionTreeClassifier()
#Fit, then score the same rows the tree was fitted on
probs_trees = clf.fit(X, Y).predict_proba(X)

In [199]:
#Solve this by the random forest ===========================#
from sklearn.ensemble import RandomForestClassifier
import random
random.seed(2991)
#scikit-learn does not draw from Python's `random` module, so the seed above alone
#does not make the forest reproducible; the seed is therefore also passed as random_state.
clf = RandomForestClassifier(random_state=2991)
clf.fit(X, Y.values.ravel())
#predict_proba has one column per class in clf.classes_ (sorted labels);
#with 0/1 labels column 0 is class 0 and column 1 is class 1.
probs_rf = clf.predict_proba(X)[:, 1]
data2['probs_rf'] = probs_rf

In [200]:
#Average the random-forest purchase probability within each customer type
grouped = data2.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status'])
rf_probs = grouped[['probs_rf']].mean().round(4)
#print(rf_probs.to_latex())

In [201]:
#Calculate the probability by weighted SVM ===========================#
from sklearn import svm
#Arithmetic grid of candidate weights strictly between 0 and 1: 1/500, 2/500, ..., 499/500
#The weight 0 is left out on purpose (the all-zero case caused problems)
weight = [step/500.0 for step in range(1, 500)]

In [107]:
#Fit one weighted SVM per candidate weight; column k of y_pred holds the predictions for weight[k]
arr_Y = Y.to_numpy()
y_pred = np.zeros((len(X), len(weight)))
labels_positive = arr_Y.ravel() > 0
for col, w in enumerate(weight):
    #Sample weights: observations with label 1 get 1-w, all others get w
    sw = np.where(labels_positive, 1 - w, w)
    clf_weights = svm.SVC()
    clf_weights.fit(X, arr_Y.ravel(), sample_weight=sw)
    #Predictions are made on the same X used for fitting (no separate test sample)
    y_pred[:, col] = clf_weights.predict(X).ravel()

In [108]:
#Calculate the estimated probability from the weights
#For each row it should be the midpoint between the last weight that predicts 1 and the first weight that predicts 0
#So find, for each row, the position of the first prediction that is not 1
def first_nonone(arr, axis, invalid_val=-1):
    mask = arr!=0
    return np.where(mask.any(axis=axis), mask.argmin(axis=axis), invalid_val)

#Estimated probability per row = midpoint of the weight interval in which the
#prediction switches from 1 to 0.
arr_w = np.array(weight)
FN = first_nonone(y_pred, axis=1, invalid_val=-1) #First non-one
#Settle the two edge cases directly from y_pred so that neither a sentinel nor an
#index of 0 can wrap around to the end of the weight grid:
#  - the row never predicts 0      -> the switch lies above the largest weight
#  - the row predicts 0 at weight[0] -> the switch lies below the smallest weight
never_zero = (y_pred != 0).all(axis=1)
starts_at_zero = y_pred[:, 0] == 0
FN = np.where(never_zero, len(arr_w), np.where(starts_at_zero, 0, FN))
LO = FN - 1 #Last one (-1 means no tested weight predicted 1)
#Bracket the grid with 0 and 1: grid[k+1] == arr_w[k], so grid[FN] is the lower
#end and grid[FN+1] the upper end of the switching interval.
grid = np.concatenate(([0.0], arr_w, [1.0]))
probs_SVM = (grid[FN] + grid[FN + 1])/2
data2['probs_SVM'] = probs_SVM

In [110]:
#Average the SVM-based purchase probability within each customer type
grouped = data2.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status'])
SVM_probs = grouped[['probs_SVM']].mean()
#SVM_probs

In [None]:
###############The probability of click ###################################

In [202]:
#Click probability, estimated with a classification tree
from sklearn import tree
#Features now include the rank; the label is the click indicator, over all impressions
X = data[['number_of_vehicles','number_of_drivers','currently_insured','marital_status','rank']]
Y = data[['click']]
clf = tree.DecisionTreeClassifier()
#Fit, then score the same rows the tree was fitted on
probs_trees = clf.fit(X, Y).predict_proba(X)

In [203]:
#Solve this by the random forest
from sklearn.ensemble import RandomForestClassifier
import random
random.seed(2991)
#scikit-learn does not draw from Python's `random` module, so the seed above alone
#does not make the forest reproducible; the seed is therefore also passed as random_state.
clf = RandomForestClassifier(random_state=2991)
clf.fit(X, Y.values.ravel())
#predict_proba has one column per class in clf.classes_ (sorted labels);
#with 0/1 labels column 1 is the probability of a click.
probs_rf = clf.predict_proba(X)[:, 1]
data['probs_rf'] = probs_rf

In [204]:
#Average the random-forest click probability within each customer type and rank
grouped = data.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status','rank'])
rf_probs = grouped[['probs_rf']].mean().round(4)
#print(rf_probs.to_latex())

In [205]:
#Click rate by rank alone: clicks divided by impressions at each rank
grouped = data.groupby(['rank'])['click'].agg(['sum','count'])
unc_prob_rank = (grouped['sum']/grouped['count']).round(4)

In [206]:
###############The proportion of each group ###################################

In [207]:
#Share of impressions (in percent) that falls into each customer type
grouped = data.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status']).agg({'policiessold': 'count'})
n_impressions = float(grouped['policiessold'].sum())
prop_type = (100*grouped/n_impressions).round(4)
#print(prop_type.to_latex())

In [208]:
#Mean rank observed for each customer type, rounded to four decimals
type_keys = ['number_of_vehicles','number_of_drivers','currently_insured','marital_status']
grouped = data.groupby(type_keys).agg({'rank': 'mean'}).round(4)
#print(grouped.to_latex())

In [209]:
#Get the conditional distribution of rank, for each type, given all potentially cost 10
#Use random forest, needs to have multiple categories!

random.seed(2991)
#scikit-learn does not draw from Python's `random` module, so the seed above alone
#does not make the forest reproducible; the seed is therefore also passed as random_state.
clf = RandomForestClassifier(random_state=2991)
X = data[['number_of_vehicles','number_of_drivers','currently_insured','marital_status']]
Y = data[['rank']]
clf.fit(X, Y.values.ravel())
probs_rank = clf.predict_proba(X)

#Column k of predict_proba belongs to clf.classes_[k]; name each column after the
#rank it actually represents instead of assuming the ranks are exactly 1..5 in order.
#With ranks 1..5 this yields P_r1 ... P_r5.
for col, rank_label in enumerate(clf.classes_):
    data['P_r{}'.format(rank_label)] = probs_rank[:, col]

In [210]:
#Summary: mean predicted probability of each rank within each customer type
rank_prob_cols = ['P_r1','P_r2','P_r3','P_r4','P_r5']
by_type = data.groupby(['number_of_vehicles','number_of_drivers','currently_insured','marital_status'])
grouped = by_type[rank_prob_cols].mean().round(4)
#print(grouped.to_latex())