In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
data = pd.read_csv('/content/Root_Insurance_data.csv')

In this notebook, we'll try to find probability of customer buying policies given they click our ad for each class.  

 ## Buy given click

We clean the data and get columns we need and transform categorical variables into dummy variables.

In [None]:
xdf = data[["click","policies_sold"]].copy()
xdf[["insured","unknown"]] = pd.get_dummies(data["Currently Insured"])[["Y","unknown"]]
xdf[["cars2","cars3"]] = pd.get_dummies(data["Number of Vehicles"])[[2,3]]
xdf[["drivers2"]] = pd.get_dummies(data["Number of Drivers"])[[1]]
xdf[["married"]] = pd.get_dummies(data["Marital Status"])[["M"]]
# xdf[["rank1","rank2","rank3","rank4","rank5"]] = pd.get_dummies(data["rank"])

# logistic regression

We tried firstly logistic regression and get results as follows.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(xdf.drop(columns=["click","policies_sold"]))
poly_features = pd.DataFrame(poly_features, columns=poly.get_feature_names(xdf.columns[2:]))
poly_features.drop(columns=['insured unknown','cars2 cars3'], inplace=True)


In [None]:
import statsmodels.api as sm

glm = sm.Logit(xdf["policies_sold"].loc[xdf.click],
  sm.add_constant(poly_features.loc[xdf.click])).fit()
print(glm.summary())

Optimization terminated successfully.
         Current function value: 0.664257
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          policies_sold   No. Observations:                 1878
Model:                          Logit   Df Residuals:                     1858
Method:                           MLE   Df Model:                           19
Date:                Wed, 19 May 2021   Pseudo R-squ.:                 0.02212
Time:                        22:49:01   Log-Likelihood:                -1247.5
converged:                       True   LL-Null:                       -1275.7
Covariance Type:            nonrobust   LLR p-value:                 1.396e-05
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -0.3876      0.191     -2.026      0.043      -0.763      -0.013
insured    

In [None]:
poly_features.sum(axis=0)

insured             3419.0
unknown             3137.0
cars2               3464.0
cars3               3095.0
drivers2            5001.0
married             5191.0
insured unknown        0.0
insured cars2       1171.0
insured cars3       1107.0
insured drivers2    1675.0
insured married     1740.0
unknown cars2       1146.0
unknown cars3        820.0
unknown drivers2    1669.0
unknown married     1726.0
cars2 cars3            0.0
cars2 drivers2      1637.0
cars2 married       1734.0
cars3 drivers2      1655.0
cars3 married       1725.0
drivers2 married    2529.0
dtype: float64

However, logistic regression relies on linear relationship between features and result, which does seem relevant in our case. 

# random forest

This time we try random forest method with bagging. Random Forest or tree-based method does assume linear relationship, that's why we prefer it here.

In [None]:
xdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   click          10000 non-null  bool 
 1   policies_sold  10000 non-null  int64
 2   insured        10000 non-null  uint8
 3   unknown        10000 non-null  uint8
 4   cars2          10000 non-null  uint8
 5   cars3          10000 non-null  uint8
 6   drivers2       10000 non-null  uint8
 7   married        10000 non-null  uint8
dtypes: bool(1), int64(1), uint8(6)
memory usage: 146.6 KB


In [None]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier

rf_clf = RandomForestClassifier(bootstrap=True)
rf_clf.fit(xdf.drop(columns=['click','policies_sold']).loc[xdf.click], xdf['policies_sold'].loc[xdf.click])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# check accuracy on training data
pred = rf_clf.predict(xdf.drop(columns=['click','policies_sold']).loc[xdf.click])
np.mean(pred==xdf['policies_sold'].loc[xdf.click])

0.6043663471778488

In [None]:
# predict on classes
X = pd.DataFrame(columns=['insured','cars','drivers','married'])

for insured in ['insured', 'N', 'unknown']:
  for cars in ['cars1','cars2','cars3']:
    for drivers in ['drivers1','drivers2']:
      for married in ['married','single']:
        X.loc[len(X)] = [insured, cars, drivers, married]

X = pd.get_dummies(X,prefix='',prefix_sep='').drop(columns=['N','cars1','drivers1','single'])

p = rf_clf.predict_proba(X.values)[:,1]

X['prob_to_buy'] = p

In [33]:
X.to_csv('classes with probability.csv', index=False)

In [None]:
rf_clf.classes_

array([0, 1])