# Churn model for Insurance Company

The first attempt will be to try to predict the drivers of churn using a classification method. The data contains flags relevant to customer demographics, claim and service behaviour, and policy characteristics.

In [121]:
import pandas as pd

###### **Changes from dataset version 2 to version 6:**

The data source had to be changed in order to address a few issues/opportunities:
- the field AvgTurnaround had too many null values. Considering that only customers who claim will have this information, I couldn't find a way to keep the information in. So, for this reason, it has been excluded from the dataset
- the field CXMonthsFromRenewal also contained too many null values. The objective of the field is to provide the number of months that the policy is into its policy period, which expires every 12 months. There is a hypothesis that customers are more likely to cancel when their policy is about to expire
- the data size - reduce to only use policies with 0 to 5 years of tenure
- added data about latest claim and further demographics

In [122]:
# Load the version-6 policy extract (read_table splits on tabs by default);
# the 'no_' column becomes the row index.
policies = pd.read_table('ChurnModelDatasetV6.txt',index_col='no_')

In [None]:
### policies.shape
#policies.values            # underlying numpy array
# Print column names, dtypes and non-null counts of the freshly loaded frame.
policies.info() 

**Interpreting the quantiles**
- Ntile5LastClaimAmount - quintile (5 groups), 1 = lowest value
- Ntile5LastPaidAmount - quintile (5 groups), 1 = lowest value
- PremiumDecile - decile (10 groups), 1 = lowest value

In [None]:
## remove columns that won't be used
# Replace spaces in the headers first so every column can be reached as an
# attribute (policies.Some_Name) in the cells that follow.
policies.columns = [c.replace(' ', '_') for c in policies.columns]

# NOTE(review): 'Insured_Gender' is dropped here while later cells read
# 'InsuredGender' - presumably two distinct columns in the extract; confirm.
unused_columns = ['premium', 'ClientAgeInMonths', 'PetAgeInMonths', 'petAgeInYears',
                  'ClientAgeInYears', 'Insured_Gender', 'Brand']

# Drop everything in a single call, and only the columns that are still present,
# so re-running this cell does not fail on columns an earlier run already removed.
policies = policies.drop([c for c in unused_columns if c in policies.columns], axis=1)
policies.info()

In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot') # This styles the graphs in a nicer format

In [None]:
# Histogram of CXMonthsFromRenewal for the rows where Cancelled == 1, x axis limited to 0-20.
policies.loc[policies.Cancelled == 1, 'CXMonthsFromRenewal'].plot(kind='hist', xlim=(0, 20))
#policies.CXMonthsFromRenewal.hist(by=policies.Cancelled, sharex=True, sharey=True)
plt.xlabel('CXMonthsFromRenewal')


In [None]:
# Explore the categorical columns before they are recoded to integers
# (this cell only inspects the data; nothing is transformed here).
from IPython.display import display

# One figure per chart: Series.plot() draws on the current axes, so without
# plt.figure() the three bar charts are painted on top of each other.
for column, title in [('LastClaimStatus', 'Last Claim Status'),
                      ('InsuredGender', 'Gender'),
                      ('PlanType', 'PlanType')]:
    plt.figure()
    policies[column].value_counts().plot(kind='bar', title=title)

policies.info()

# A cell only echoes its last expression, so each frequency table is displayed
# explicitly instead of being computed and thrown away.
for column in ['PlanType', 'premium_payment_period_code', 'payment_method_code',
               'species_code', 'State', 'ClientAgeInGroup', 'PetAgeGroup']:
    display(policies[column].value_counts())

In [None]:
# Recode each categorical column from its text labels to integer codes.
# Values that do not appear in a mapping are left unchanged by replace().
code_maps = {
    'LastClaimStatus': {'No Claims': 1, 'Accepted with Pay': 2, 'Denied': 3,
                        'Accepted no Pay': 4, 'Other': 0},
    'InsuredGender': {'Female': 1, 'Male': 2},
    # The ASIA plans share the code of their non-ASIA counterpart; every plan
    # that is neither "Accident" nor "Accident & GVE (& Routine)" is grouped as 4.
    'PlanType': {'Accident & GVE': 1, 'Accident & GVE & Routine': 2, 'Accident': 3,
                 'ASIA Accident & GVE': 1, 'ASIA Accident & GVE & Routine': 2,
                 'ASIA Accident': 3, 'Accident & Routine': 4, 'GVE & Routine': 4, 'GVE': 4},
    'premium_payment_period_code': {'FORTNIGHTL': 1, 'MONTHLY': 2, 'ANNUALLY': 3},
    'payment_method_code': {'CC': 1, 'BANK': 2, 'CHEQUE': 3},
    'species_code': {'CANINE': 1, 'FELINE': 2},
    'State': {'NSW': 1, 'VIC': 2, 'QLD': 3, 'WA': 4, 'SA': 5, 'ACT': 6, 'TAS': 7, 'NT': 8},
    'ClientAgeInGroup': {'00-19': 1, '20-29': 2, '30-39': 3, '40-49': 4, '50-59': 5, '60-100': 6},
    'PetAgeGroup': {'00-01': 1, '01-02': 2, '02-03': 3, '03-04': 4, '04-05': 5, '05-06': 6,
                    '06-07': 7, '07-08': 8, '08-09': 9, '09-10': 10, '10-11': 11, '11-100': 12},
}

for column, codes in code_maps.items():
    # Assign the recoded column back to the frame. Calling replace(inplace=True)
    # on `policies.<column>` mutates an intermediate Series and only reaches the
    # frame when pandas happens to hand back a view.
    policies[column] = policies[column].replace(codes)

# Class balance of the target, printed because only the last expression is echoed.
print(policies.Cancelled.value_counts())
policies.describe()

**From String to values**
- LastClaimStatus: No Claims = 1, Accepted with Pay = 2, Denied = 3, Accepted no Pay = 4, Other = 0
- InsuredGender: Female = 1, Male = 2
- PlanType: Accident & GVE = 1, Accident & GVE & Routine = 2, Accident = 3, ASIA Accident & GVE = 1, ASIA Accident & GVE & Routine = 2, ASIA Accident = 3, Accident & Routine =4, GVE & Routine = 4, GVE = 4
- premium_payment_period_code: FORTNIGHTL = 1, MONTHLY = 2, ANNUALLY = 3
- payment_method_code: CC = 1, BANK = 2, CHEQUE = 3
- species_code: CANINE = 1, FELINE = 2
- State: NSW = 1, VIC = 2, QLD = 3, WA = 4, SA = 5, ACT = 6, TAS = 7, NT = 8
- ClientAgeInGroup: 00-19 = 1, 20-29 = 2, 30-39 = 3, 40-49 = 4, 50-59 = 5, 60-100 = 6
- PetAgeGroup: 00-01 = 1, 01-02 = 2, 02-03 = 3, 03-04 = 4, 04-05 = 5, 05-06 = 6, 06-07 = 7, 07-08 = 8, 08-09 = 9, 09-10 = 10, 10-11 = 11, 11-100 = 12

In [None]:
# Density plot of BenefitToClaimRatio with the x axis limited to the 0-1 range.
policies['BenefitToClaimRatio'].plot(kind='density', xlim=(0, 1))
plt.xlabel('Benefit To Claim Ratio')

In [None]:
# Re-check column dtypes and non-null counts at this point of the notebook.
policies.info()

In [None]:
# One histogram panel per value of Cancelled; the axes are shared so the panels are directly comparable.
policies.Ntile5LastPaidAmount.hist(by=policies.Cancelled, sharex=True, sharey=True)

In [None]:
# Distribution of OtherPets, one panel per value of Cancelled, on shared axes.
policies.OtherPets.hist(by=policies.Cancelled, sharex=True, sharey=True)

In [None]:
# Colour each point by species: red where species_code == 1 (canine), blue otherwise.
colors = np.where(policies['species_code'] == 1, 'r', 'b')
policies.plot(x='TenureInMonths', y='MOnthsSinceLastClaim', kind='scatter', c=colors)

In [None]:
#pd.scatter_matrix(policies, figsize=(30,30))

In [None]:
import statsmodels.formula.api as smf

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
policies.describe()

In [None]:
# Pairwise correlation matrix of the numeric columns (Pearson by default).
policies.corr()

In [None]:
# Ordinary least squares with Cancelled as the response and five predictors.
ols_terms = ['CXMonthsFromRenewal', 'OtherPets', 'MOnthsSinceLastClaim',
             'LastClaimStatus', 'PetAgeGroup']
lm = smf.ols(formula='Cancelled ~ ' + ' + '.join(ols_terms), data=policies).fit()

# show the fitted intercept and slopes
lm.params

In [None]:
# Full regression report for the OLS fit (coefficients, standard errors, p-values, R-squared).
lm.summary()

In [None]:
policies.Cancelled.value_counts()
# cancelled policies make up to 72.4% of the dataset so the model isn't really predicting any better
# NOTE(review): the confusion-matrix figures quoted further down (177710 + 14609 of
# 265649 rows = 72.4%) appear to be the class-0 row - confirm which value of
# Cancelled actually marks a cancelled policy.

In [None]:
# NOTE(review): seaborn is not imported in any cell visible here - confirm this install is still needed.
! pip install seaborn

In [None]:
# Raw view of the target against tenure: one point per policy.
plt.scatter(policies.TenureInMonths, policies.Cancelled)

In [None]:
from sklearn.linear_model import LogisticRegression
# C is the inverse regularisation strength, so C=1e9 makes the penalty negligible.
logreg = LogisticRegression(C=1e9)

# Single-predictor model: Cancelled as a function of MOnthsSinceLastClaim only.
feature_cols = ['MOnthsSinceLastClaim']
X = policies[feature_cols]
y = policies.Cancelled
logreg.fit(X, y)
# Class labels predicted for the same rows the model was fitted on (in-sample).
policies_pred_class = logreg.predict(X)

In [None]:
# plot the class predictions
plt.scatter(policies.MOnthsSinceLastClaim, policies.Cancelled)

# plt.plot joins the points in row order, so order them by x first; otherwise
# the red line zig-zags across the chart instead of tracing the prediction.
order = np.argsort(policies.MOnthsSinceLastClaim.values)
plt.plot(policies.MOnthsSinceLastClaim.values[order], policies_pred_class[order], color='red')

In [None]:
# predict_proba returns one column per entry of logreg.classes_; keep the second
# column (index 1), i.e. the probability of the second class.
policies_pred_prob = logreg.predict_proba(X)[:, 1]
policies_pred_prob

In [None]:
# plot the predicted probabilities
plt.scatter(policies.MOnthsSinceLastClaim, policies.Cancelled)

# plt.plot joins the points in row order, so order them by x first; otherwise
# the red line zig-zags instead of drawing the fitted probability curve.
order = np.argsort(policies.MOnthsSinceLastClaim.values)
plt.plot(policies.MOnthsSinceLastClaim.values[order], policies_pred_prob[order], color='red')

In [None]:
#policies.dropna(inplace=True)
#policies.irsd_decile

In [None]:
#print logreg.predict_proba(11)

In [None]:
# Summary statistics of the numeric columns, checked again before choosing the model features.
policies.describe()

In [None]:
# Predictors for the multi-feature models. This rebinds feature_cols, X and y,
# which the single-feature model above also used.
feature_cols = ['species_code','irsd_decile',
                'State','OtherPets',
                'MOnthsSinceLastClaim','LastClaimStatus','Ntile5LastClaimAmount','Ntile5LastPaidAmount',
                'PortalUser','OtherMembershipNumber','ClientAgeInGroup','PetAgeGroup','PremiumDecile','InsuredGender',
                'payment_method_code','premium_payment_period_code','ClaimedInThePast','PlanType']
X = policies[feature_cols]
# Target: the Cancelled flag.
y = policies.Cancelled

In [None]:
from sklearn.cross_validation import train_test_split
# Hold out part of the rows for testing (train_test_split's default test share is 25%);
# random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
#from sklearn.linear_model import LogisticRegression
#logreg = LogisticRegression()
#logreg.fit(X_train, y_train)
#zip(feature_cols, logreg.coef_[0])

In [None]:
from sklearn.linear_model import LogisticRegression
# Very large C = negligible regularisation; fitted on the training split only.
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)
# NOTE: despite its name, this holds predicted probabilities (second column of
# predict_proba) for the training rows, not class labels.
Cancelled_pred_class = logreg.predict_proba(X_train)[:, 1]
Cancelled_pred_class

In [None]:
import numpy as np
print 'no regularization =', np.sqrt(metrics.mean_squared_error(y_train, Cancelled_pred_class))

In [None]:
from sklearn import metrics
# NOTE(review): X is the full feature matrix, so these predictions include the
# rows logreg was fitted on (X_train). The matrix is therefore not a held-out
# estimate; scoring X_test against y_test would be.
prds = logreg.predict(X)
# Rows are the actual classes, columns the predicted classes, both in sorted label order.
print metrics.confusion_matrix(y, prds)

- Accuracy = (177710+28894)/265649 == 0.78
- Specificity = 177710 / (177710 + 14609) == 0.92

In [125]:
# Share of rows where prds equals y; both cover the full data set, training rows included.
print metrics.accuracy_score(y, prds)

0.777733023652


## Trying Regularization

In [None]:
from sklearn.linear_model import LassoCV
alpha_range = 10.**np.arange(-2, 3)
laspolicies = LassoCV(normalize=True, alphas=alpha_range)
laspolicies.fit(X_train, y_train)
laspolicies.alpha_
laspolicies.coef_
preds2 = laspolicies.predict(X_test)
print 'Lasso CV reg =', np.sqrt(metrics.mean_squared_error(y_test, preds2))

**Ian**, if this is the root mean squared error, does this mean that the log reg with LASSO is predicting better than the one without? If the root mean squared error is 44%, what exactly does it mean? :)

## Trying Decision Trees

In [None]:
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

%matplotlib inline

# Mean of the Cancelled column - assuming a 0/1 flag, this is the share of rows with Cancelled == 1.
policies.Cancelled.mean() 

In [None]:
# Train the tree on the same predictors as the logistic regression so the two
# models can be compared. Passing the whole `policies` frame would leak the
# answer: it still contains the `Cancelled` column that is being predicted.
# `features` is also the name the later importance and graphviz cells expect.
features = list(feature_cols)
cancelled = policies['Cancelled']

X_train, X_test, y_train, y_test = train_test_split(policies[features], cancelled, random_state=1)

# Create a decision tree classifier instance (start out with a small tree for interpretability)
ctree = tree.DecisionTreeClassifier(random_state=1, max_depth=2)

# Fit the decision tree classifier
ctree.fit(X_train, y_train)



In [None]:
# Class labels as learned by the tree; predict_proba columns follow this order.
ctree.classes_

In [None]:
# Which features are the most important?
# One value per training column, in the column order of the frame passed to fit().
ctree.feature_importances_


In [None]:
# Clean up the output
# Pair each importance with the column it was computed for. The names are taken
# from X_train (the frame the tree was fitted on) so both sequences always line
# up; `features` is not defined in any visible cell.
importance_table = pd.DataFrame({'feature': list(X_train.columns),
                                 'importance': ctree.feature_importances_})
importance_table.sort_values('importance', ascending=False)

In [None]:
# Predict a class label for every row of the held-out split
preds = ctree.predict(X_test)

# Fraction of held-out rows whose predicted label matches the actual one
metrics.accuracy_score(y_true=y_test, y_pred=preds)

In [None]:
# Confusion matrix
# Printed explicitly: a cell only echoes its final expression, so without the
# print the table is computed and thrown away and only the AUC is shown.
print(pd.crosstab(y_test, preds, rownames=['actual'], colnames=['predicted']))

# Make predictions on the test set using predict_proba
# (second column = probability of the second class in ctree.classes_)
probs = ctree.predict_proba(X_test)[:,1]

# Calculate the AUC metric
metrics.roc_auc_score(y_test, probs)

# Decision Trees have notoriously high variance, so what can we do
# to better estimate the out of sample error of a high variance model?

In [None]:
import StringIO
from IPython.display import Image
import pydot
# make sure pydot and graphviz are installed 
# if problems see here:
# http://stackoverflow.com/questions/15951748/pydot-and-graphviz-error-couldnt-import-dot-parser-loading-of-dot-files-will/17902926#17902926

# Export the fitted tree to graphviz "dot" text held in memory, then render it as a PNG.
dot_data = StringIO.StringIO()
tree.export_graphviz(
    ctree,
    out_file=dot_data,
    # Column names of the frame the tree was fitted on; `features` is not
    # defined in any visible cell, and these always match the tree's inputs.
    feature_names=list(X_train.columns),
    # NOTE(review): export_graphviz expects class_names in ascending order of
    # ctree.classes_, so "cancelled" labels the lowest class value here -
    # confirm that this matches how Cancelled is encoded.
    class_names=["cancelled","active"],
    filled=True,
    rounded=True,
    special_characters=True
)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Summary statistics of the numeric columns, checked once more before cross-validation.
policies.describe()


In [None]:
from sklearn.linear_model import LogisticRegression
# cross_val_score is called below but was only imported in a commented-out line.
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

logreg = LogisticRegression()
ctree = tree.DecisionTreeClassifier(random_state=1, max_depth=2)

# Cross-validation inputs: the model predictors and the target for ALL rows.
# X and y must have the same number of rows (the whole frame against y_train
# does not), and X must not contain the Cancelled column being predicted.
cv_features = policies[feature_cols]
cv_target = policies.Cancelled

# compare AUC using cross-validation
#cross_val_score(logreg, cv_features, cv_target, cv=10, scoring='roc_auc').mean()
#cross_val_score(ctree, cv_features, cv_target, cv=10, scoring='roc_auc').mean()

# so far logistic regression is winning..

# ----------------------------------------------------------------------
# FINE-TUNING THE TREE
# ----------------------------------------------------------------------

# check CV score for max depth = 3
ctree = tree.DecisionTreeClassifier(random_state=1, max_depth=3)
auc_depth_3 = np.mean(cross_val_score(ctree, cv_features, cv_target, cv=5, scoring='roc_auc'))

# check CV score for max depth = 10
ctree = tree.DecisionTreeClassifier(random_state=1, max_depth=10)
auc_depth_10 = np.mean(cross_val_score(ctree, cv_features, cv_target, cv=5, scoring='roc_auc'))

# Show both mean AUCs; as bare mid-cell expressions only the last one was echoed.
{'max_depth=3': auc_depth_3, 'max_depth=10': auc_depth_10}