In [1]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# import data
df = pd.read_csv('data/churn_data.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
# clean up column names
df.columns = df.columns.str.replace(' ', '_')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
state                     3333 non-null object
account_length            3333 non-null int64
area_code                 3333 non-null int64
phone_number              3333 non-null object
international_plan        3333 non-null object
voice_mail_plan           3333 non-null object
number_vmail_messages     3333 non-null int64
total_day_minutes         3333 non-null float64
total_day_calls           3333 non-null int64
total_day_charge          3333 non-null float64
total_eve_minutes         3333 non-null float64
total_eve_calls           3333 non-null int64
total_eve_charge          3333 non-null float64
total_night_minutes       3333 non-null float64
total_night_calls         3333 non-null int64
total_night_charge        3333 non-null float64
total_intl_minutes        3333 non-null float64
total_intl_calls          3333 non-null int64
total_intl_charge         3333 non-null float64

In [5]:
df['state'] = df['state'].astype('category')
df['state_id'] = df['state'].cat.codes
df['international_plan'] = df['international_plan'].replace(('yes', 'no'), (1, 0))
df['international_plan'] = df['international_plan'].astype('int')
df['voice_mail_plan'] = df['voice_mail_plan'].replace(('yes', 'no'), (1, 0))
df['voice_mail_plan'] = df['voice_mail_plan'].astype('int')
df['churn'] = df['churn'].replace((True, False), (1, 0))
df['churn'] = df['churn'].astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 22 columns):
state                     3333 non-null category
account_length            3333 non-null int64
area_code                 3333 non-null int64
phone_number              3333 non-null object
international_plan        3333 non-null int32
voice_mail_plan           3333 non-null int32
number_vmail_messages     3333 non-null int64
total_day_minutes         3333 non-null float64
total_day_calls           3333 non-null int64
total_day_charge          3333 non-null float64
total_eve_minutes         3333 non-null float64
total_eve_calls           3333 non-null int64
total_eve_charge          3333 non-null float64
total_night_minutes       3333 non-null float64
total_night_calls         3333 non-null int64
total_night_charge        3333 non-null float64
total_intl_minutes        3333 non-null float64
total_intl_calls          3333 non-null int64
total_intl_charge         3333 non-null float64

In [6]:
# add column for totals
# df['total_minutes'] = df['total_day_minutes'] + df['total_eve_minutes'] + df['total_night_minutes']
# df['total_calls'] = df['total_day_calls'] + df['total_eve_calls'] + df['total_night_calls']
# df['total_charge'] = df['total_day_charge'] + df['total_eve_charge'] + df['total_night_charge']
df.head()

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn,state_id
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,16.78,244.7,91,11.01,10.0,3,2.7,1,0,16
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,16.62,254.4,103,11.45,13.7,3,3.7,1,0,35
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,10.3,162.6,104,7.32,12.2,5,3.29,0,0,31
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,0,35
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,12.61,186.9,121,8.41,10.1,3,2.73,3,0,36


In [7]:
# Logistic Regression Model - Statsmodels
import statsmodels.api as sm

In [8]:
y = df['churn']
X = df.drop(columns=['churn', 'phone_number', 'state'], axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [10]:
X = sm.tools.add_constant(X)
logit_model = sm.Logit(y, X)
result = logit_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.323797
         Iterations 7


0,1,2,3
Dep. Variable:,churn,No. Observations:,3333.0
Model:,Logit,Df Residuals:,3313.0
Method:,MLE,Df Model:,19.0
Date:,"Sun, 07 Feb 2021",Pseudo R-squ.:,0.2175
Time:,20:43:53,Log-Likelihood:,-1079.2
converged:,True,LL-Null:,-1379.1
Covariance Type:,nonrobust,LLR p-value:,5.4040000000000005e-115

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-8.4723,0.929,-9.117,0.000,-10.294,-6.651
account_length,0.0008,0.001,0.594,0.553,-0.002,0.004
area_code,-0.0005,0.001,-0.370,0.712,-0.003,0.002
international_plan,2.0460,0.146,14.036,0.000,1.760,2.332
voice_mail_plan,-2.0160,0.575,-3.507,0.000,-3.143,-0.889
number_vmail_messages,0.0356,0.018,1.976,0.048,0.000,0.071
total_day_minutes,-0.2382,3.276,-0.073,0.942,-6.658,6.182
total_day_calls,0.0032,0.003,1.154,0.249,-0.002,0.009
total_day_charge,1.4772,19.268,0.077,0.939,-36.288,39.243


In [11]:
relevant_columns = ['international_plan', 'voice_mail_plan', 'number_vmail_messages', 'total_intl_calls', 'customer_service_calls', 'churn']
dummy_dataframe = pd.get_dummies(df[relevant_columns], drop_first=True, dtype=float)

dummy_dataframe = dummy_dataframe.dropna()

y = dummy_dataframe['churn']
X = dummy_dataframe.drop(columns=['churn'], axis=1)

X = sm.tools.add_constant(X)
logit_model = sm.Logit(y, X)
result = logit_model.fit()

result.summary()

Optimization terminated successfully.
         Current function value: 0.357354
         Iterations 7


0,1,2,3
Dep. Variable:,churn,No. Observations:,3333.0
Model:,Logit,Df Residuals:,3327.0
Method:,MLE,Df Model:,5.0
Date:,"Sun, 07 Feb 2021",Pseudo R-squ.:,0.1364
Time:,20:43:53,Log-Likelihood:,-1191.1
converged:,True,LL-Null:,-1379.1
Covariance Type:,nonrobust,LLR p-value:,4.042e-79

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3652,0.139,-16.967,0.000,-2.638,-2.092
international_plan,1.9657,0.136,14.460,0.000,1.699,2.232
voice_mail_plan,-1.8059,0.534,-3.381,0.001,-2.853,-0.759
number_vmail_messages,0.0323,0.017,1.924,0.054,-0.001,0.065
total_intl_calls,-0.0754,0.023,-3.214,0.001,-0.121,-0.029
customer_service_calls,0.4418,0.037,12.090,0.000,0.370,0.513


In [12]:
# Logistic Regression Model - SKlearn
from sklearn.linear_model import LogisticRegression

In [13]:
# Instantiate the model
logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')

# Fit the model
logreg.fit(X_train, y_train)

LogisticRegression(C=1000000000000.0, fit_intercept=False, solver='liblinear')

In [14]:
# Generate predictions
y_hat_train = logreg.predict(X_train)
y_hat_test = logreg.predict(X_test)

In [15]:
residuals = np.abs(y_train - y_hat_train)
print(pd.Series(residuals).value_counts())
print('------------------------------------')
print(pd.Series(residuals).value_counts(normalize=True))

0    2300
1     366
Name: churn, dtype: int64
------------------------------------
0    0.862716
1    0.137284
Name: churn, dtype: float64


In [16]:
residuals = np.abs(y_test - y_hat_test)
print(pd.Series(residuals).value_counts())
print('------------------------------------')
print(pd.Series(residuals).value_counts(normalize=True))

0    569
1     98
Name: churn, dtype: int64
------------------------------------
0    0.853073
1    0.146927
Name: churn, dtype: float64


In [17]:
# Basic decision tree

In [18]:
# Train a DT classifier
classifier = DecisionTreeClassifier(random_state=10)  
classifier.fit(X_train, y_train) 

DecisionTreeClassifier(random_state=10)

In [19]:
# Make predictions for test data
y_pred = classifier.predict(X_test) 

In [20]:
# Calculate accuracy 
acc = accuracy_score(y_test,y_pred) * 100
print('Accuracy is :{0}'.format(acc))

# Check the AUC for predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is :{0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix 
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy is :92.05397301349325

AUC is :0.85

Confusion Matrix
----------------


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,538,28,566
1,25,76,101
All,563,104,667


In [21]:
# # Alternative confusion matrix
# from sklearn.metrics import plot_confusion_matrix

# plot_confusion_matrix(classifier, X, y, values_format='.3g')
# plt.show()

In [22]:
# Instantiate and fit a DecisionTreeClassifier
classifier_2 = DecisionTreeClassifier(random_state=10, criterion='entropy')  
classifier_2.fit(X_train, y_train)

# Make predictions for test data
y_pred = classifier_2.predict(X_test) 

In [23]:
# Calculate accuracy 
acc = accuracy_score(y_test,y_pred) * 100
print('Accuracy is :{0}'.format(acc))

# Check the AUC for predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is :{0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix 
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Accuracy is :91.00449775112443

AUC is :0.83

Confusion Matrix
----------------


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,535,31,566
1,29,72,101
All,564,103,667


In [24]:
# XG Boost

In [25]:
# Instantiate XGBClassifier
clf = XGBClassifier()

# Fit XGBClassifier
clf.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 97.04%
Validation accuracy: 94.9%


In [26]:
param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [6],
    'min_child_weight': [1, 2],
    'subsample': [0.5, 0.7],
    'n_estimators': [100]
}

In [27]:
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
test_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('')
print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 1
n_estimators: 100
subsample: 0.7

Training Accuracy: 98.8%
Validation accuracy: 95.8%
