In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('Telco-Customer-Churn.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [3]:
#Change to the TotalCharges column numeric and fill empty cells with 0
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')
data['TotalCharges'] = data.TotalCharges.fillna(0)
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [4]:
#change column name to lower case and remove any spaces in the columns names
data.columns = data.columns.str.lower().str.replace(' ', '_')
string_columns = list(data.dtypes[data.dtypes == 'object'].index)
for col in string_columns:
    data[col] = data[col].str.lower().replace(' ', '_')
string_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [5]:
#Convert the churn column from categorical to numerical
data['churn'] = data['churn'].astype('category')
data['churn'] = data['churn'].cat.codes
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no phone service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one year,no,mailed check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no phone service,dsl,yes,...,yes,yes,no,no,one year,no,bank transfer (automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber optic,no,...,no,no,no,no,month-to-month,yes,electronic check,70.7,151.65,1


In [21]:
#Split the data into three: training set, validation set and test set
from sklearn.model_selection import train_test_split
data_train_val, data_test = train_test_split(data, test_size = 0.2, random_state = 10)

In [22]:
y_test = data_test.churn.values
x_test = data_test.drop(columns = 'churn')

In [23]:
# Check the distribution of values in the target variable
data_train_val.churn.value_counts()

churn
0    4108
1    1526
Name: count, dtype: int64

In [24]:
#The dataset is imbalanced
global_mean = round(data_train_val.churn.mean(), 3)
global_mean
#Approximately 27% of the customers stopped using our services

0.271

In [25]:
categorical_columns = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice',
 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract',
 'paperlessbilling', 'paymentmethod']
numerical_columns = ['tenure', 'monthlycharges', 'totalcharges']
#Check how many unique variables each column has
data_train_val[categorical_columns].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [26]:
#Computing individual groups churn rate
female_mean = data_train_val[data_train_val.gender == 'female'].churn.mean()
male_mean = data_train_val[data_train_val.gender == 'male'].churn.mean()
partner_mean = data_train_val[data_train_val.partner == 'yes'].churn.mean()
no_partner_mean = data_train_val[data_train_val.partner == 'no'].churn.mean()
partner_mean

0.19911991199119913

In [27]:
#Calculating risk ratio for all the categorical variables
from IPython.display import display
for col in categorical_columns:
    data_group = data_train_val.groupby(by=col).churn.agg(['mean'])
    data_group['diff'] = data_group['mean'] - global_mean
    data_group['rate'] = data_group['mean'] / global_mean
    display(data_group)

# If the risk is lower than 1, the group has lower risks the churn rate in this group is smaller than the global churn
# if the value is higher than 1, the group is risky: there’s more churn in the group than in the population

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.27409,0.00309,1.011402
male,0.267655,-0.003345,0.987658


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24263,-0.02837,0.895313
1,0.415669,0.144669,1.533835


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338149,0.067149,1.247783
yes,0.19912,-0.07188,0.73476


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.318992,0.047992,1.177092
yes,0.160023,-0.110977,0.590492


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.239927,-0.031073,0.885339
yes,0.274175,0.003175,1.011714


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.256533,-0.014467,0.946616
no phone service,0.239927,-0.031073,0.885339
yes,0.294391,0.023391,1.086312


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.078653,0.709769
fiber optic,0.42903,0.15803,1.583135
no,0.069652,-0.201348,0.257017


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.426954,0.155954,1.575477
no internet service,0.069652,-0.201348,0.257017
yes,0.146783,-0.124217,0.541636


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.408377,0.137377,1.506926
no internet service,0.069652,-0.201348,0.257017
yes,0.220051,-0.050949,0.811998


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.403885,0.132885,1.490351
no internet service,0.069652,-0.201348,0.257017
yes,0.226878,-0.044122,0.837188


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.42434,0.15334,1.565829
no internet service,0.069652,-0.201348,0.257017
yes,0.155597,-0.115403,0.574157


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.348,0.077,1.284133
no internet service,0.069652,-0.201348,0.257017
yes,0.302571,0.031571,1.116499


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.343416,0.072416,1.267219
no internet service,0.069652,-0.201348,0.257017
yes,0.307339,0.036339,1.134094


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.433109,0.162109,1.598187
one year,0.1135,-0.1575,0.418818
two year,0.032496,-0.238504,0.119913


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16382,-0.10718,0.6045
yes,0.343778,0.072778,1.268553


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank transfer (automatic),0.169082,-0.101918,0.623919
credit card (automatic),0.153908,-0.117092,0.567927
electronic check,0.468816,0.197816,1.729949
mailed check,0.189039,-0.081961,0.697561


In [28]:
#Calculating the Mutual information
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    return mutual_info_score(series, data_train_val.churn)

data_mi = data_train_val[categorical_columns].apply(calculate_mi) #Apply the calculate mi function
data_mi = data_mi.sort_values(ascending=False).to_frame(name= 'MI') #Order the values and convert into a dataframe
data_mi

Unnamed: 0,MI
contract,0.098122
onlinesecurity,0.069055
techsupport,0.066233
internetservice,0.059577
onlinebackup,0.050043
paymentmethod,0.048881
deviceprotection,0.048156
streamingtv,0.034956
streamingmovies,0.034614
paperlessbilling,0.020653


In [29]:
#Calculating correlation between the numerical columns to the churn column
data_train_val[numerical_columns].corrwith(data_train_val.churn)


tenure           -0.351325
monthlycharges    0.201568
totalcharges     -0.198116
dtype: float64

In [30]:
# convert our dataframe to a list of dictionaries
train_dict = data_train[categorical_columns + numerical_columns].to_dict(orient='records')

In [31]:
#Training the Model
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
def train(data, y, C):
    train_dict = data[categorical_columns + numerical_columns].to_dict(orient = 'records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    data_x_train = dv.transform(train_dict)
    model = LogisticRegression(solver = 'liblinear', C = 0.5)
    model.fit(data_x_train, y)
    return dv, model
    

In [36]:
#Applying the model to new data
def predict(data, dv, model):
    train_dict = data[categorical_columns + numerical_columns].to_dict(orient = 'records')
    data_x_train = dv.transform(train_dict)
    y_pred = model.predict_proba(data_x_train)[:, 1]
    return y_pred

#Tuning to select the best parameter C using cross validation
nfolds = 5
kfold = KFold(n_splits=nfolds, shuffle=True, random_state=1)
for C in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    aucs = []
    for train_idx, val_idx in kfold.split(data_train_val):
        data_train = data_train_val.iloc[train_idx]
        data_val = data_train_val.iloc[val_idx]
        y_train = data_train.churn.values
        y_val = data_val.churn.values
        dv, model = train(data_train, y_train, C = C)
        y_pred = predict(data_val, dv, model)
        auc = roc_auc_score(y_val, y_pred)
        aucs.append(auc)
        print('C=%s, auc = %0.3f ± %0.3f' % (C, np.mean(aucs), np.std(aucs)))


In [38]:
#Kfold cross validation
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
aucs = []
for train_idx, val_idx in kfold.split(data_train_val):
    data_train = data_train_val.iloc[train_idx]
    data_val = data_train_val.iloc[val_idx]
    y_train = data_train.churn.values
    y_val = data_val.churn.values
    dv, model = train(data_train, y_train, C = 0.5)
    y_pred = predict(data_val, dv, model)
    auc = roc_auc_score(y_val, y_pred)
    aucs.append(auc)
    print('auc = %.3f' % auc)

auc = 0.822
auc = 0.868
auc = 0.827
auc = 0.849
auc = 0.849
auc = 0.821
auc = 0.856
auc = 0.854
auc = 0.844
auc = 0.856


In [42]:
#Testing the model on the test data
y_train = data_train_val.churn.values
t_test = data_test.churn.values
dv, model = train(data_train_val, y_train, C = 0.5)
y_pred = predict(data_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print('auc = %.3f' %auc)
#An 84.3 % accuracy, much better than the previous model

auc = 0.843


In [46]:
#To predict a single customer
def predict_one_customer(customer, dv, model):
    x = dv.transform([customer])
    y_pred = model.predict_proba(x)[:, 1]
    return y_pred[0]

In [47]:
customer = {
'customerid': '8879-zkjof',
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75,
}
predict_one_customer(customer, dv, model)

0.08029645942332791

8 % probability of churning

In [49]:
#Use pickle to save the module
import pickle
with open('churn-model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out) #Save the dictvectorizer and the model as a tuple