Working at a telecom company that offers phone and internet services, and we have a problem: some of our customers are churning. They no longer
are using our services and are going to a different provider. To prevent that from happening, we develop a system for identifying these customers and offer
them an incentive to stay.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('Telco-Customer-Churn.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [3]:
data.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [4]:
#Change to numeric and fill empty cells with 0
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')

In [5]:
data['TotalCharges'] = data.TotalCharges.fillna(0)
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
#change column name to lower case and remove any spaces in the data
data.columns = data.columns.str.lower().str.replace(' ', '_')
string_columns = list(data.dtypes[data.dtypes == 'object'].index)
for col in string_columns:
    data[col] = data[col].str.lower().replace(' ', '_')
string_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [64]:
#Convert the churn column from categorical to numerical
data['churn'] = data['churn'].astype('category')
data['churn'] = data['churn'].cat.codes
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no phone service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one year,no,mailed check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no phone service,dsl,yes,...,yes,yes,no,no,one year,no,bank transfer (automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber optic,no,...,no,no,no,no,month-to-month,yes,electronic check,70.7,151.65,1


In [65]:
#Split the data into three: training set, validation set and test set
from sklearn.model_selection import train_test_split
data_train_val, data_test = train_test_split(data, test_size = 0.2, random_state = 10)
data_train, data_validation = train_test_split(data_train_val, test_size = 0.15, random_state = 19)
data_validation

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
2381,7649-phjvr,male,0,no,no,11,yes,no,no,no internet service,...,no internet service,no internet service,no internet service,no internet service,one year,no,mailed check,19.50,226.80,0
1794,5377-ndtou,female,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,two year,no,mailed check,91.05,6293.75,0
6686,4670-tabxh,male,0,yes,no,72,yes,yes,fiber optic,no,...,yes,yes,yes,yes,two year,yes,credit card (automatic),109.75,7758.90,0
2085,8290-ywkhz,female,1,yes,no,32,yes,yes,fiber optic,no,...,no,no,yes,yes,month-to-month,yes,electronic check,93.20,2931.00,1
1882,5832-exgtt,male,0,yes,yes,29,yes,no,dsl,yes,...,no,no,no,yes,month-to-month,yes,mailed check,60.20,1834.15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6194,2868-llskm,female,0,yes,yes,68,yes,yes,fiber optic,no,...,no,yes,no,no,one year,yes,bank transfer (automatic),83.65,5733.40,0
1823,8496-ejaui,male,0,no,no,19,yes,no,fiber optic,no,...,no,no,no,no,month-to-month,yes,credit card (automatic),73.85,1424.50,1
3958,4652-nnhny,male,0,yes,no,16,yes,yes,fiber optic,no,...,no,no,no,no,month-to-month,yes,bank transfer (automatic),73.85,1284.20,1
4784,1622-hshsf,female,0,no,no,1,yes,no,no,no internet service,...,no internet service,no internet service,no internet service,no internet service,month-to-month,yes,mailed check,19.55,19.55,0


In [71]:
y_train = data_train['churn']
y_val = data_validation.churn.values
y_test = data_test.churn.values

x_train = data_train.drop(columns = 'churn')
x_val = data_validation.drop(columns = 'churn')
x_test = data_test.drop(columns = 'churn')

In [72]:
y_train

6348    0
6243    0
5381    1
81      0
5751    0
       ..
2436    0
5730    0
4155    0
535     0
388     0
Name: churn, Length: 4788, dtype: int8

In [73]:
# Check the distribution of values in the target variable
data_train_val.churn.value_counts()

churn
0    4108
1    1526
Name: count, dtype: int64

In [74]:
#The dataset is imbalanced
global_mean = round(data_train_val.churn.mean(), 3)
global_mean
#Approximately 27% of the customers stopped using our services

0.271

In [75]:
categorical_columns = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice',
 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract',
 'paperlessbilling', 'paymentmethod']
numerical_columns = ['tenure', 'monthlycharges', 'totalcharges']
#Check how many unique variables each column has
data_train_val[categorical_columns].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [76]:
#Computing individual groups churn rate
female_mean = data_train_val[data_train_val.gender == 'female'].churn.mean()
male_mean = data_train_val[data_train_val.gender == 'male'].churn.mean()
partner_mean = data_train_val[data_train_val.partner == 'yes'].churn.mean()
no_partner_mean = data_train_val[data_train_val.partner == 'no'].churn.mean()
partner_mean

0.19911991199119913

In [77]:
#Calculating risk ratio for all the categorical variables
from IPython.display import display
for col in categorical_columns:
    data_group = data_train_val.groupby(by=col).churn.agg(['mean'])
    data_group['diff'] = data_group['mean'] - global_mean
    data_group['rate'] = data_group['mean'] / global_mean
    display(data_group)

# If the risk is lower than 1, the group has lower risks the churn rate in this group is smaller than the global churn
# if the value is higher than 1, the group is risky: there’s more churn in the group than in the population

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.27409,0.00309,1.011402
male,0.267655,-0.003345,0.987658


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24263,-0.02837,0.895313
1,0.415669,0.144669,1.533835


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338149,0.067149,1.247783
yes,0.19912,-0.07188,0.73476


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.318992,0.047992,1.177092
yes,0.160023,-0.110977,0.590492


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.239927,-0.031073,0.885339
yes,0.274175,0.003175,1.011714


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.256533,-0.014467,0.946616
no phone service,0.239927,-0.031073,0.885339
yes,0.294391,0.023391,1.086312


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.078653,0.709769
fiber optic,0.42903,0.15803,1.583135
no,0.069652,-0.201348,0.257017


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.426954,0.155954,1.575477
no internet service,0.069652,-0.201348,0.257017
yes,0.146783,-0.124217,0.541636


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.408377,0.137377,1.506926
no internet service,0.069652,-0.201348,0.257017
yes,0.220051,-0.050949,0.811998


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.403885,0.132885,1.490351
no internet service,0.069652,-0.201348,0.257017
yes,0.226878,-0.044122,0.837188


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.42434,0.15334,1.565829
no internet service,0.069652,-0.201348,0.257017
yes,0.155597,-0.115403,0.574157


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.348,0.077,1.284133
no internet service,0.069652,-0.201348,0.257017
yes,0.302571,0.031571,1.116499


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.343416,0.072416,1.267219
no internet service,0.069652,-0.201348,0.257017
yes,0.307339,0.036339,1.134094


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.433109,0.162109,1.598187
one year,0.1135,-0.1575,0.418818
two year,0.032496,-0.238504,0.119913


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16382,-0.10718,0.6045
yes,0.343778,0.072778,1.268553


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank transfer (automatic),0.169082,-0.101918,0.623919
credit card (automatic),0.153908,-0.117092,0.567927
electronic check,0.468816,0.197816,1.729949
mailed check,0.189039,-0.081961,0.697561


In [78]:
#Calculating the Mutual information
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    return mutual_info_score(series, data_train_val.churn)

data_mi = data_train_val[categorical_columns].apply(calculate_mi) #Apply the calculate mi function
data_mi = data_mi.sort_values(ascending=False).to_frame(name= 'MI') #Order the values and convert into a dataframe
data_mi

Unnamed: 0,MI
contract,0.098122
onlinesecurity,0.069055
techsupport,0.066233
internetservice,0.059577
onlinebackup,0.050043
paymentmethod,0.048881
deviceprotection,0.048156
streamingtv,0.034956
streamingmovies,0.034614
paperlessbilling,0.020653


In [79]:
#Calculating correlation between the numerical columns to the churn column
data_train_val[numerical_columns].corrwith(data_train_val.churn)


tenure           -0.351325
monthlycharges    0.201568
totalcharges     -0.198116
dtype: float64

In [80]:
# convert our dataframe to a list of dictionaries
train_dict = data_train[categorical_columns + numerical_columns].to_dict(orient='records')

In [81]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [82]:
# After we fit the vectorizer, we can use it for converting the dictionaries to a matrix
data_x_train = dv.transform(train_dict)
data_x_train[0]

array([1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       0.00000e+00, 1.00000e+00, 0.00000e+00, 6.90500e+01, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 2.60000e+01, 1.81565e+03])

In [83]:
y_train

6348    0
6243    0
5381    1
81      0
5751    0
       ..
2436    0
5730    0
4155    0
535     0
388     0
Name: churn, Length: 4788, dtype: int8

In [84]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver = 'liblinear', random_state = 1)
model.fit(data_x_train, y_train)

In [85]:
#apply it to our validation data to obtain the probability of churn for each customer in the validation dataset
val_dict = data_validation[categorical_columns + numerical_columns].to_dict(orient = 'records')
x_val =dv.transform(val_dict)

In [88]:
y_val_pred = model.predict_proba(x_val)[:, 1]

In [89]:
#The probability of the customers churning are
y_val_pred

array([0.06571231, 0.00978069, 0.07417945, 0.67201704, 0.15282285,
       0.697387  , 0.82405894, 0.00246452, 0.57070479, 0.07127316,
       0.01327915, 0.09797799, 0.03821318, 0.12816434, 0.31566612,
       0.39067696, 0.16576885, 0.40489415, 0.00276994, 0.06043319,
       0.02006946, 0.00882278, 0.70327067, 0.15668029, 0.00342828,
       0.12569598, 0.00343576, 0.19154851, 0.10171624, 0.80833268,
       0.70330417, 0.44763325, 0.28179035, 0.40732499, 0.43058469,
       0.75751223, 0.58228802, 0.23731764, 0.37547972, 0.26311624,
       0.61665938, 0.0227931 , 0.02684768, 0.04411192, 0.33604023,
       0.00595603, 0.14617455, 0.46262788, 0.45228374, 0.06124371,
       0.2402789 , 0.4116964 , 0.15345968, 0.49742105, 0.04712372,
       0.83996858, 0.01044168, 0.31916278, 0.60293836, 0.01587708,
       0.49117253, 0.00621995, 0.04580148, 0.03338045, 0.10959153,
       0.07307541, 0.00357417, 0.3078975 , 0.34919657, 0.00914481,
       0.00761313, 0.68680933, 0.60043728, 0.00287211, 0.73711

In [94]:
#If the element is greater than 0.5 or equal to 0.5, the corresponding element in the output array is True, and otherwise,
# it’s False
churn = y_val_pred >= 0.5
accuracy = (y_val == churn).mean()
print("The model's accuracy is: ", round(accuracy * 100, 2), "%")

The model's accuracy is:  78.25 %


In [95]:
# prdict the churn rate of a customer to test the model
customer = {
'customerid': '8879-zkjof',
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75,
}
customer_test = dv.transform([customer])
customer_test

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])

In [96]:
model.predict_proba(customer_test)[0, 1]

0.07836959247353878

The probability is 7.83% We will not be sending the customer an email