In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from IPython.display import display

%matplotlib inline
sns.set(rc={'figure.figsize': (15,8)}) # Size all seaborn plots from the get go

In [98]:
# Load the Telco customer churn CSV (expected in the working directory) into a DataFrame
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [99]:
# Transpose the first five rows so that every column is shown as a row
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [100]:
# Number of rows in the raw dataset
len(df)

7043

In [101]:
# Inspect the column dtypes as loaded; note that TotalCharges comes in as object rather than a number
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [102]:
# Parse TotalCharges as a number: entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [103]:
# Normalise the column headers: lower-case, spaces replaced by underscores.
normalised_headers = df.columns.str.lower().str.replace(" ", "_")
df.columns = normalised_headers

# Collect the names of all object-dtype (text) columns.
is_object_dtype = df.dtypes == 'object'
string_columns = df.dtypes[is_object_dtype].index.tolist()
print(string_columns)

['customerid', 'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'churn']


In [104]:
# Apply the same normalisation to the values of every text column:
# lower-case, spaces replaced by underscores.
df[string_columns] = df[string_columns].apply(
    lambda text_col: text_col.str.lower().str.replace(" ", "_")
)

In [105]:
# Our dependent variable is categorical - switch to numerical
# Yes/No --> Boolean --> cast to int (yes --> True = 1, no --> False = 0)
# Guard on the dtype so this cell is safe to re-run: once churn is already
# integer-encoded, comparing it to 'yes' again would silently set every label to 0.
if not pd.api.types.is_integer_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype(int)

In [106]:
# Hold out 20% of the rows as the test set; the rest is the "full" training set.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Split the full training set again: 33% of it becomes the validation set.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

# Keep the targets as plain NumPy arrays rather than pandas Series.
y_train, y_val = df_train['churn'].to_numpy(), df_val['churn'].to_numpy()

In [107]:
# Count missing values per column - there are none that we have to deal with.
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [108]:
# What proportion of the full training set (everything except the test rows) churned?

prop = df_train_full.value_counts(['churn'], normalize=True)
print(prop)

# churn == 1 marks the customers who churned, so about 26.55% churned and 73.45% did not.
# The classes are imbalanced: just under 3 non-churners for every churner.

churn
0        0.734469
1        0.265531
dtype: float64


In [109]:
# Re-check the dtypes after cleaning: totalcharges is now float64 and churn is int64
df.dtypes


customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [110]:
# Features treated as categorical (seniorcitizen is stored as 0/1 but is a category).
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Features treated as continuous numbers.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [111]:
# Sanity check that we haven't mis-categorised anything: each categorical
# feature should only have a handful of distinct values.
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance Analysis

In [112]:
# Because churn is 0/1, its mean is the overall churn rate of the full training set.
overall_churn_mean = df_train_full['churn'].mean()
print(overall_churn_mean)

0.2655307064252751


In [113]:
# Churn rate by gender: select the churn column for each gender and average it.

female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
print(female_mean)

male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
print(male_mean)

0.2708409173643975
0.26047800484932454


26.55% is our average churn rate across the train_full data subset - for females it is 27.08% and for males it is 26.05%. Both are within roughly half a percentage point of the overall rate, so gender on its own does not appear to contribute much to churn.

In [114]:
# Churn rate by partner status: select the churn column for each group and average it.

partner_yes_mean = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean()
print(partner_yes_mean)

partner_no_mean = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean()
print(partner_no_mean)

0.20073260073260074
0.32644628099173556


About 20% of those with a partner churn, whereas 32.64% of those without a partner churn - customers who are single are more likely to churn. Next we can quantify the risk of a single person churning relative to the overall churn rate.

In [115]:
# Risk of a negative outcome in a given group
#   = rate of the negative outcome in that group / rate of the negative outcome overall
#   less than 1    -> low risk (the group churns less than average)
#   equal to 1     -> same risk as average
#   greater than 1 -> high risk (the group churns more than average)
risk_single_churning = partner_no_mean/overall_churn_mean

print(risk_single_churning)

1.2294106598311754


In [116]:
# Repeat the difference / risk-ratio calculation for every categorical feature.
# For each feature: churn rate per category value, its difference from the
# overall churn rate (Diff) and its ratio to the overall churn rate (Risk).

for col in categorical:
    df_group = (
        df_train_full
        .groupby(by=col)['churn']
        .agg(['mean'])
        .assign(
            Diff=lambda rates: rates['mean'] - overall_churn_mean,
            Risk=lambda rates: rates['mean'] / overall_churn_mean,
        )
    )
    display(df_group)


Unnamed: 0_level_0,mean,Diff,Risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.270841,0.00531,1.019998
male,0.260478,-0.005053,0.980971


Unnamed: 0_level_0,mean,Diff,Risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237098,-0.028433,0.892922
1,0.413907,0.148377,1.558793


Unnamed: 0_level_0,mean,Diff,Risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.326446,0.060916,1.229411
yes,0.200733,-0.064798,0.755968


Unnamed: 0_level_0,mean,Diff,Risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.312326,0.046795,1.176233
yes,0.155674,-0.109856,0.586276


Unnamed: 0_level_0,mean,Diff,Risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.253623,-0.011908,0.955156
yes,0.266824,0.001293,1.004871


Unnamed: 0_level_0,mean,Diff,Risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.251397,-0.014134,0.946771
no_phone_service,0.253623,-0.011908,0.955156
yes,0.284105,0.018574,1.069952


Unnamed: 0_level_0,mean,Diff,Risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191851,-0.073679,0.722521
fiber_optic,0.415558,0.150028,1.56501
no,0.076606,-0.188924,0.288502


Unnamed: 0_level_0,mean,Diff,Risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416014,0.150484,1.566727
no_internet_service,0.076606,-0.188924,0.288502
yes,0.145342,-0.120189,0.547363


Unnamed: 0_level_0,mean,Diff,Risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.398693,0.133162,1.501494
no_internet_service,0.076606,-0.188924,0.288502
yes,0.216531,-0.048999,0.815467


Unnamed: 0_level_0,mean,Diff,Risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.387706,0.122175,1.460117
no_internet_service,0.076606,-0.188924,0.288502
yes,0.226825,-0.038705,0.854234


Unnamed: 0_level_0,mean,Diff,Risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.413472,0.147941,1.557153
no_internet_service,0.076606,-0.188924,0.288502
yes,0.152855,-0.112676,0.575657


Unnamed: 0_level_0,mean,Diff,Risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.335418,0.069887,1.263197
no_internet_service,0.076606,-0.188924,0.288502
yes,0.298945,0.033415,1.125841


Unnamed: 0_level_0,mean,Diff,Risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.333333,0.067803,1.255348
no_internet_service,0.076606,-0.188924,0.288502
yes,0.30132,0.035789,1.134784


Unnamed: 0_level_0,mean,Diff,Risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.426533,0.161002,1.60634
one_year,0.117987,-0.147544,0.444343
two_year,0.028379,-0.237151,0.106878


Unnamed: 0_level_0,mean,Diff,Risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16414,-0.10139,0.618159
yes,0.33594,0.070409,1.265164


Unnamed: 0_level_0,mean,Diff,Risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.174475,-0.091056,0.65708
credit_card_(automatic),0.152404,-0.113126,0.573961
electronic_check,0.449921,0.18439,1.69442
mailed_check,0.190328,-0.075203,0.716782


We can use mutual information to drill further into the relationships between the categorical variables and the target variable, which then allows us to decide which variables to keep and which ones to disregard.

In [119]:
def calculate_mutual_information(series, target=None):
    """
    Return the mutual information between a series/column and a target variable.

    By default the target is the churn column of the full training set, so the
    function can still be passed directly to ``DataFrame.apply``. Pass ``target``
    explicitly to score against a different label vector (for example a
    different data split) without relying on the notebook-level global.

    :param series: A series/column from our dataframe
    :param target: Optional label values aligned with ``series``; defaults to
        ``df_train_full.churn`` when omitted
    :return: A mutual information score between the series and the target variable
    """
    if target is None:
        # Backward-compatible default: the churn labels of the full training set.
        target = df_train_full.churn
    return mutual_info_score(series, target)

In [118]:
# Score every categorical feature against churn, then rank the scores from
# most informative to least and present them as a one-column frame.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mutual_information)
    .sort_values(ascending=False)
    .to_frame(name='mutual_information')
)
df_mi

Unnamed: 0,mutual_information
contract,0.096652
onlinesecurity,0.063393
techsupport,0.060935
internetservice,0.053313
onlinebackup,0.045424
paymentmethod,0.042861
deviceprotection,0.042007
streamingtv,0.030844
streamingmovies,0.030705
paperlessbilling,0.019077


For the numerical (non-categorical) variables we can use Pearson's correlation coefficient:

- 0 --> No correlation
- 1 --> Fully positively correlated
- -1 --> Fully negatively correlated

In [120]:
# Correlation of each numerical feature with the 0/1 churn target
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.344925
monthlycharges    0.188574
totalcharges     -0.193370
dtype: float64