In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [96]:
# Load the raw churn classification dataset from a path relative to the
# notebook's working directory.

df = pd.read_csv(filepath_or_buffer='data/classification_dataset.csv')




In [5]:
# transpose the dataframe to see the whole data at once
# df.head().T

In [97]:
# data cleaning
# Normalise column names: lower-case, spaces -> underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Detect text columns with select_dtypes so that both classic `object`
# columns and pandas' dedicated string dtype are picked up. Comparing
# df.dtypes to 'object' alone skips string-dtype columns and would silently
# leave their values un-normalised.
categorical_cols = list(df.select_dtypes(include=['object', 'string']).columns)

# Apply the same normalisation to the values of every text column.
for col in categorical_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    



In [7]:
# Preview of the numeric conversion: entries that cannot be parsed become NaN
# (errors='coerce'); df itself is not modified here.
tc = pd.to_numeric(df['totalcharges'], errors='coerce')

In [8]:

# Store totalcharges as a numeric column; unparseable entries are coerced to NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')


In [93]:
# Replace the missing totalcharges values with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

# Inspect the target column.
df['churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int64

In [10]:

# replace target variable with binary values (1 = 'yes', 0 = anything else)
# The conversion only runs while churn still holds text. Once the column is
# numeric, comparing it with 'yes' again would turn every label into 0, so
# re-running this cell (or running cells out of order) must be a no-op.
if not pd.api.types.is_numeric_dtype(df.churn):
    df.churn = (df.churn == 'yes').astype(int)



In [11]:
# SETTING UP THE VALIDATION FRAMEWORK

# perform the train-test split with scikit-learn
# what is scikit-learn? an open-source Python machine-learning library
from sklearn.model_selection import train_test_split




In [12]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.

df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

(len(df_full_train), len(df_test))

(5634, 1409)

In [13]:
# Carve the validation set out of the full-train portion: 25% of the
# remaining 80% equals 20% of the original data.

df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

(len(df_train), len(df_val), len(df_test))


(4225, 1409, 1409)

In [14]:
# Discard the shuffled original row labels so each split is indexed from 0.

df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [15]:
# Pull the target (churn) out of each split as an array.
y_train, y_val, y_test = (
    split['churn'].values for split in (df_train, df_val, df_test)
)

In [16]:
# Remove the target from the feature frames (in place) so it cannot be used
# as an input feature.

df_train.drop(columns=['churn'], inplace=True)
df_val.drop(columns=['churn'], inplace=True)
df_test.drop(columns=['churn'], inplace=True)


In [17]:
# EDA

# checking missing values
# look at the target variable (churn) distribution
# look at numerical and categorical variables






In [18]:
# Renumber df_full_train from 0, dropping the shuffled row labels left by the split.
df_full_train = df_full_train.reset_index(drop=True)



In [19]:
# Count the missing values in each column.

df_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [20]:
# churn rate : the rate at which customers leave a service
# normalize=True reports each class as a share of rows instead of a raw count.

df_full_train['churn'].value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [21]:
# churn is encoded as 0/1, so its mean is the overall churn rate.
global_churn_rate = df_full_train['churn'].mean()
round(global_churn_rate, 2)

0.27

In [22]:
# Column dtypes, used to tell numerical features from categorical ones.
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [23]:
# Numerical feature columns.

numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]


In [24]:
# List every column name so the categorical feature list can be assembled.
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [25]:
# Categorical feature columns: every column except the customer id, the
# numerical features and the churn target.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [26]:
# Number of distinct values in each categorical feature.
df_full_train.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [27]:
# Number of distinct values in each numerical feature.
df_full_train.loc[:, numerical].nunique()

tenure              73
monthlycharges    1494
totalcharges      5291
dtype: int64

In [28]:
# FEATURE IMPORTANCE: CHURN RATE AND RISK RATIO

# CHURN RATE
# RISK  RATIO
# MUTUAL INFORMATION





In [29]:
# CHURN RATE

# Mean of the 0/1 churn flag among rows whose gender is 'female'.
is_female = df_full_train['gender'] == 'female'
churn_female = df_full_train[is_female]['churn'].mean()

churn_female

0.27682403433476394

In [30]:
# Churn rate among rows whose gender is 'male'.
churn_male = df_full_train.loc[df_full_train['gender'] == 'male', 'churn'].mean()

churn_male

0.2632135306553911

In [31]:
# Churn rate among customers with a partner.
churn_partner = df_full_train.loc[df_full_train['partner'] == 'yes', 'churn'].mean()

churn_partner

0.20503330866025166

In [32]:
# Churn rate among customers without a partner.
churn_no_partner = df_full_train.loc[df_full_train['partner'] == 'no', 'churn'].mean()
churn_no_partner

0.3298090040927694

In [33]:
# Overall churn rate across the full training data; the baseline that each
# group's rate is compared against.
global_churn = df_full_train['churn'].mean()

global_churn

0.26996805111821087

In [34]:
# Gap between the overall churn rate and each group's churn rate.
# NOTE: computed as (global - group), so a NEGATIVE value means the group
# churns more than average. This is the opposite sign of the `diff` column
# in the per-feature groupby tables further down (group - global).
churn_female_difference = global_churn - churn_female
churn_male_difference = global_churn - churn_male

# partner
churn_partner_difference = global_churn - churn_partner
churn_no_partner_difference = global_churn - churn_no_partner

# One value per line: gender first, then partner.
print(
    churn_female_difference,
    churn_male_difference,
    churn_partner_difference,
    churn_no_partner_difference,
    sep='\n',
)

-0.006855983216553063
0.006754520462819769
0.06493474245795922
-0.05984095297455855


In [35]:
# RISK RATIO

# Group churn rate divided by the overall churn rate, here for customers
# without a partner. A value above 1 means the group churns more than average.
churn_no_partner / global_churn

1.2216593879412643

In [36]:
# Risk ratio for customers with a partner. A value below 1 means the group
# churns less than average.
churn_partner / global_churn

0.7594724924338315

In [37]:
from IPython.display import display

In [38]:
# Computing the risk ratio by hand for every category does not scale, so we
# lean on the SQL-like groupby functionality of pandas. For a single column
# the equivalent query would be:
#
# SELECT gender,
#        AVG(churn),
#        COUNT(*),
#        AVG(churn) - global_churn AS diff,
#        AVG(churn) / global_churn AS risk_ratio
# FROM data GROUP BY gender
#
# diff > 0 (risk_ratio > 1) means the group churns more than average.

for col in categorical:
    # Churn rate and group size for every value of this feature.
    df_group = df_full_train.groupby(col).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk_ratio'] = df_group['mean'] / global_churn
    print(f'\nColumn: {col}')
    display(df_group)





Column: gender


Unnamed: 0_level_0,mean,count,diff,risk_ratio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498



Column: seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk_ratio
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208



Column: partner


Unnamed: 0_level_0,mean,count,diff,risk_ratio
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472



Column: dependents


Unnamed: 0_level_0,mean,count,diff,risk_ratio
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651



Column: phoneservice


Unnamed: 0_level_0,mean,count,diff,risk_ratio
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412



Column: multiplelines


Unnamed: 0_level_0,mean,count,diff,risk_ratio
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948



Column: internetservice


Unnamed: 0_level_0,mean,count,diff,risk_ratio
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201



Column: onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk_ratio
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757



Column: onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk_ratio
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466



Column: deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk_ratio
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348



Column: techsupport


Unnamed: 0_level_0,mean,count,diff,risk_ratio
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239



Column: streamingtv


Unnamed: 0_level_0,mean,count,diff,risk_ratio
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328



Column: streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk_ratio
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182



Column: contract


Unnamed: 0_level_0,mean,count,diff,risk_ratio
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473



Column: paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk_ratio
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256



Column: paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk_ratio
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121


In [73]:
# Feature importance : Mutual Information

# concept from information theory that tells us how much information about one variable can be obtained by observing another variable

from sklearn.metrics import mutual_info_score

# mutual_info_score computes the mutual information between two discrete
# variables; here the churn labels are compared with the contract type.

mutual_info_score(df_full_train['churn'], df_full_train['contract'])

0.0983203874041556

In [74]:
def mutual_info_churn_score(series, target=None):
    """Return the mutual information between ``series`` and the churn labels.

    Parameters
    ----------
    series : array-like
        Discrete feature values, one per row.
    target : array-like, optional
        Labels to score against. Defaults to ``df_full_train.churn``, looked
        up when the function is called, so ``DataFrame.apply`` can still pass
        this function with a single argument.

    Returns
    -------
    float
        The value returned by ``mutual_info_score(series, target)``.
    """
    if target is None:
        target = df_full_train.churn
    return mutual_info_score(series, target)

In [75]:
# Rank the categorical features by their mutual information with churn,
# highest first.
(
    df_full_train[categorical]
    .apply(mutual_info_churn_score)
    .sort_values(ascending=False)
)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

In [76]:
# Feature correlation 
# Mutual information : a way to measure importance of categorical features
# correlation : a way to measure importance of numerical features

# what is correlation?

# A way to measure importance of numerical features is to calculate their correlation with the target variable



In [77]:
# Correlation of each numerical feature with the churn target; the sign
# shows the direction of the relationship.
df_full_train.loc[:, numerical].corrwith(df_full_train['churn'])

# df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [78]:
# one-hot encoding
# use scikit-learn's DictVectorizer to encode categorical variables


from sklearn.feature_extraction import DictVectorizer

In [None]:
# One dict per training row (column name -> value), restricted to the
# selected categorical and numerical features.
train_dicts = df_train.loc[:, categorical + numerical].to_dict(orient='records')


In [None]:


# Vectorizer that turns the row dicts into a feature matrix; sparse=False
# requests a dense array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)

In [89]:
# Fit the vectorizer on the training records and encode them into the
# training feature matrix in one step.
# get feature names: dv.get_feature_names_out()
x_train = dv.fit_transform(train_dicts)




array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        1.20000e+01, 2.58350e+02],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        4.20000e+01, 3.16055e+03],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.10000e+01, 4.68175e+03],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.80000e+01, 2.97950e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.00000e+00, 1.14100e+02],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        1.60000e+01, 1.11485e+03]])

In [None]:
# Encode the validation rows with the vectorizer created above (transform
# only, so the feature mapping is not re-fitted here).
val_dicts = df_val.loc[:, categorical + numerical].to_dict(orient='records')
x_val = dv.transform(val_dicts)


In [None]:
# test_dicts = df_test[categorical + numerical].to_dict(orient='records')
# x_test = dv.transform(test_dicts)