In [2]:
#data set is downloaded from 
#https://www.kaggle.com/datasets/blastchar/telco-customer-churn

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
df=pd.read_csv('data.csv')

In [5]:
len(df)

7043

In [6]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
#we can transpose the dataframe using T function so we can see lot more data
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [8]:
#checking if pandas has actually inferred the values correctly or not
#an object means a string value
df.dtypes


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [9]:
#total charges is given an object type but its float value
#the reason is that in some cases, this column contains a space (" ") to represent a missing value
#when pandas comes across nonnumeric characters, it has no option to but give it object type

In [10]:
#we can force this column to be numeric by converting it to number
#using to_numeric()
total_charges=pd.to_numeric(df.TotalCharges,errors='coerce')


df[total_charges.isnull()][['customerID','TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [11]:
#we set the missing values to zero
df.TotalCharges=df.TotalCharges.fillna(0)


In [12]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [13]:
df.columns=df.columns.str.lower().str.replace(' ','_')

string_columns=list(df.dtypes[df.dtypes=='object'].index)

for col in string_columns:
    df[col]=df[col].str.lower().str.replace(' ','_')

In [14]:
df.churn=(df.churn=='yes').astype(int)

In [15]:
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int32

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df_train_full,df_test=train_test_split(df,test_size=0.2,random_state=1)

In [18]:
df_train_full.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1814,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
5946,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
3881,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
2389,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
3676,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [19]:
#Because train_test_split splits a dataset into only two parts, we perform 
# the split two times because we need three parts. First, we split the entire dataset into full 
# train and test, and then we split full train into train and validation

In [20]:
df_train,df_val=train_test_split(df_train_full,test_size=0.33,random_state=11)
#in the above line of code, random_state sets the random seed when doing the split to make sure that everytime
#we run the code, the result is the same

#the next two lines of code, takes the columns with the target variable
y_train=df_train.churn.values
y_val=df_val.churn.values

# Deletes the churn columns from both dataframes to 
# make sure we don’t accidentally use the churn variable 
# as a feature during training
del df_train['churn']
del df_val['churn']

In [21]:
#Lets check if we need to perform any additional null handling
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [22]:
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [23]:
global_mean=df_train_full.churn.mean()
round(global_mean,4)

0.27

In [24]:
#Lets create two categories of data, numerican and categorical
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
 'phoneservice', 'multiplelines', 'internetservice',
 'onlinesecurity', 'onlinebackup', 'deviceprotection',
 'techsupport', 'streamingtv', 'streamingmovies',
 'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [25]:
#How many unique values each variable has
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [26]:
#Feature important
#knowing how other variables affect the target variable
#this process of finding out is known as feature importance analysis

female_mean=df_train_full[df_train_full.gender=='female'].churn.mean()
male_mean=df_train_full[df_train_full.gender=='male'].churn.mean()

print('gender==female:',round(female_mean,3))
print('gender==male:',round(male_mean,3))

gender==female: 0.277
gender==male: 0.263


In [27]:
partner_yes=df_train_full[df_train_full.partner=='yes'].churn.mean()
partner_no=df_train_full[df_train_full.partner=='no'].churn.mean()

print('partner==yes',round(partner_yes,3))
print('partner==no',round(partner_no,3))

partner==yes 0.205
partner==no 0.33


In [28]:
#Calculating the risk of churning
global_mean=df_train_full.churn.mean()
df_group=df_train_full.groupby(by='gender').churn.agg(['mean'])
df_group['diff']=df_group['mean']-global_mean
df_group['risk']=df_group['mean']/global_mean

df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [29]:
#Mutual information 
#Mutual information is a way to quantify the degree of dependency between two categorical variables
#however it doesn't work when one of the features is numerical
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series,df_train_full.churn)

df_mi=df_train_full[categorical].apply(calculate_mi)
df_mi=df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [30]:
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
dtype: float64

In [31]:
#Feature Engineering 
#One hot encoding
train_dict=df_train[categorical+numerical].to_dict(orient='records')

train_dict

[{'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'yes',
  'deviceprotection': 'yes',
  'techsupport': 'yes',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'two_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'bank_transfer_(automatic)',
  'tenure': 71,
  'monthlycharges': 86.1,
  'totalcharges': '6045.9'},
 {'gender': 'female',
  'seniorcitizen': 1,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'yes',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'one_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'credit_card_(automatic)',
  'tenure': 60,
  'monthlycharges': 100.5,
  'totalcharges': '6029'},
 {'gender

In [32]:
from sklearn.feature_extraction import DictVectorizer
dv=DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [33]:
X_train=dv.transform(train_dict)

In [34]:
X_train[0]

array([0., 0., 1., ..., 0., 0., 0.])

In [35]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', ..., 'totalcharges=999.45',
       'totalcharges=999.9', 'totalcharges=_'], dtype=object)

In [36]:
from sklearn.linear_model import LogisticRegression 

model=LogisticRegression(solver='liblinear',random_state=1)
model.fit(X_train,y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [37]:
#We can apply it to our validation
# data to obtain the probability of churn for each customer in the validation dataset.

In [39]:
val_dict=df_val[categorical+numerical].to_dict(orient='records')
X_val=dv.transform(val_dict)

In [40]:
y_pred=model.predict_proba(X_val)[:,1]

In [41]:
y_pred

array([0.23796155, 0.29506945, 0.29307723, ..., 0.05503021, 0.65206583,
       0.08160058])

In [42]:
churn=y_pred >=0.5

In [43]:
churn

array([False, False, False, ..., False,  True, False])

In [45]:
dict(zip(dv.get_feature_names(),model.coef_[0].round(3)))

{'contract=month-to-month': 0.589,
 'contract=one_year': 0.008,
 'contract=two_year': -0.734,
 'dependents=no': -0.019,
 'dependents=yes': -0.118,
 'deviceprotection=no': 0.078,
 'deviceprotection=no_internet_service': -0.182,
 'deviceprotection=yes': -0.034,
 'gender=female': -0.033,
 'gender=male': -0.105,
 'internetservice=dsl': -0.431,
 'internetservice=fiber_optic': 0.475,
 'internetservice=no': -0.182,
 'monthlycharges': -0.002,
 'multiplelines=no': -0.191,
 'multiplelines=no_phone_service': 0.016,
 'multiplelines=yes': 0.037,
 'onlinebackup=no': 0.104,
 'onlinebackup=no_internet_service': -0.182,
 'onlinebackup=yes': -0.06,
 'onlinesecurity=no': 0.24,
 'onlinesecurity=no_internet_service': -0.182,
 'onlinesecurity=yes': -0.196,
 'paperlessbilling=no': -0.224,
 'paperlessbilling=yes': 0.087,
 'partner=no': -0.054,
 'partner=yes': -0.083,
 'paymentmethod=bank_transfer_(automatic)': -0.046,
 'paymentmethod=credit_card_(automatic)': -0.147,
 'paymentmethod=electronic_check': 0.192,


In [46]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [47]:
X_test=dv.transform([customer])

In [48]:
X_test

array([[0., 1., 0., ..., 0., 0., 0.]])

In [49]:
model.predict_proba(X_test)

array([[0.91414457, 0.08585543]])

In [50]:
model.predict_proba(X_test)[0,1]

0.08585542763264047