In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

import os
if not os.path.exists("./WA_Fn-UseC_-Telco-Customer-Churn.csv"):
    !wget $data

### Data Preparation

In [3]:
df = pd.read_csv("./WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
df.columns = df.columns.str.lower()

In [7]:
categorical_columns = df.columns[df.dtypes == object]
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(" ", "_")

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [9]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

In [10]:
df.churn = (df.churn == "yes").astype(int)

In [11]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


### Setting Up The Validation Framework

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_test_split?

[31mSignature:[39m
train_test_split(
    *arrays,
    test_size=[38;5;28;01mNone[39;00m,
    train_size=[38;5;28;01mNone[39;00m,
    random_state=[38;5;28;01mNone[39;00m,
    shuffle=[38;5;28;01mTrue[39;00m,
    stratify=[38;5;28;01mNone[39;00m,
)
[31mDocstring:[39m
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [15]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train),len(df_val), len(df_test)

(4225, 1409, 1409)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

### EDA

In [18]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [19]:
df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [20]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

np.float64(0.27)

In [21]:
df.select_dtypes(include=['int64', "float64"]).columns

Index(['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges', 'churn'], dtype='object')

In [22]:
df.select_dtypes(include=['object']).columns

Index(['customerid', 'gender', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod'],
      dtype='object')

In [23]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod']

In [24]:
df[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance

#### Churn Rate Difference

In [25]:
churn_female = df_full_train[df_full_train.gender == "female"].churn.mean()
churn_female

np.float64(0.27682403433476394)

In [26]:
churn_male = df_full_train[df_full_train.gender == "male"].churn.mean()
churn_male

np.float64(0.2632135306553911)

In [27]:
global_churn_rate - churn_female

np.float64(-0.006855983216553063)

In [28]:
global_churn_rate - churn_male

np.float64(0.006754520462819769)

In [29]:
df_full_train.gender.value_counts()

gender
male      2838
female    2796
Name: count, dtype: int64

#### Risk Ratio

In [30]:
churn_no_partner = df_full_train[df_full_train.partner == "no"].churn.mean()
churn_no_partner

np.float64(0.3298090040927694)

In [31]:
churn_partner = df_full_train[df_full_train.partner == "yes"].churn.mean()
churn_partner

np.float64(0.20503330866025166)

In [32]:
churn_no_partner / global_churn_rate

np.float64(1.2216593879412643)

In [33]:
churn_partner / global_churn_rate

np.float64(0.7594724924338315)

In [34]:
df_full_train.groupby('gender').churn.mean()

gender
female    0.276824
male      0.263214
Name: churn, dtype: float64

In [35]:
df_full_train.groupby('gender').churn.agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.276824,2796
male,0.263214,2838


In [36]:
df_group = df_full_train.groupby('gender').churn.agg(['mean', 'count'])
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean'] / global_churn_rate
df_group

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498


In [37]:
from IPython.display import display

for c in categorical:
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean'] / global_churn_rate
    display(df_group)
    print()
    print()

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498






Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208






Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472






Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651






Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412






Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948






Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201






Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757






Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466






Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348






Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239






Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328






Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182






Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473






Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256






Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121






#### Mutual Information

In [38]:
from sklearn.metrics import mutual_info_score

# mutual_info_score(df_full_train.churn, df_full_train.contract) does the same
mutual_info_score(df_full_train.contract, df_full_train.churn)

0.0983203874041556

In [39]:
mutual_info_score(df_full_train.gender, df_full_train.churn)

0.0001174846211139946

In [40]:
def mutual_info_churn_score(col):
    return mutual_info_score(col, df_full_train.churn)

mutual_info = df_full_train[categorical].apply(mutual_info_churn_score)
mutual_info

gender              0.000117
seniorcitizen       0.009410
partner             0.009968
dependents          0.012346
phoneservice        0.000229
multiplelines       0.000857
internetservice     0.055868
onlinesecurity      0.063085
onlinebackup        0.046923
deviceprotection    0.043453
techsupport         0.061032
streamingtv         0.031853
streamingmovies     0.031581
contract            0.098320
paperlessbilling    0.017589
paymentmethod       0.043210
dtype: float64

In [41]:
mutual_info.sort_values(ascending=False).to_frame(name='MI')

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [42]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = ['seniorcitizen', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod']

#### Correlation

In [43]:
df_full_train[numerical]

Unnamed: 0,tenure,monthlycharges,totalcharges
1814,12,19.70,258.35
5946,42,73.90,3160.55
3881,71,65.15,4681.75
2389,71,85.45,6300.85
3676,30,70.40,2044.75
...,...,...,...
905,9,100.50,918.60
5192,60,19.95,1189.90
3980,28,105.70,2979.50
235,2,54.40,114.10


In [44]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [45]:
df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64

In [46]:
df_full_train[df_full_train.tenure <= 2].churn.mean()

np.float64(0.5953420669577875)

In [47]:
df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure < 12)].churn.mean()

np.float64(0.4059003051881994)

In [48]:
df_full_train[df_full_train.tenure > 12].churn.mean()

np.float64(0.17634908339788277)

In [49]:
df_full_train[df_full_train.monthlycharges <= 20].churn.mean()

np.float64(0.08795411089866156)

In [50]:
df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges < 50)].churn.mean()

np.float64(0.1836890243902439)

In [51]:
df_full_train[df_full_train.monthlycharges > 50].churn.mean()

np.float64(0.32499341585462205)

### One-hot Encoding

In [52]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1814,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
5946,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
3881,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
2389,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
3676,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [53]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

In [54]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
train_dicts[0]

{'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [55]:
X_train = dv.fit_transform(train_dicts)
X_train.shape

(4225, 43)

In [56]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'internetservice=dsl',
       'internetservice=fiber_optic', 'internetservice=no',
       'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',
       'streamingmovies=yes', '

In [57]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
val_dicts[0]

{'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'no',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'credit_card_(automatic)',
 'tenure': 71,
 'monthlycharges': 70.85,
 'totalcharges': 4973.4}

In [58]:
X_val = dv.transform(val_dicts)
X_val.shape

(1409, 43)

### Logistic Regression with SciKit-Learn

In [59]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1)
# The liblinear solver is often more reliable for smaller datasets and handles convergence better
# The random state is added for reproducability

In [60]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,1
,solver,'liblinear'
,max_iter,100


In [61]:
model.coef_

array([[ 6.04190219e-01, -6.09373968e-02, -6.71846756e-01,
        -3.04624020e-02, -9.81315321e-02,  5.05347535e-02,
        -1.07445534e-01, -7.16831537e-02, -3.81403483e-01,
         3.60255082e-01, -1.07445534e-01,  1.97605891e-03,
        -2.50142268e-01,  1.29013102e-01, -7.46476740e-03,
         5.42355773e-02, -1.07445534e-01, -7.53839775e-02,
         2.15080154e-01, -1.07445534e-01, -2.36228554e-01,
        -2.52749068e-01,  1.24155134e-01, -1.18903316e-01,
        -9.69061826e-03, -1.02221047e-01, -4.88666883e-02,
         6.51522382e-02, -4.26584369e-02,  1.29013102e-01,
        -2.57607036e-01,  1.99351156e-01, -1.03418168e-01,
        -1.07445534e-01,  8.22697679e-02, -6.62136512e-02,
        -1.07445534e-01,  4.50652510e-02,  1.68815613e-01,
        -1.07445534e-01, -1.89964013e-01, -6.69763194e-02,
         3.77592088e-04]])

In [62]:
model.coef_[0].round(3)

array([ 0.604, -0.061, -0.672, -0.03 , -0.098,  0.051, -0.107, -0.072,
       -0.381,  0.36 , -0.107,  0.002, -0.25 ,  0.129, -0.007,  0.054,
       -0.107, -0.075,  0.215, -0.107, -0.236, -0.253,  0.124, -0.119,
       -0.01 , -0.102, -0.049,  0.065, -0.043,  0.129, -0.258,  0.199,
       -0.103, -0.107,  0.082, -0.066, -0.107,  0.045,  0.169, -0.107,
       -0.19 , -0.067,  0.   ])

In [63]:
model.intercept_[0]

np.float64(-0.1285939340748113)

In [64]:
model.predict(X_train)
# Hard prediction

array([0, 1, 1, ..., 1, 0, 1])

In [65]:
model.predict_proba(X_train)
# Soft prediction
# first column for probability of belonging to the negative class (0)
# second column for probability of belonging to the positive class (1)

array([[0.92290494, 0.07709506],
       [0.31695702, 0.68304298],
       [0.37204542, 0.62795458],
       ...,
       [0.48549335, 0.51450665],
       [0.97301954, 0.02698046],
       [0.30185024, 0.69814976]])

In [66]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.00748672, 0.21167603, 0.22302497, ..., 0.15142522, 0.7877929 ,
       0.8074779 ])

In [67]:
churn_decision = y_pred >= 0.5
churn_decision

array([False, False, False, ..., False,  True,  True])

In [68]:
display(df_val[churn_decision])
display(df_val[churn_decision].customerid)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3,8433-wxgna,male,0,no,no,2,yes,no,fiber_optic,yes,...,no,no,no,no,month-to-month,yes,electronic_check,75.70,189.20,1
8,3440-jpscl,female,0,no,no,6,yes,no,fiber_optic,no,...,yes,yes,yes,yes,month-to-month,yes,mailed_check,99.95,547.65,1
12,7228-omtpn,male,0,no,no,4,yes,no,fiber_optic,no,...,no,no,yes,yes,month-to-month,yes,electronic_check,88.45,370.65,1
19,6711-fldfb,female,0,no,no,7,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,74.90,541.15,1
24,2612-ranwt,female,0,no,no,12,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,bank_transfer_(automatic),100.15,1164.30,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,5976-jcjrh,male,0,yes,no,10,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.30,738.20,1
1398,2034-cgrhz,male,1,no,no,24,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,credit_card_(automatic),102.95,2496.70,1
1399,5276-kqwhg,female,1,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,69.60,131.65,1
1407,6521-yytyi,male,0,no,yes,1,yes,yes,fiber_optic,no,...,no,no,yes,yes,month-to-month,yes,electronic_check,93.30,93.30,1


3       8433-wxgna
8       3440-jpscl
12      7228-omtpn
19      6711-fldfb
24      2612-ranwt
           ...    
1397    5976-jcjrh
1398    2034-cgrhz
1399    5276-kqwhg
1407    6521-yytyi
1408    3049-solay
Name: customerid, Length: 312, dtype: object

In [69]:
(y_val == churn_decision).mean()

np.float64(0.8069552874378992)

### Model Interpretation

In [70]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'internetservice=dsl',
       'internetservice=fiber_optic', 'internetservice=no',
       'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',
       'streamingmovies=yes', '

In [71]:
model.coef_[0].round(3)

array([ 0.604, -0.061, -0.672, -0.03 , -0.098,  0.051, -0.107, -0.072,
       -0.381,  0.36 , -0.107,  0.002, -0.25 ,  0.129, -0.007,  0.054,
       -0.107, -0.075,  0.215, -0.107, -0.236, -0.253,  0.124, -0.119,
       -0.01 , -0.102, -0.049,  0.065, -0.043,  0.129, -0.258,  0.199,
       -0.103, -0.107,  0.082, -0.066, -0.107,  0.045,  0.169, -0.107,
       -0.19 , -0.067,  0.   ])

In [72]:
list(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

[('contract=month-to-month', np.float64(0.604)),
 ('contract=one_year', np.float64(-0.061)),
 ('contract=two_year', np.float64(-0.672)),
 ('dependents=no', np.float64(-0.03)),
 ('dependents=yes', np.float64(-0.098)),
 ('deviceprotection=no', np.float64(0.051)),
 ('deviceprotection=no_internet_service', np.float64(-0.107)),
 ('deviceprotection=yes', np.float64(-0.072)),
 ('internetservice=dsl', np.float64(-0.381)),
 ('internetservice=fiber_optic', np.float64(0.36)),
 ('internetservice=no', np.float64(-0.107)),
 ('monthlycharges', np.float64(0.002)),
 ('multiplelines=no', np.float64(-0.25)),
 ('multiplelines=no_phone_service', np.float64(0.129)),
 ('multiplelines=yes', np.float64(-0.007)),
 ('onlinebackup=no', np.float64(0.054)),
 ('onlinebackup=no_internet_service', np.float64(-0.107)),
 ('onlinebackup=yes', np.float64(-0.075)),
 ('onlinesecurity=no', np.float64(0.215)),
 ('onlinesecurity=no_internet_service', np.float64(-0.107)),
 ('onlinesecurity=yes', np.float64(-0.236)),
 ('paperles

In [73]:
small = ['contract', 'tenure', 'monthlycharges']
df_train[small].head()

Unnamed: 0,contract,tenure,monthlycharges
0,two_year,72,115.5
1,month-to-month,10,95.25
2,month-to-month,5,75.55
3,month-to-month,5,80.85
4,two_year,18,20.1


In [74]:
type(df_train)

pandas.core.frame.DataFrame

In [75]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [76]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [77]:
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [78]:
X_train_small = dv_small.transform(dicts_train_small)

In [79]:
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_train_small, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,1
,solver,'liblinear'
,max_iter,100


In [80]:
w0 = model_small.intercept_
w = model_small.coef_[0]

w0, w.round(3)

(array([-1.83424534]), array([ 0.347, -0.641, -1.54 ,  0.027, -0.036]))

In [81]:
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'contract=month-to-month': np.float64(0.347),
 'contract=one_year': np.float64(-0.641),
 'contract=two_year': np.float64(-1.54),
 'monthlycharges': np.float64(0.027),
 'tenure': np.float64(-0.036)}

### Using the Model

In [82]:
# Convert to dict
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [83]:
# Get X with vectorizer
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [84]:
# Get y
y_full_train = df_full_train.churn.values

In [85]:
# Train Logistic Regression Model
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_full_train, y_full_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,1
,solver,'liblinear'
,max_iter,100


In [86]:
# Get test to dict to test with
dicts_test = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(dicts_test)
y_test = df_test.churn.values

In [87]:
y_pred = model.predict_proba(X_test)[:,1]

In [88]:
churn_decision = (y_pred >= 0.5)
(churn_decision == y_test).mean()

np.float64(0.8133427963094393)

In [89]:
customer = dicts_test[-1]
customer

{'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 17,
 'monthlycharges': 104.2,
 'totalcharges': 1743.5}

In [90]:
X_new = dv.transform(customer)
model.predict_proba(X_new)[0, 1]

np.float64(0.6239617243786809)

In [91]:
y_test[-1]

np.int64(1)