# Churn Prediction Project

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [3]:
!wget $data

--2023-10-15 16:21:25--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’


2023-10-15 16:21:28 (554 KB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’ saved [977501/977501]



In [4]:
df = pd.read_csv(data)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [6]:
# Prepare the data

df.columns = df.columns.str.lower().str.replace(' ','_')
string_cols = list(df.dtypes[df.dtypes=='object'].index)
for s in string_cols:
    df[s] = df[s].str.lower().str.replace(' ','_')
df

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.30,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.70,151.65,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-resvb,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.80,1990.5,no
7039,2234-xaduh,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.20,7362.9,no
7040,4801-jzazl,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.60,346.45,no
7041,8361-ltmkd,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.40,306.6,yes


In [7]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [8]:
tc = pd.to_numeric(df.totalcharges,errors='coerce')

In [9]:
# According to Alexey, it is not always the best approach but it is not that bad either
df.totalcharges = tc.fillna(0)

In [10]:
# We converted a yes or no column to a binary column
df.churn = (df.churn=='yes').astype(int)

# Setting up the validation framework (with scikit-learn)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train_test_split?

[0;31mSignature:[0m
[0mtrain_test_split[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0marrays[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstratify[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with sa

In [13]:
df_full_train,df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train,df_val = train_test_split(df_full_train, test_size=0.2/0.8, random_state=1)
# Consider that the 20% of the full train dataframe won't be the same 20% as the df_test

In [14]:
len(df_full_train),len(df_train),len(df_val),len(df_test)

(5634, 4225, 1409, 1409)

In [15]:
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True)

y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

del df_train['churn']
del df_val['churn']
del df_test['churn']

# EDA

In [16]:
# We didn't delte the column of churn in df_full_train to do some exploratory data analysis

In [17]:
df_full_train.reset_index(drop=True)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0781-lkxbr,male,1,no,no,9,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,100.50,918.60,1
5630,3507-gasnp,male,0,no,yes,60,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.95,1189.90,0
5631,8868-wozgu,male,0,no,no,28,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,105.70,2979.50,1
5632,1251-krreg,male,0,no,no,2,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,54.40,114.10,1


In [18]:
#Looking at the distribution
# the positive percentage is the rate of the predict value

df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [19]:
round(df_full_train.churn.mean(),2)

0.27

In [20]:
# We identify the numerical values
numerical = ['tenure','monthlycharges','totalcharges']

In [21]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [22]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [23]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

## Feature importance

In [24]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_female,churn_male

(0.27682403433476394, 0.2632135306553911)

In [25]:
global_churn = df_full_train.churn.mean()
global_churn

0.26996805111821087

In [26]:
df_full_train.partner.value_counts()

partner
no     2932
yes    2702
Name: count, dtype: int64

In [27]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_partner,churn_no_partner

(0.20503330866025166, 0.3298090040927694)

In [28]:
# This exploration gives us some idea on which features are more important, for example we observe high variance in the churn within the group of partner/nopartner
# 1. Difference -> Global - Grboup 
# 2. Risk Ratio -> Group/Global


In [29]:
df_full_train[df_full_train.partner=='no'].churn.mean() / df_full_train.churn.mean()

1.2216593879412643

In [30]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [31]:
from IPython.display import display

In [32]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean','count'])
    df_group['diff']=df_group['mean']-global_churn
    df_group['risk']=df_group['mean']/global_churn
    display(df_group)

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498


seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208


partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472


dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651


phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412


multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948


internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201


onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757


onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466


deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348


techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239


streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328


streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182


contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473


paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256


paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121


In [33]:
from sklearn.metrics import mutual_info_score

In [34]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [35]:
mutual_info_score(df_full_train.churn, df_full_train.partner)

0.009967689095399745

In [36]:
mutual_info_score(df_full_train.churn, df_full_train.seniorcitizen)

0.009410216144208144

In [37]:
def mutual_info_churn_score(series):
    return mutual_info_score(series,df_full_train.churn)

In [38]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

In [39]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [40]:
df_full_train[(df_full_train.tenure>2) & (df_full_train.tenure<=12)].churn.mean()

0.3994413407821229

In [41]:
df_full_train[df_full_train.tenure>12].churn.mean()

0.17634908339788277

In [42]:
from sklearn.feature_extraction import DictVectorizer

In [43]:
df_train = df_train.reset_index(drop=True)
train_dict = df_train[categorical+numerical].to_dict(orient='records')

In [44]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [45]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [46]:
X_train

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.20000e+01, 8.42515e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+01, 1.02155e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        5.00000e+00, 4.13650e+02],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        2.00000e+00, 1.90050e+02],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        2.70000e+01, 7.61950e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        9.00000e+00, 7.51650e+02]])

In [47]:
for i in dv.get_feature_names_out():
    print(i)

contract=month-to-month
contract=one_year
contract=two_year
dependents=no
dependents=yes
deviceprotection=no
deviceprotection=no_internet_service
deviceprotection=yes
gender=female
gender=male
internetservice=dsl
internetservice=fiber_optic
internetservice=no
monthlycharges
multiplelines=no
multiplelines=no_phone_service
multiplelines=yes
onlinebackup=no
onlinebackup=no_internet_service
onlinebackup=yes
onlinesecurity=no
onlinesecurity=no_internet_service
onlinesecurity=yes
paperlessbilling=no
paperlessbilling=yes
partner=no
partner=yes
paymentmethod=bank_transfer_(automatic)
paymentmethod=credit_card_(automatic)
paymentmethod=electronic_check
paymentmethod=mailed_check
phoneservice=no
phoneservice=yes
seniorcitizen
streamingmovies=no
streamingmovies=no_internet_service
streamingmovies=yes
streamingtv=no
streamingtv=no_internet_service
streamingtv=yes
techsupport=no
techsupport=no_internet_service
techsupport=yes
tenure
totalcharges


In [48]:
X_train.shape

(4225, 45)

In [49]:
val_dicts = df_val[categorical+numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [50]:
test_dicts = df_test[categorical+numerical].to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [51]:
X_train.shape

(4225, 45)

## LOGISTIC REGRESSION  

In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [54]:
model.coef_[0].round(3)

array([ 0.475, -0.175, -0.408, -0.03 , -0.078,  0.063, -0.089, -0.081,
       -0.034, -0.073, -0.335,  0.317, -0.089,  0.004, -0.258,  0.141,
        0.009,  0.063, -0.089, -0.081,  0.266, -0.089, -0.284, -0.231,
        0.124, -0.166,  0.058, -0.087, -0.032,  0.07 , -0.059,  0.141,
       -0.249,  0.215, -0.12 , -0.089,  0.102, -0.071, -0.089,  0.052,
        0.213, -0.089, -0.232, -0.07 ,  0.   ])

In [55]:
model.intercept_[0]

-0.1090339889680666

In [56]:
y_pred = model.predict_proba(X_val)[:,1]

In [57]:
churn_decision = y_pred >= 0.5

In [58]:
df_val.customerid[churn_decision]

2504    8433-wxgna
4597    3440-jpscl
2343    2637-fkfsy
5591    7228-omtpn
4482    6711-fldfb
           ...    
2611    5976-jcjrh
4211    2034-cgrhz
3999    5276-kqwhg
6240    6521-yytyi
5282    3049-solay
Name: customerid, Length: 311, dtype: object

In [59]:
(y_val == churn_decision).mean()

0.8034066713981547

In [60]:
df_pred = pd.DataFrame()
df_pred['probability']= model.predict_proba(X_val)[:,1]
df_pred['decision']=churn_decision
df_pred['actual']=y_val

In [61]:
df_pred['correct']=df_pred.decision == df_pred.actual

In [62]:
df_pred.correct.mean()

0.8034066713981547

In [63]:
dict(zip(dv.get_feature_names_out(),model.coef_[0]))

{'contract=month-to-month': 0.47473468635491234,
 'contract=one_year': -0.17487261951833707,
 'contract=two_year': -0.40754104451165135,
 'dependents=no': -0.02968758353838218,
 'dependents=yes': -0.07799139413818812,
 'deviceprotection=no': 0.06268257912534578,
 'deviceprotection=no_internet_service': -0.08897063961502671,
 'deviceprotection=yes': -0.0813909171856637,
 'gender=female': -0.034310571468253254,
 'gender=male': -0.07336840620753204,
 'internetservice=dsl': -0.33521037976940204,
 'internetservice=fiber_optic': 0.31650204169410434,
 'internetservice=no': -0.08897063961502671,
 'monthlycharges': 0.003674678748882661,
 'multiplelines=no': -0.2581363222638565,
 'multiplelines=no_phone_service': 0.14144102797507388,
 'multiplelines=yes': 0.009016316610102127,
 'onlinebackup=no': 0.06253003664933408,
 'onlinebackup=no_internet_service': -0.08897063961502671,
 'onlinebackup=yes': -0.081238374709982,
 'onlinesecurity=no': 0.26558532252961986,
 'onlinesecurity=no_internet_service':

In [64]:
small = ['contract','tenure','monthlycharges']

In [65]:
dict_train_small = df_train[small].to_dict(orient='records')
dict_val_small = df_val[small].to_dict(orient='records')

In [66]:
dv_small = DictVectorizer(sparse=False)

In [67]:
X_train_small = dv_small.fit_transform(dict_train_small)
X_val_small = dv_small.transform(dict_val_small)

In [68]:
model_small = LogisticRegression()
model_small.fit(X_train_small,y_train)

In [69]:
w0 = model_small.intercept_[0]
w = model_small.coef_[0].round(3)

In [70]:
dict(zip(dv_small.get_feature_names_out(), w)), w0

({'contract=month-to-month': 0.97,
  'contract=one_year': -0.025,
  'contract=two_year': -0.949,
  'monthlycharges': 0.027,
  'tenure': -0.036},
 -2.476775662927698)

In [71]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [72]:
-2.47+0.97 + 50*0.027+5*(-0.036)

-0.3300000000000001

In [73]:
sigmoid(-2.47+0.97 + 50*0.027+5*(-0.036))

0.41824062315816374

### Using the model

In [74]:
((model_small.predict(X_val_small)>0.5).astype('int')==y_val).mean()

0.7963094393186657

In [75]:
dicts_full_train = df_full_train[categorical+numerical].to_dict(orient='records')

In [76]:
dv = DictVectorizer(sparse=False)

In [77]:
X_full_train = dv.fit_transform(dicts_full_train)

In [78]:
y_full_train = df_full_train.churn.values

In [79]:
model = LogisticRegression()
model.fit(X_full_train,y_full_train)

In [82]:
dicts_test = df_test[categorical+numerical].to_dict(orient='records')
X_test = dv.transform(dicts_test)


In [None]:
y_pred = model.predict_proba(X_test)[:,1]

In [None]:
churn_decision = (y_pred>=0.5)

In [None]:
(churn_decision==y_test).mean()

In [None]:
customer = dicts_test[-1]

In [None]:
X = dv.transform([customer])

In [None]:
model.predict_proba(X)[0][1]

In [None]:
y_test[-1]

In [56]:
import pickle

with open('model.bin', mode='wb') as f:
    pickle.dump(model,f)

In [57]:
!cat 'model.bin'

fit_intercept���intercept_scaling�K�class_weight�N�random_state�N�solver��lbfgs�max_iter�Kd�multi_class��auto��verbose�K �
warm_start���n_jobs�Nl1_ratio�N�n_features_in_�K-classes_��numpy.core.multiarray��_reconstruct����numpy��ndarray���K ��Cb���R�(KK��h�dtype����i8�����R�(K�<�NNNJ����J����K t�b�C               �t�b�n_iter_�hhK ��h ��R�(KK��h%�i4�����R�(Kh)NNNJ����J����K t�b�C\   �t�b�coef_�hhK ��h ��R�(KKK-��h%�f8�����R�(Kh)NNNJ����J����K t�b�Bh  b�
B�����V��?��*�Ͽ�&3���?8[R�`о��cs��ƶ��0�N�?�|�<&���cs��ƶ����zT��?d��qP�?�cs��ƶ�/'�[�Ϳ�:X-6��4�7�g9?�t�b�?�3m���?�cs��ƶ��?��	̴�4���Y��?�cs��ƶ��t�	�1ҿSI��Ϳ���W7��?Y�O@ſ1!]�߭?|�qY-F��%�j��i��Ss!�B�?��
intercept_�hhK ��h ��R�(KK��h?�x0�Ȧ黿�t�b�_sklearn_version��1.3.1�ub.