In [27]:
import pandas as pd

In [28]:
# Load the Telco customer churn CSV (path is relative to the working directory).
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [29]:
# Preview the first five rows, transposed so each column shows up as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [30]:
# (rows, columns) of the loaded frame.
data.shape

(7043, 21)

In [31]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [32]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [33]:
# Count missing values per column after the numeric conversion.
data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [34]:
# Replace the missing TotalCharges values with 0.
data['TotalCharges'] = data['TotalCharges'].fillna(0)

In [35]:
# Total number of missing values across the whole frame.
data.isnull().sum().sum()

0

In [36]:
# Normalise column names: lower-case, with spaces turned into underscores.
data.columns = [name.lower().replace(" ", "_") for name in data.columns]

In [37]:
# Names of all object-dtype (string) columns, in frame order.
string_columns = data.select_dtypes(include='object').columns.tolist()

In [38]:
# Show the columns that were detected as strings.
string_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'churn']

In [41]:
# Normalise the values of every string column:
# lower-case, with spaces replaced by underscores (literal match, not a regex).
for column_name in string_columns:
    lowered = data[column_name].str.lower()
    data[column_name] = lowered.str.replace(" ", "_", regex=False)

In [42]:
# Preview the frame after normalising column names and string values.
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [43]:
# Look at the raw target values before they are encoded as integers.
data.churn.head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [44]:
# Encode the target as an integer flag: 1 for 'yes', 0 otherwise.
data['churn'] = data['churn'].eq('yes').astype(int)

In [45]:
# Confirm the target column now holds integers.
data.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int32

In [46]:
# Preview the frame with the encoded target column.
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


In [47]:
from sklearn.model_selection import train_test_split

In [49]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train_data_main, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=17,
)

In [50]:
# Take 25% of the remaining 80% for validation,
# which yields a 60/20/20 train/validation/test split overall.
train_data, val_data = train_test_split(
    train_data_main,
    test_size=0.25,
    random_state=17,
)

In [51]:
# Target vector for the training split, as a NumPy array.
y_train = train_data['churn'].to_numpy()

In [52]:
# Target vector for the validation split, as a NumPy array.
y_val = val_data['churn'].to_numpy()

In [53]:
# Remove the target from the feature frames so it cannot leak into the model.
# drop() returns a new frame, and errors='ignore' keeps this cell safe to re-run:
# a plain `del frame['churn']` raises KeyError once the column is already gone.
train_data = train_data.drop(columns='churn', errors='ignore')
val_data = val_data.drop(columns='churn', errors='ignore')

In [62]:
# Categorical features: every object-dtype column except the customer identifier,
# plus the 0/1 'seniorcitizen' flag, which is stored as an integer.
categorical_cols = train_data.select_dtypes(include='object').columns.tolist()
categorical_cols.remove('customerid')
categorical_cols.insert(1, 'seniorcitizen')
categorical_cols

['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [64]:
# Numerical features: the int64/float64 columns, minus 'seniorcitizen',
# which is already listed with the categorical features.
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('seniorcitizen')
numerical_cols

['tenure', 'monthlycharges', 'totalcharges']

In [66]:
# One dict per training row, restricted to the selected feature columns.
train_dict = (
    train_data[categorical_cols + numerical_cols]
    .to_dict(orient='records')
)

In [69]:
# Show the first record to check the dict layout.
train_dict[:1]

[{'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'yes',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'tenure': 1,
  'monthlycharges': 77.15,
  'totalcharges': 77.15}]

In [70]:
from sklearn.feature_extraction import DictVectorizer

In [71]:
# Vectorizer that turns feature dicts into a dense NumPy matrix.
dv = DictVectorizer(sparse=False)

In [72]:
# Learn the feature mapping from the training records only.
dv.fit(train_dict)

In [73]:
# Encode the training records into a feature matrix and show the first row.
X_train = dv.transform(train_dict)
X_train[0]

array([ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        0.  ,  0.  ,  1.  ,  0.  , 77.15,  0.  ,  0.  ,  1.  ,  1.  ,
        0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  1.  ,  0.  ,
        0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,
        0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  , 77.15])

In [74]:
# Names of the encoded features produced by the fitted vectorizer.
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
# Logistic regression classifier; the fixed seed keeps the liblinear solver reproducible.
model = LogisticRegression(
    solver='liblinear',
    random_state=17,
)

In [77]:
# Fit the classifier on the encoded training data.
model.fit(X_train, y_train)

In [78]:
# One dict per validation row, using the same feature columns as for training.
val_dict = (
    val_data[categorical_cols + numerical_cols]
    .to_dict(orient='records')
)

In [79]:
# Encode the validation records with the vectorizer fitted on the training set.
X_val = dv.transform(val_dict)

In [80]:
# Predicted class probabilities for each validation row (one column per class).
y_pred = model.predict_proba(X_val)

In [81]:
# Show the probabilities for the first five validation rows.
y_pred[:5]

array([[0.67242032, 0.32757968],
       [0.82340283, 0.17659717],
       [0.99140539, 0.00859461],
       [0.97584785, 0.02415215],
       [0.74663569, 0.25336431]])

In [82]:
# Mean accuracy on the validation set.
model.score(X_val, y_val)

0.8048261178140526

In [83]:
# Mean accuracy on the training set, for comparison with the validation score.
model.score(X_train, y_train)

0.8042603550295858

In [85]:
model.intercept_[0]  # intercept (bias) term of the fitted model

-0.13402507437942301

In [86]:
# Pair each encoded feature name with its fitted weight, rounded to 3 decimals.
dict(
    zip(
        dv.get_feature_names_out(),
        model.coef_[0].round(3),
    )
)

{'contract=month-to-month': 0.626,
 'contract=one_year': -0.302,
 'contract=two_year': -0.458,
 'dependents=no': 0.061,
 'dependents=yes': -0.195,
 'deviceprotection=no': 0.027,
 'deviceprotection=no_internet_service': -0.119,
 'deviceprotection=yes': -0.042,
 'gender=female': -0.047,
 'gender=male': -0.087,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.308,
 'internetservice=no': -0.119,
 'monthlycharges': -0.001,
 'multiplelines=no': -0.317,
 'multiplelines=no_phone_service': 0.1,
 'multiplelines=yes': 0.083,
 'onlinebackup=no': 0.122,
 'onlinebackup=no_internet_service': -0.119,
 'onlinebackup=yes': -0.138,
 'onlinesecurity=no': 0.186,
 'onlinesecurity=no_internet_service': -0.119,
 'onlinesecurity=yes': -0.202,
 'paperlessbilling=no': -0.229,
 'paperlessbilling=yes': 0.095,
 'partner=no': -0.043,
 'partner=yes': -0.091,
 'paymentmethod=bank_transfer_(automatic)': -0.054,
 'paymentmethod=credit_card_(automatic)': -0.27,
 'paymentmethod=electronic_check': 0.244,
 

In [87]:
# Example customer to score, using the same normalised keys and values as the
# training records. 'customerid' is kept for reference only: it was excluded from
# the training features, and DictVectorizer ignores feature names it did not see
# during fit.
customer = {
    # identity and demographics
    'customerid': '8879-zkjof',
    'gender': 'male',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    # subscribed services
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    # contract and billing
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    # numeric account features
    'tenure': 41,
    'monthlycharges': 79.85,
    'totalcharges': 2990.15,
}

In [88]:
# Encode the single customer with the already-fitted vectorizer.
x_new = dv.transform([customer])

In [89]:
# Predicted class probabilities for this customer.
model.predict_proba(x_new)

array([[0.90112966, 0.09887034]])

The predicted probability that this customer will churn is low (about 10%), so there is no need to offer them a promotion.