In [243]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

### DATA PREPARATION

In [127]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [128]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [129]:
# Upper-case every column name and replace spaces with underscores.
df.columns = df.columns.str.upper().str.replace(' ', '_')
# Collect the text (object-dtype) columns and normalize their values the same way.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for col in categorical_columns:
    normalized = df[col].str.upper()
    df[col] = normalized.str.replace(' ', '_')

In [130]:
df.columns

Index(['CUSTOMERID', 'GENDER', 'SENIORCITIZEN', 'PARTNER', 'DEPENDENTS',
       'TENURE', 'PHONESERVICE', 'MULTIPLELINES', 'INTERNETSERVICE',
       'ONLINESECURITY', 'ONLINEBACKUP', 'DEVICEPROTECTION', 'TECHSUPPORT',
       'STREAMINGTV', 'STREAMINGMOVIES', 'CONTRACT', 'PAPERLESSBILLING',
       'PAYMENTMETHOD', 'MONTHLYCHARGES', 'TOTALCHARGES', 'CHURN'],
      dtype='object')

In [131]:
df.dtypes
# TOTALCHARGES shows up as object (text) but should be numeric: some entries are
# placeholders for unavailable values rather than numbers, so we convert it with
# pd.to_numeric(errors='coerce'), which turns unparseable entries into NaN.

CUSTOMERID           object
GENDER               object
SENIORCITIZEN         int64
PARTNER              object
DEPENDENTS           object
TENURE                int64
PHONESERVICE         object
MULTIPLELINES        object
INTERNETSERVICE      object
ONLINESECURITY       object
ONLINEBACKUP         object
DEVICEPROTECTION     object
TECHSUPPORT          object
STREAMINGTV          object
STREAMINGMOVIES      object
CONTRACT             object
PAPERLESSBILLING     object
PAYMENTMETHOD        object
MONTHLYCHARGES      float64
TOTALCHARGES         object
CHURN                object
dtype: object

In [132]:
pd.to_numeric(df.TOTALCHARGES, errors='coerce')

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TOTALCHARGES, Length: 7043, dtype: float64

In [133]:
# Parse TOTALCHARGES as numbers; entries that cannot be parsed become NaN.
df['TOTALCHARGES'] = pd.to_numeric(df['TOTALCHARGES'], errors='coerce')

In [134]:
# Replace the missing (NaN) total charges with 0.
df['TOTALCHARGES'] = df['TOTALCHARGES'].fillna(0)

In [135]:
# Encode the target as 1 (churned) / 0 (did not churn).
# The dtype guard keeps this cell safe to re-run: once CHURN is already numeric,
# comparing it with 'YES' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df.CHURN):
    df.CHURN = (df.CHURN == 'YES').astype(int)

### SETTING UP THE VALIDATION FRAMEWORK

In [136]:
# Hold out 20% of the rows as the final test set; random_state fixes the shuffle so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [137]:
len(df_full_train), len(df_test)

(5634, 1409)

In [138]:
# 25% of the remaining 80% equals 20% of the full dataset, giving a 60/20/20 train/val/test split.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [139]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [140]:
# Make the index of each split sequential (0..n-1); optional tidy-up, not required.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [141]:
# Pull the CHURN target out of each split as a plain NumPy array.
y_train = df_train['CHURN'].to_numpy()
y_val = df_val['CHURN'].to_numpy()
y_test = df_test['CHURN'].to_numpy()

In [142]:
# Remove the target column from the three feature frames.
df_train = df_train.drop(columns=['CHURN'])
df_val = df_val.drop(columns=['CHURN'])
df_test = df_test.drop(columns=['CHURN'])

### EDA

In [143]:
df_full_train = df_full_train.reset_index(drop=True)

In [144]:
df_full_train.isnull().sum()

CUSTOMERID          0
GENDER              0
SENIORCITIZEN       0
PARTNER             0
DEPENDENTS          0
TENURE              0
PHONESERVICE        0
MULTIPLELINES       0
INTERNETSERVICE     0
ONLINESECURITY      0
ONLINEBACKUP        0
DEVICEPROTECTION    0
TECHSUPPORT         0
STREAMINGTV         0
STREAMINGMOVIES     0
CONTRACT            0
PAPERLESSBILLING    0
PAYMENTMETHOD       0
MONTHLYCHARGES      0
TOTALCHARGES        0
CHURN               0
dtype: int64

In [145]:
# normalize=True to display as %, so it has a CHURN RATE OF 27%
df_full_train.CHURN.value_counts(normalize=True)

CHURN
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [146]:
global_churn_rate = df_full_train.CHURN.mean()
round(global_churn_rate,2)

0.27

In [147]:
numerical = ['TENURE','MONTHLYCHARGES','TOTALCHARGES']

In [148]:
categorical = ['GENDER', 'SENIORCITIZEN', 'PARTNER', 'DEPENDENTS', 
              'PHONESERVICE', 'MULTIPLELINES', 'INTERNETSERVICE',
              'ONLINESECURITY', 'ONLINEBACKUP', 'DEVICEPROTECTION', 'TECHSUPPORT',
              'STREAMINGTV', 'STREAMINGMOVIES', 'CONTRACT', 'PAPERLESSBILLING',
              'PAYMENTMETHOD']

In [149]:
df_full_train[categorical].nunique()

GENDER              2
SENIORCITIZEN       2
PARTNER             2
DEPENDENTS          2
PHONESERVICE        2
MULTIPLELINES       3
INTERNETSERVICE     3
ONLINESECURITY      3
ONLINEBACKUP        3
DEVICEPROTECTION    3
TECHSUPPORT         3
STREAMINGTV         3
STREAMINGMOVIES     3
CONTRACT            3
PAPERLESSBILLING    2
PAYMENTMETHOD       4
dtype: int64

### 3.5 FEATURE IMPORTANCE: CHURN RATE AND RISK RATIO
- Churn rate
- Risk ratio
- Mutual info

**Churn rate**

In [150]:
# Overall churn rate on the full training data (same expression as global_churn_rate above).
# Used below as the baseline for the per-group differences and risk ratios.
global_churn = df_full_train.CHURN.mean()
global_churn

0.26996805111821087

GENDER

In [151]:
churn_female = df_full_train[df_full_train.GENDER == 'FEMALE'].CHURN.mean()
churn_female

0.27682403433476394

In [152]:
churn_male = df_full_train[df_full_train.GENDER == 'MALE'].CHURN.mean()
churn_male

0.2632135306553911

PARTNER

In [153]:
df_full_train.PARTNER.value_counts()

PARTNER
NO     2932
YES    2702
Name: count, dtype: int64

In [154]:
churn_no_partner = df_full_train[df_full_train.PARTNER == 'NO'].CHURN.mean()
churn_no_partner

0.3298090040927694

In [155]:
churn_partner = df_full_train[df_full_train.PARTNER == 'YES'].CHURN.mean()
churn_partner

0.20503330866025166

In [156]:
global_churn - churn_no_partner

-0.05984095297455855

In [157]:
global_churn  - churn_male

0.006754520462819769

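The difference here is computed as global − group. A negative difference means the group churns more than average (e.g. customers without a partner); a positive difference means the group churns less than average. A difference close to zero (e.g. gender) means the feature tells us little about churn. Note that the per-feature tables further below use group − global, so the sign is flipped there.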

**Risk Ratio**

In [158]:
# Risk ratio
churn_no_partner / global_churn

1.2216593879412643

In [159]:
churn_partner / global_churn

0.7594724924338315

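Customers with no partner are more likely to churn than average: their risk ratio is greater than 1. A ratio below 1 (customers with a partner) means the group is less likely to churn than average.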

In [160]:
df_group = df_full_train.groupby('GENDER').CHURN.agg(['mean', 'count'])
df_group

Unnamed: 0_level_0,mean,count
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1
FEMALE,0.276824,2796
MALE,0.263214,2838


In [169]:
# For every categorical feature, compare each category's churn rate with the global rate.
# `display` is imported in the import cell at the top of the notebook.
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).CHURN.agg(['mean', 'count'])
    # diff > 0 (equivalently risk > 1): the category churns more than the global average.
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)

GENDER


Unnamed: 0_level_0,mean,count,diff,risk
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FEMALE,0.276824,2796,0.006856,1.025396
MALE,0.263214,2838,-0.006755,0.97498


SENIORCITIZEN


Unnamed: 0_level_0,mean,count,diff,risk
SENIORCITIZEN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208


PARTNER


Unnamed: 0_level_0,mean,count,diff,risk
PARTNER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.329809,2932,0.059841,1.221659
YES,0.205033,2702,-0.064935,0.759472


DEPENDENTS


Unnamed: 0_level_0,mean,count,diff,risk
DEPENDENTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.31376,3968,0.043792,1.162212
YES,0.165666,1666,-0.104302,0.613651


PHONESERVICE


Unnamed: 0_level_0,mean,count,diff,risk
PHONESERVICE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.241316,547,-0.028652,0.89387
YES,0.273049,5087,0.003081,1.011412


MULTIPLELINES


Unnamed: 0_level_0,mean,count,diff,risk
MULTIPLELINES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.257407,2700,-0.012561,0.953474
NO_PHONE_SERVICE,0.241316,547,-0.028652,0.89387
YES,0.290742,2387,0.020773,1.076948


INTERNETSERVICE


Unnamed: 0_level_0,mean,count,diff,risk
INTERNETSERVICE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DSL,0.192347,1934,-0.077621,0.712482
FIBER_OPTIC,0.425171,2479,0.155203,1.574895
NO,0.077805,1221,-0.192163,0.288201


ONLINESECURITY


Unnamed: 0_level_0,mean,count,diff,risk
ONLINESECURITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.420921,2801,0.150953,1.559152
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.153226,1612,-0.116742,0.56757


ONLINEBACKUP


Unnamed: 0_level_0,mean,count,diff,risk
ONLINEBACKUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.404323,2498,0.134355,1.497672
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.217232,1915,-0.052736,0.80466


DEVICEPROTECTION


Unnamed: 0_level_0,mean,count,diff,risk
DEVICEPROTECTION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.395875,2473,0.125907,1.466379
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.230412,1940,-0.039556,0.85348


TECHSUPPORT


Unnamed: 0_level_0,mean,count,diff,risk
TECHSUPPORT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.418914,2781,0.148946,1.551717
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.159926,1632,-0.110042,0.59239


STREAMINGTV


Unnamed: 0_level_0,mean,count,diff,risk
STREAMINGTV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.342832,2246,0.072864,1.269897
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.302723,2167,0.032755,1.121328


STREAMINGMOVIES


Unnamed: 0_level_0,mean,count,diff,risk
STREAMINGMOVIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.338906,2213,0.068938,1.255358
NO_INTERNET_SERVICE,0.077805,1221,-0.192163,0.288201
YES,0.307273,2200,0.037305,1.138182


CONTRACT


Unnamed: 0_level_0,mean,count,diff,risk
CONTRACT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MONTH-TO-MONTH,0.431701,3104,0.161733,1.599082
ONE_YEAR,0.120573,1186,-0.149395,0.446621
TWO_YEAR,0.028274,1344,-0.241694,0.10473


PAPERLESSBILLING


Unnamed: 0_level_0,mean,count,diff,risk
PAPERLESSBILLING,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NO,0.172071,2313,-0.097897,0.637375
YES,0.338151,3321,0.068183,1.25256


PAYMENTMETHOD


Unnamed: 0_level_0,mean,count,diff,risk
PAYMENTMETHOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BANK_TRANSFER_(AUTOMATIC),0.168171,1219,-0.101797,0.622928
CREDIT_CARD_(AUTOMATIC),0.164339,1217,-0.10563,0.608733
ELECTRONIC_CHECK,0.45589,1893,0.185922,1.688682
MAILED_CHECK,0.19387,1305,-0.076098,0.718121


### Feature importance: Mutual information
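The higher the mutual information between a feature and churn, the more that feature tells us about churn, i.e. the more important it is.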

In [171]:
mutual_info_score(df_full_train.CHURN, df_full_train.CONTRACT)

0.0983203874041556

In [172]:
mutual_info_score(df_full_train.CHURN, df_full_train.GENDER)

0.0001174846211139946

In [173]:
mutual_info_score(df_full_train.CHURN, df_full_train.PARTNER)

0.009967689095399745

In [174]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the CHURN column of df_full_train.

    Meant to be passed to DataFrame.apply so that every column is scored
    against the same target.
    """
    churn_labels = df_full_train.CHURN
    return mutual_info_score(series, churn_labels)

In [178]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending = False)
# Most important variable is CONTRACT and least important is GENDER

CONTRACT            0.098320
ONLINESECURITY      0.063085
TECHSUPPORT         0.061032
INTERNETSERVICE     0.055868
ONLINEBACKUP        0.046923
DEVICEPROTECTION    0.043453
PAYMENTMETHOD       0.043210
STREAMINGTV         0.031853
STREAMINGMOVIES     0.031581
PAPERLESSBILLING    0.017589
DEPENDENTS          0.012346
PARTNER             0.009968
SENIORCITIZEN       0.009410
MULTIPLELINES       0.000857
PHONESERVICE        0.000229
GENDER              0.000117
dtype: float64

### Feature importance: Correlation
For numerical columns we use the correlation coefficient, which measures the degree of (linear) dependency between two variables.

In [180]:
df_full_train[numerical].corrwith(df_full_train.CHURN)

TENURE           -0.351885
MONTHLYCHARGES    0.196805
TOTALCHARGES     -0.196353
dtype: float64

TENURE

In [181]:
df_full_train[df_full_train.TENURE <= 2].CHURN.mean()

0.5953420669577875

In [183]:
df_full_train[(df_full_train.TENURE > 2) & (df_full_train.TENURE <= 12)].CHURN.mean()

0.3994413407821229

In [184]:
df_full_train[(df_full_train.TENURE > 12)].CHURN.mean()

0.17634908339788277

MONTHLYCHARGES    

In [185]:
df_full_train[df_full_train.MONTHLYCHARGES <= 20].CHURN.mean()

0.08795411089866156

In [186]:
df_full_train[(df_full_train.MONTHLYCHARGES > 20) & (df_full_train.MONTHLYCHARGES <= 50)].CHURN.mean()

0.18340943683409436

In [187]:
df_full_train[(df_full_train.MONTHLYCHARGES > 50)].CHURN.mean()

0.32499341585462205

### One-hot encoding
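Done with scikit-learn.
DictVectorizer is "smart" enough not to one-hot encode numerical values: it only expands string-valued features into one column per category and passes numbers through unchanged.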

In [220]:
# One dict per row ({column name: value}), which is the input format DictVectorizer works with.
train_dicts = df_train[categorical+numerical].to_dict(orient='records')

In [221]:
dv = DictVectorizer(sparse=False)

In [222]:
X_train = dv.fit_transform(train_dicts)

In [223]:
X_train.shape

(4225, 45)

In [224]:
val_dicts = df_val[categorical+numerical].to_dict(orient='records')

In [225]:
# transform only (no fit): the validation data reuses the feature mapping learned from the training set.
X_val = dv.transform(val_dicts)

### Logistic regression

In [228]:
model = LogisticRegression()

In [229]:
model.fit(X_train, y_train)

In [233]:
model.intercept_[0]

-0.10903193927647864

In [232]:
model.coef_[0].round(3)

array([ 0.475, -0.175, -0.408, -0.03 , -0.078,  0.063, -0.089, -0.081,
       -0.034, -0.073, -0.335,  0.316, -0.089,  0.004, -0.258,  0.141,
        0.009,  0.063, -0.089, -0.081,  0.266, -0.089, -0.284, -0.231,
        0.124, -0.166,  0.058, -0.087, -0.032,  0.07 , -0.059,  0.141,
       -0.249,  0.215, -0.12 , -0.089,  0.102, -0.071, -0.089,  0.052,
        0.213, -0.089, -0.232, -0.07 ,  0.   ])

In [234]:
# Hard predict
model.predict(X_train)

array([0, 1, 1, ..., 1, 0, 1])

In [236]:
# Soft predict: predict_proba returns one column per class; [:,1] keeps the
# probability of the positive class (CHURN == 1).
model.predict_proba(X_train)[:,1]

array([0.09548398, 0.6793306 , 0.63367965, ..., 0.53163501, 0.04254407,
       0.69872981])

In [237]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.00899722, 0.20453172, 0.21223009, ..., 0.13639407, 0.79977048,
       0.83740707])

In [238]:
# Turn the predicted probabilities into a boolean churn / no-churn decision using a 0.5 cutoff.
churn_decision = (y_pred>= 0.5)

In [239]:
churn_decision

array([False, False, False, ..., False,  True,  True])

In [242]:
df_val[churn_decision].CUSTOMERID

3       8433-WXGNA
8       3440-JPSCL
11      2637-FKFSY
12      7228-OMTPN
19      6711-FLDFB
           ...    
1397    5976-JCJRH
1398    2034-CGRHZ
1399    5276-KQWHG
1407    6521-YYTYI
1408    3049-SOLAY
Name: CUSTOMERID, Length: 311, dtype: object

In [244]:
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [245]:
churn_decision.astype(int)

array([0, 0, 0, ..., 0, 1, 1])

In [246]:
accuracy_score(y_val, churn_decision.astype(int))

0.8034066713981547

### Model interpretation
Look at the coefficients.
Train a smaller model with fewer features, so the individual weights are easier to read.

In [249]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'CONTRACT=MONTH-TO-MONTH': 0.475,
 'CONTRACT=ONE_YEAR': -0.175,
 'CONTRACT=TWO_YEAR': -0.408,
 'DEPENDENTS=NO': -0.03,
 'DEPENDENTS=YES': -0.078,
 'DEVICEPROTECTION=NO': 0.063,
 'DEVICEPROTECTION=NO_INTERNET_SERVICE': -0.089,
 'DEVICEPROTECTION=YES': -0.081,
 'GENDER=FEMALE': -0.034,
 'GENDER=MALE': -0.073,
 'INTERNETSERVICE=DSL': -0.335,
 'INTERNETSERVICE=FIBER_OPTIC': 0.316,
 'INTERNETSERVICE=NO': -0.089,
 'MONTHLYCHARGES': 0.004,
 'MULTIPLELINES=NO': -0.258,
 'MULTIPLELINES=NO_PHONE_SERVICE': 0.141,
 'MULTIPLELINES=YES': 0.009,
 'ONLINEBACKUP=NO': 0.063,
 'ONLINEBACKUP=NO_INTERNET_SERVICE': -0.089,
 'ONLINEBACKUP=YES': -0.081,
 'ONLINESECURITY=NO': 0.266,
 'ONLINESECURITY=NO_INTERNET_SERVICE': -0.089,
 'ONLINESECURITY=YES': -0.284,
 'PAPERLESSBILLING=NO': -0.231,
 'PAPERLESSBILLING=YES': 0.124,
 'PARTNER=NO': -0.166,
 'PARTNER=YES': 0.058,
 'PAYMENTMETHOD=BANK_TRANSFER_(AUTOMATIC)': -0.087,
 'PAYMENTMETHOD=CREDIT_CARD_(AUTOMATIC)': -0.032,
 'PAYMENTMETHOD=ELECTRONIC_CHECK': 0.07,
 

In [250]:
small = ['CONTRACT', 'TENURE', 'MONTHLYCHARGES']

In [251]:
df_train[small]

Unnamed: 0,CONTRACT,TENURE,MONTHLYCHARGES
0,TWO_YEAR,72,115.50
1,MONTH-TO-MONTH,10,95.25
2,MONTH-TO-MONTH,5,75.55
3,MONTH-TO-MONTH,5,80.85
4,TWO_YEAR,18,20.10
...,...,...,...
4220,ONE_YEAR,52,80.85
4221,MONTH-TO-MONTH,18,25.15
4222,MONTH-TO-MONTH,2,90.00
4223,TWO_YEAR,27,24.50


In [253]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [255]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)
dv_small.get_feature_names_out()

array(['CONTRACT=MONTH-TO-MONTH', 'CONTRACT=ONE_YEAR',
       'CONTRACT=TWO_YEAR', 'MONTHLYCHARGES', 'TENURE'], dtype=object)

In [256]:
X_train_small = dv_small.transform(dicts_train_small)

In [258]:
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [259]:
model_small.intercept_[0]

-2.4767756619324204

In [260]:
model_small.coef_[0]

array([ 0.97004492, -0.02513651, -0.94872202,  0.02748236, -0.03618424])

In [261]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

{'CONTRACT=MONTH-TO-MONTH': 0.97,
 'CONTRACT=ONE_YEAR': -0.025,
 'CONTRACT=TWO_YEAR': -0.949,
 'MONTHLYCHARGES': 0.027,
 'TENURE': -0.036}

### Using the model

In [262]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [263]:
# NOTE: this rebinds `dv`, replacing the vectorizer that was fitted on df_train only.
# Earlier cells that use `dv` will pick up this new object if they are re-run after this point.
dv = DictVectorizer(sparse=False)

In [264]:
X_full_train = dv.fit_transform(dicts_full_train)

In [267]:
y_full_train = df_full_train.CHURN.values

In [270]:
model = LogisticRegression()
model.fit(X_full_train,y_full_train)

In [271]:
dicts_test = df_test[categorical + numerical].to_dict(orient='records')

In [272]:
X_test = dv.transform(dicts_test)

In [274]:
y_pred =model.predict_proba(X_test)[:,1]

In [278]:
churn_decision = (y_pred>= 0.5)

In [279]:
accuracy_score(y_test, churn_decision.astype(int))

0.815471965933286

In [281]:
# Using the model
customer = dicts_test[10]
customer

In [284]:
X_small = dv.transform([customer])

In [285]:
model.predict_proba(X_small)
# Columns are [P(no churn), P(churn)]: roughly a 40% chance that this customer churns

array([[0.5943189, 0.4056811]])

In [288]:
# Compare with the true label: 0 means this customer did not churn, which agrees
# with the prediction (churn probability below the 0.5 cutoff).
y_test[10]

0