In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
# This is a reasonable amount of churn for analysis
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [5]:
# Are all customer ID's unique?
print(df[df.customerID.duplicated() == True])
# Yes, which means I don't need it
df = df.drop(columns='customerID')

Empty DataFrame
Columns: [customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn]
Index: []

[0 rows x 21 columns]


In [6]:
# For this one, you also have to determine which features are significant to churn

### Feature Engineering ###

In [7]:
# turn Y/N into booleans

df['gender'] = df['gender'].map(dict(Female=1, Male=0))
df['Partner'] = df['Partner'].map(dict(Yes=1, No=0))
df['Dependents'] = df['Dependents'].map(dict(Yes=1, No=0))
df['PhoneService'] = df['PhoneService'].map(dict(Yes=1, No=0))
df['PaperlessBilling'] = df['PaperlessBilling'].map(dict(Yes=1, No=0))
df['Churn'] = df['Churn'].map(dict(Yes=1, No=0))

In [8]:
# encoding w/Dummies
contract_dummies = pd.get_dummies(df['Contract'])
payment_dummies = pd.get_dummies(df['PaymentMethod'])
internet_dummies = pd.get_dummies(df['InternetService'])

internet_dummies = internet_dummies.rename(columns={'No':'No Internet'})

In [9]:
# convert remaining Y/N/NS, into booleans. No Service becomes no.

df['OnlineSecurity'] = df['OnlineSecurity'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))
df['OnlineBackup'] = df['OnlineBackup'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))
df['DeviceProtection'] = df['DeviceProtection'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))
df['TechSupport'] = df['TechSupport'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))
df['StreamingTV'] = df['StreamingTV'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))
df['StreamingMovies'] = df['StreamingMovies'].str.replace('No internet service', 'No').map(dict(Yes=1, No=0))

df['MultipleLines'] = df['MultipleLines'].str.replace('No phone service', 'No').map(dict(Yes=1, No=0))

In [10]:
df = df.drop(columns=['InternetService','Contract','PaymentMethod'])

In [11]:
df = pd.concat([df, payment_dummies, contract_dummies, internet_dummies], axis=1)

In [12]:
df['MonthlyCharges'] = df.MonthlyCharges.astype('float')

In [13]:
# TotalCharges wasn't converting properly. A quick glance in excel indicated there were spaces used as NaNs
# As TotalCharges can't be less than zero, encoding them as -1 flags them, while allowing conversion to float

df['TotalCharges'] = df['TotalCharges'].str.replace(' ', '-1').astype('float')

In [14]:
# -1's are converted into Monthly charges, as any total charge would include a monthly charge

df['TotalCharges'] = np.where(df['TotalCharges'] == -1, df['MonthlyCharges'], df['TotalCharges'])

In [15]:
# Standard Scale MonthlyCharges and TotalCharges

numeric_vals = df[['TotalCharges', 'MonthlyCharges', 'tenure']]

In [16]:
# dropping non-processed columns (and tenure, which is the only non-boolean value)
df = df.drop(columns=['TotalCharges', 'MonthlyCharges', 'tenure'])
df = df.iloc[:, 0:22].astype('bool')

In [17]:
# adding in columns, and adding tenure back in
df = pd.concat([df, numeric_vals], axis=1)

In [18]:
X = df.drop(columns='Churn')

In [19]:
y = df[['Churn']]

Analysis

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [21]:
model = LogisticRegression(penalty='l1', max_iter=1000,
                           C=0.8, solver='liblinear', random_state=42)

In [22]:
model.fit(X_train, np.ravel(y_train,order='C'))

LogisticRegression(C=0.8, max_iter=1000, penalty='l1', random_state=42,
                   solver='liblinear')

In [23]:
y_pred = model.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

       False      0.862     0.899     0.880      1036
        True      0.681     0.601     0.638       373

    accuracy                          0.820      1409
   macro avg      0.771     0.750     0.759      1409
weighted avg      0.814     0.820     0.816      1409



In [25]:
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[931 105]
 [149 224]]
