In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model.logistic import LogisticRegression

%matplotlib inline

### Load Training Data

In [2]:
train_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
pd.set_option('display.max_rows', 15)
display(train_data)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.10,1949.4,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7036,7750-EYXWZ,Female,0,No,No,12,No,No phone service,DSL,No,...,Yes,Yes,Yes,Yes,One year,No,Electronic check,60.65,743.3,No
7037,2569-WGERO,Female,0,No,No,72,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No


### Analyze Training Data 

In [4]:
analysis = pd.DataFrame(columns=['col_name','null_num','type_num'])
for col in train_data:
    row = pd.Series({'col_name':col, 
                     'null_num':train_data[col].isnull().sum(), 
                     'type_num':train_data[col].unique().size})
    analysis = analysis.append(row, ignore_index=True)
    
    
pd.set_option('display.max_rows', 21)
display(analysis)

# data balance check
cnt_train_data = train_data.groupby('Churn')["Churn"].count()
display(cnt_train_data)
print("train_data ratio:", cnt_train_data[0]/cnt_train_data[1])

Unnamed: 0,col_name,null_num,type_num
0,customerID,0,7043
1,gender,0,2
2,SeniorCitizen,0,2
3,Partner,0,2
4,Dependents,0,2
5,tenure,0,73
6,PhoneService,0,2
7,MultipleLines,0,3
8,InternetService,0,3
9,OnlineSecurity,0,3


Churn
No     5174
Yes    1869
Name: Churn, dtype: int64

train_data ratio: 2.7683253076511503


### Drop Error Data   (TotalCharges == " ")

In [5]:
err_rows = pd.to_numeric(train_data['TotalCharges'], errors='coerce').isnull()
err_idx = err_rows.index[err_rows == True]
print("Before:", err_idx, len(err_idx))

train_data = train_data.drop(err_idx)

err_rows = pd.to_numeric(train_data['TotalCharges'], errors='coerce').isnull()
err_idx = err_rows.index[err_rows == True]
print("After:", err_idx, len(err_idx))

display(train_data)

Before: Int64Index([488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754], dtype='int64') 11
After: Int64Index([], dtype='int64') 0


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.10,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


### Data Preprocess

In [6]:
train_Y = pd.DataFrame(data=train_data['Churn'].map({"Yes": 1, "No": 0}).astype(int))
train_X = train_data[['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']]

# Binary to 0, 1  ->   gender(male, female), Partner, Dependents, PhoneService, PaperlessBilling, Churn)
train_X.insert(0, 'gender', (train_data['gender'].map({"Male": 1, "Female": 0}).astype(int)))     
train_X.insert(2, 'Partner', (train_data['Partner'].map({"Yes": 1, "No": 0}).astype(int)))
train_X.insert(3, 'Dependents', (train_data['Dependents'].map({"Yes": 1, "No": 0}).astype(int)))
train_X.insert(5, 'PhoneService', (train_data['PhoneService'].map({"Yes": 1, "No": 0}).astype(int)))
train_X.insert(6, 'PaperlessBilling', (train_data['PhoneService'].map({"Yes": 1, "No": 0}).astype(int)))


# Categorical
cat_features = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'] 
print("Categorical Features:", cat_features, len(cat_features))
train_X_cat = pd.get_dummies(train_data[cat_features])


# Concat
train_X = pd.concat([train_X, train_X_cat], axis=1, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
display(train_X)
print(train_X.columns)


Categorical Features: ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'] 10


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,29.85,29.85,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34,1,1,56.95,1889.5,1,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,2,1,1,53.85,108.15,1,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,45,0,0,42.30,1840.75,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,2,1,1,70.70,151.65,1,...,1,0,0,1,0,0,0,0,1,0
5,0,0,0,0,8,1,1,99.65,820.5,0,...,0,0,1,1,0,0,0,0,1,0
6,1,0,0,1,22,1,1,89.10,1949.4,0,...,1,0,0,1,0,0,0,1,0,0
7,0,0,0,0,10,0,0,29.75,301.9,0,...,1,0,0,1,0,0,0,0,0,1
8,0,0,1,0,28,1,1,104.80,3046.05,0,...,0,0,1,1,0,0,0,0,1,0
9,1,0,0,1,62,1,1,56.15,3487.95,1,...,1,0,0,0,1,0,1,0,0,0


Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit 

### Hold-out 20% Test

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(5625, 40) (1407, 40) (5625, 1) (1407, 1)


In [8]:
cnt_train = y_train.groupby('Churn')['Churn'].count()
cnt_test = y_test.groupby('Churn')['Churn'].count()

print("train ratio:", cnt_train[0]/cnt_train[1])
print("test ratio:", cnt_test[0]/cnt_test[1])

train ratio: 2.762541806020067
test ratio: 2.7620320855614975


### Logistic Regression

In [9]:
clf = LogisticRegression()
theta = clf.fit(x_train, y_train)
scores = clf.score(x_test, y_test)
print("Logistic(θ=" + str(theta) + "): R2 Score: %f" % (scores.mean()))

Logistic(θ=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)): R2 Score: 0.786780


  y = column_or_1d(y, warn=True)
