## Dataset Load

In [243]:
import pandas as pd

dataset = pd.read_csv('./data/dataset.csv')
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Remove Useless Attributes

In [244]:
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Remove ID to avoid overfitting

In [245]:
dataset = dataset.drop(columns='customerID')

## Remove Useless Instances
Delete instance which has at least one missing value

In [246]:
dataset = dataset.dropna(how='any')

## Transform Categorical Values

Binarize 'gender'

In [247]:
dataset['gender'] = dataset['gender'].replace(['Male', 'Female'], [0, 1])
dataset.head()

  dataset['gender'] = dataset['gender'].replace(['Male', 'Female'], [0, 1])


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,0,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,0,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,0,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,1,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Binarize labels

In [248]:
binary_features = ['Churn']
dataset[binary_features] = dataset[binary_features].replace(['Yes', 'No'], [0, 1])
dataset.head()

  dataset[binary_features] = dataset[binary_features].replace(['Yes', 'No'], [0, 1])


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1
1,0,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,1
2,0,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0
3,0,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,1
4,1,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0


Convert categorical features to dummies

In [249]:
categorical_features = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'StreamingTV', 'StreamingMovies', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'DeviceProtection', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
dummies = pd.get_dummies(dataset[categorical_features], dtype=int)
dataset = pd.concat([dataset, dummies], axis=1).drop(columns=categorical_features)
dataset

Unnamed: 0,gender,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,...,DeviceProtection_No internet service,DeviceProtection_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,PaperlessBilling_No,PaperlessBilling_Yes
0,1,0,1,29.85,29.85,1,0,1,0,1,...,0,0,0,1,1,0,1,0,0,1
1,0,0,34,56.95,1889.5,1,1,0,0,1,...,0,1,1,0,1,0,0,1,1,0
2,0,0,2,53.85,108.15,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,1
3,0,0,45,42.30,1840.75,1,0,1,0,1,...,0,1,1,0,1,0,1,0,1,0
4,1,0,2,70.70,151.65,0,1,0,0,0,...,0,0,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,24,84.80,1990.5,1,0,0,1,1,...,0,1,0,1,0,1,0,1,0,1
7039,1,0,72,103.20,7362.9,1,0,0,1,0,...,0,1,0,1,0,1,0,1,0,1
7040,1,0,11,29.60,346.45,1,0,1,0,1,...,0,0,0,1,0,1,1,0,0,1
7041,0,1,4,74.40,306.6,0,0,0,1,0,...,0,0,0,1,1,0,0,1,0,1


Remove non numerical data from 'TotalCharges'

In [250]:
dataset[dataset['TotalCharges'] == ' '] = 0

## Train-Test Split

In [251]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns='Churn'), dataset['Churn'], test_size=0.25)

## Fit and Predict

In [252]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=20, criterion='gini')
dc = DummyClassifier()

lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)

rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

dc.fit(x_train, y_train)
dc_pred = dc.predict(x_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Compute Scores

In [253]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

print(f"""Logistic Regression: 
{accuracy_score(y_test, lr_pred)} 
{f1_score(y_test, lr_pred)} 
{confusion_matrix(y_test, lr_pred)}""")
print(f"""Random Forest: 
{accuracy_score(y_test, rf_pred)} 
{f1_score(y_test, rf_pred)} 
{confusion_matrix(y_test, rf_pred)}""")
print(f"""Dummy Classifier: 
{accuracy_score(y_test, dc_pred)} 
{f1_score(y_test, dc_pred)} 
{confusion_matrix(y_test, dc_pred)}""")

Logistic Regression: 
0.8018171493469619 
0.8679530836171018 
[[ 265  204]
 [ 145 1147]]
Random Forest: 
0.7853492333901193 
0.8577878103837472 
[[ 243  226]
 [ 152 1140]]
Dummy Classifier: 
0.7336740488358887 
0.8463806092368162 
[[   0  469]
 [   0 1292]]
