## Data cleaning and feature engineering

In [50]:
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt

In [51]:
#df=pd.read_csv("../moreCleanData.csv")
df=pd.read_csv("clean_csv", index_col=0)
df.head()

Unnamed: 0,GENDER,SENIORCITIZEN,PARTNER,DEPENDENTS,TENURE,PHONESERVICE,PAPERLESSBILLING,MONTHLYCHARGES,TOTALCHARGES,MONTHLY_MINUTES_OF_USE,...,STREAMINGMOVIES_No,STREAMINGMOVIES_No internet service,STREAMINGMOVIES_Yes,CONTRACT_Month-to-month,CONTRACT_One year,CONTRACT_Two year,PAYMENTMETHOD_Bank transfer automatic,PAYMENTMETHOD_Credit card automatic,PAYMENTMETHOD_Electronic check,PAYMENTMETHOD_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,45,0,0,42.3,1840.75,0,...,1,0,0,0,1,0,1,0,0,0
2,0,0,0,0,10,0,0,29.75,301.9,0,...,1,0,0,1,0,0,0,0,0,1
3,1,1,0,0,1,0,1,39.65,39.65,0,...,0,0,1,1,0,0,0,0,1,0
4,1,0,1,1,1,0,0,30.2,30.2,0,...,1,0,0,1,0,0,0,0,1,0


In [54]:
#Prepare target & features for modeling
cols = df.columns
train_cols = cols.drop(["CHURN"])
features = df[train_cols]
target = df["CHURN"]
train_cols

Index(['GENDER', 'SENIORCITIZEN', 'PARTNER', 'DEPENDENTS', 'TENURE',
       'PHONESERVICE', 'PAPERLESSBILLING', 'MONTHLYCHARGES', 'TOTALCHARGES',
       'MONTHLY_MINUTES_OF_USE', 'TOTAL_MINUTES_OF_USE', 'MONTHLY_SMS',
       'TOTAL_SMS', 'MULTIPLELINES_No', 'MULTIPLELINES_No phone service',
       'MULTIPLELINES_Yes', 'INTERNETSERVICE_DSL',
       'INTERNETSERVICE_Fiber optic', 'INTERNETSERVICE_No',
       'ONLINESECURITY_No', 'ONLINESECURITY_No internet service',
       'ONLINESECURITY_Yes', 'ONLINEBACKUP_No',
       'ONLINEBACKUP_No internet service', 'ONLINEBACKUP_Yes',
       'DEVICEPROTECTION_No', 'DEVICEPROTECTION_No internet service',
       'DEVICEPROTECTION_Yes', 'TECHSUPPORT_No',
       'TECHSUPPORT_No internet service', 'TECHSUPPORT_Yes', 'STREAMINGTV_No',
       'STREAMINGTV_No internet service', 'STREAMINGTV_Yes',
       'STREAMINGMOVIES_No', 'STREAMINGMOVIES_No internet service',
       'STREAMINGMOVIES_Yes', 'CONTRACT_Month-to-month', 'CONTRACT_One year',
       'CONTRAC

## Logistic regression

In [55]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(features, target)
predictions = lr.predict(features)
# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

0.4724451578384163
0.0995361422497101


## Logistic regression with k fold cross-validation

In [56]:
# KFold cross-validation and balanced class-weight
from sklearn.cross_validation import cross_val_predict, KFold
lr = LogisticRegression(class_weight="balanced")
#lr.fit(features, target)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(f"tpr: {tpr}")
print(f"fpr: {fpr}")
#0.56,0.10;bal 0.79,0.27;
### BEGIN SOLUTION
print(f"First 10 Predictions:   {predictions[:20].tolist()}")
print(f"First 10 Actual labels: {target.values[:20].tolist()}")
### END SOLUTION
# Model Accuracy
#print('Test Acc: %.3f' % lr.score(features, target))

from sklearn.metrics import roc_curve, auc, roc_auc_score
auc = roc_auc_score(target, predictions)
print(f"auc: {auc}")
# Calculate classification report
target_names=['1','0']
from sklearn.metrics import classification_report
print(classification_report(target, predictions,
                            target_names=target_names))

tpr: 0.7758159443552702
fpr: 0.27870119829918827
First 10 Predictions:   [1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
auc: 0.7485573730280409
             precision    recall  f1-score   support

          1       0.90      0.72      0.80      5174
          0       0.50      0.78      0.61      1869

avg / total       0.79      0.74      0.75      7043



## Logistic regression with manually set penalty

In [57]:
# Setting a penalty to the class-weight
penalty=penalty = {
    0: 1,
    1: 3
}

lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)
### BEGIN SOLUTION
print(f"First 10 Predictions:   {predictions[:20].tolist()}")
print(f"First 10 Actual labels: {target.values[:20].tolist()}")
### END SOLUTION
#penalty (2:0.7153558052434457,0.2294163123308852),(3:0.7961476725521669,0.2962891379976807),(4:0.7758159443552702,0.30459992269037495)

0.7961476725521669
0.2962891379976807
First 10 Predictions:   [1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0]


In [60]:
# Setting a penalty to the class-weight
penalty=penalty = {
    0: 1,
    1: 3
}

lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
# False positives.
fp_filter = (predictions == 1) & (target == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (target == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (target == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (target == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)
### BEGIN SOLUTION
print(f"First 10 Predictions:   {predictions[:20].tolist()}")
print(f"First 10 Actual labels: {target.values[:20].tolist()}")
# Model Accuracy
#print('Test Acc: %.3f' % lr.score(features, target))

from sklearn.metrics import roc_curve, auc, roc_auc_score

auc = roc_auc_score(target, predictions)
print(f"auc: {auc}")
# Calculate classification report
target_names=['1','0']
from sklearn.metrics import classification_report
print(classification_report(target, predictions,
                            target_names=target_names))
### END SOLUTION
#'TOTALCHARGES'-:0.81,0.31,
# 'MONTHLY_MINUTES_OF_USE'-:0.78,0.32,
#'TOTAL_MINUTES_OF_USE'-:0.80,0.28
# 'MONTHLY_SMS',0.8,0.28
#'TOTAL_SMS', 0.8,0.27
#'TENURE',0.81,0.31
#important: contract,
#0.88,0.43 auc 0.7282159

0.8111289459604066
0.3063393892539621
First 10 Predictions:   [1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
First 10 Actual labels: [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
auc: 0.7523947783532223
             precision    recall  f1-score   support

          1       0.91      0.69      0.79      5174
          0       0.49      0.81      0.61      1869

avg / total       0.80      0.72      0.74      7043



## BC: Logistic regression

In [61]:
# Assign X (features) and y (target)
### BEGIN SOLUTION
X = features
y = target
print(X.shape, y.shape)

(7043, 38) (7043,)


In [62]:
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
### END SOLUTION

In [63]:
# Setting a penalty to the class-weight
# penalty=penalty = {
#     0: 1,
#     1: 3
# }

In [64]:
### BEGIN SOLUTION
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(class_weight="balanced")
classifier
### END SOLUTION

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [65]:
### BEGIN SOLUTION
classifier.fit(X_train, y_train)
### END SOLUTION

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [66]:
### BEGIN SOLUTION
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
### END SOLUTION

Training Data Score: 0.7298371828852708
Testing Data Score: 0.7285633162975582


In [67]:
predictions = classifier.predict(X_test)

In [68]:
# False positives.
fp_filter = (predictions == 1) & (y_test == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (y_test == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (y_test == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (y_test == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

0.7837259100642399
0.29134466769706335


In [69]:
### BEGIN SOLUTION
print(f"First 10 Predictions:   {predictions[:10].tolist()}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")
### END SOLUTION
# Model Accuracy
print('Test Acc: %.3f' % classifier.score(X_test, y_test))
from sklearn.metrics import roc_curve, auc, roc_auc_score
auc = roc_auc_score(y_test, predictions)
print(f"auc: {auc}")
# Calculate classification report
target_names=['1','0']
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

First 10 Predictions:   [1, 1, 1, 1, 0, 0, 1, 1, 1, 1]
First 10 Actual labels: [1, 0, 0, 1, 0, 0, 1, 0, 1, 1]
Test Acc: 0.729
auc: 0.7461906211835883
             precision    recall  f1-score   support

          1       0.90      0.71      0.79      1294
          0       0.49      0.78      0.60       467

avg / total       0.79      0.73      0.74      1761

