In [1]:
pip install imblearn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [19]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## where the churn data lives: bucket name and the key of the csv file inside it
bucket_name = 'daltondencklau-data445-bucket'
file_key = 'telecom_churn.csv'

## resolve the bucket, then the object that holds the csv file
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## request the object and take the 'Body' entry of the response to read from
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## parse the csv into a data frame and preview its first rows
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [20]:
## defining the input and target variables
x = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
y = churn_data['Churn']

## splitting the data: 80% train / 20% test, stratified on the target so that
## both parts keep the same proportion of churners; random_state is fixed so
## that re-running the notebook reproduces exactly the same split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)

In [21]:
## running over-sampling on the training data only: RandomOverSampler balances
## the classes by re-drawing existing minority-class rows at random with
## replacement (it duplicates rows, it does not generate synthetic ones);
## random_state is fixed so the resampled training set is the same on every run
x_over, y_over = RandomOverSampler(random_state = 42).fit_resample(x_train, y_train)

## building random forest and adaboost models

In [22]:
## fitting models
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_over, y_over)
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), 
                            n_estimators =500, learning_rate = 0.01).fit(x_over, y_over)

## predicting on test data set
rf_preds = rf_md.predict_proba(x_test)[:, 1]
ada_preds = ada_md.predict_proba(x_test)[:, 1]

## ROC
fpr, tpr, threshold = roc_curve(y_test, rf_preds)
fpr, tpr, threshold = roc_curve(y_test, ada_preds)


In [23]:
## random forest optimal cutoff
rf_cutoff = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'cutoff': threshold})
ada_cutoff = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'cutoff': threshold})

## computing the distance to perfect model
rf_cutoff['Euclidean distance'] = np.sqrt(rf_cutoff['fpr']**2 + (rf_cutoff['tpr'] - 1)**2)
ada_cutoff['Euclidean distance'] = np.sqrt(ada_cutoff['fpr']**2 + (ada_cutoff['tpr'] - 1)**2)

## sorting values
rf_cutoff = rf_cutoff.sort_values(by = 'Euclidean distance').reset_index(drop = True)
ada_cutoff = ada_cutoff.sort_values(by = 'Euclidean distance').reset_index(drop = True)

## changing likelihoods to labels based on cutoff
rf_preds_labels = np.where(rf_preds < rf_cutoff['cutoff'][0], 0, 1)
ada_preds_labels = np.where(ada_preds < ada_cutoff['cutoff'][0], 0, 1)

## classification report
print(classification_report(y_test, rf_preds_labels))
print(classification_report(y_test, ada_preds_labels))

              precision    recall  f1-score   support

           0       0.97      0.80      0.88       570
           1       0.42      0.86      0.56        97

    accuracy                           0.81       667
   macro avg       0.70      0.83      0.72       667
weighted avg       0.89      0.81      0.83       667

              precision    recall  f1-score   support

           0       0.97      0.86      0.91       570
           1       0.50      0.84      0.63        97

    accuracy                           0.86       667
   macro avg       0.74      0.85      0.77       667
weighted avg       0.90      0.86      0.87       667



In [None]:
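## Based on these results, I would use the AdaBoost model because of its evaluation
## metrics: in the reports above it has higher accuracy, precision, and F1 score
## than the random forest. Its recall for the churn class (0.84) is slightly lower
## than the random forest's (0.86), but its macro-average recall is higher
## (0.85 vs. 0.83).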