## Reading the csv data file and creating a data-frame called churn

In [1]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "telecom_churn.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
churn = pd.read_csv(file_content_stream)
churn.head(1)

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0


## Looking at the relative frequency table of the Churn variable.

In [8]:
# Relative Frequency table
round(churn['Churn'].value_counts() / churn.shape[0], 3)

0    0.855
1    0.145
Name: Churn, dtype: float64

As we can see, this is an unbalanced dataset.

#### Let's use the following variable to predict Churn: AccountWeeks, ContractRenewal, CustServCalls, MonthlyCharge, and DayMins as the predictor variables, and Churn is the target variable. 

#### Let's then split the data into two data-frames (taking into account the proportion of 0s and 1s): train (80%) and test (20%).

In [3]:
# Defining the input and target variables
X = churn[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn['Churn']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y)

#### Using the train data-frame, let's build a random forest classification model with 500 trees and the maximum depth of each tree equal to 3.

#### Then, estimate the cutoff value that makes the random forest classification model the closest to the perfect model based on the ROC curve. Using the optimal cutoff value.

In [4]:
# Building Random Forest Classifier model
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

# Predicting on test dataset
RF_pred = RF_md.predict_proba(X_test)[:,1]

# Computing the ROC curve
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

# Creating a data-frame
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})
cutoffs.head()

Unnamed: 0,False_Positive,True_Positive,Cutoff
0,0.0,0.0,1.650456
1,0.0,0.010309,0.650456
2,0.0,0.041237,0.626021
3,0.003509,0.041237,0.611743
4,0.003509,0.103093,0.588655


#### Checking the classification report.

In [5]:
# Finding the optimal cut-off
cutoffs['True_Positive_minus_1'] = cutoffs['True_Positive'] - 1
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_minus_1']**2)

## Sorting based on the Euclidean distance
cutoffs = cutoffs.sort_values(by = 'Euclidean_dist').reset_index(drop = True)

## Changing likelihoods to labels
RF_pred = np.where(RF_pred < cutoffs['Cutoff'][0], 0, 1)

# Printing classification report
print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92       570
           1       0.53      0.87      0.66        97

    accuracy                           0.87       667
   macro avg       0.75      0.87      0.79       667
weighted avg       0.91      0.87      0.88       667



#### Repeating the same process but now using an ada-boost classification model with 500 trees, the maximum depth of each tree equal to 3, and learning rate equal to 0.01

In [6]:
#### AdaBoost Classifier
Ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    
# Predicting on test
ADA_pred = Ada_md.predict_proba(X_test)[:,1]

# Computing the ROC curve
fpr, tpr, threshold = roc_curve(Y_test, ADA_pred)

# Creating a data-frame
cutoffs_ADA = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})

# Finding the optimal cut-off
cutoffs_ADA['True_Positive_minus_1'] = cutoffs_ADA['True_Positive'] - 1
cutoffs_ADA['Euclidean_dist'] = np.sqrt(cutoffs_ADA['False_Positive']**2 + cutoffs_ADA['True_Positive_minus_1']**2)

## Sorting based on the Euclidean distance
cutoffs_ADA = cutoffs_ADA.sort_values(by = 'Euclidean_dist').reset_index(drop = True)


## Changing likelihoods to labels
ADA_pred = np.where(ADA_pred < cutoffs_ADA['Cutoff'][0], 0, 1)

# Printing classification report
print(classification_report(Y_test, ADA_pred))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93       570
           1       0.58      0.87      0.69        97

    accuracy                           0.89       667
   macro avg       0.78      0.88      0.81       667
weighted avg       0.92      0.89      0.90       667



### Given the results from both models, I would choose the AdaBoost Classifier model because it had a higher precision socore in the class 1.