In [1]:
import boto3
import pandas as pd
import botocore
from itertools import product
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, accuracy_score

## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    """Build a dataframe holding every combination of the values in ``dictionary``.

    Each key becomes a column (in the dictionary's key order) and each row is one
    element of the Cartesian product of the value lists, in ``itertools.product``
    order (the last key varies fastest).
    """
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns = dictionary.keys())

## Grid for Random Forest: tuning values plus single-valued accumulator columns
## (the one-element lists add a column to the expanded grid without adding rows)
rf_dictionary = {
    'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
    'total_accuracy': [0.0],
    'total_recall': [0.0],
    'total_loops': [0],
}

## Grid for the boosting models: same layout with an extra learning_rate dimension
boosting_dictionary = {
    'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'total_accuracy': [0.0],
    'total_recall': [0.0],
    'total_loops': [0],
}

## Connect to the S3 bucket that stores the notebook's data files
s3 = boto3.resource('s3')
bucket_name = 'danhtran358-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Request the body stream of each csv object (train file first, then test file)
telecom_train_stream, telecom_test_stream = [
    bucket.Object(file_key).get().get('Body')
    for file_key in ('churn-bigml-80(1).csv', 'churn-bigml-20(1).csv')
]

## Parse both streams into dataframes
telecom_train, telecom_test = [
    pd.read_csv(body_stream)
    for body_stream in (telecom_train_stream, telecom_test_stream)
]

## 15.b - 15.c
## Charge columns that are summed into total_charge
CHARGE_COLUMNS = ['Total_day_charge', 'Total_eve_charge', 'Total_night_charge', 'Total_intl_charge']

## Columns kept after filtering (inputs followed by the target)
MODEL_COLUMNS = ['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 'Churn']


def _prepare_telecom(raw):
    """Apply the 15.b / 15.c preparation steps to one telecom dataframe.

    The same steps are applied to the train and the test frame, so they live in a
    single helper instead of being written out twice.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing 'Churn', 'International_plan', 'Voice_mail_plan', the four
        columns in CHARGE_COLUMNS, 'Account_length' and 'Customer_service_calls'.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to MODEL_COLUMNS, where
        * Churn is 0 where the original value equals False, otherwise 1;
        * International_plan / Voice_mail_plan are 0 where the original value is 'No', otherwise 1;
        * total_charge is the row-wise sum of CHARGE_COLUMNS.
    """
    prepared = raw.assign(
        ## `== False` is an element-wise comparison on the column, not a truthiness test
        Churn = np.where(raw['Churn'] == False, 0, 1),
        International_plan = np.where(raw['International_plan'] == 'No', 0, 1),
        Voice_mail_plan = np.where(raw['Voice_mail_plan'] == 'No', 0, 1),
        total_charge = raw[CHARGE_COLUMNS].sum(axis = 1),
    )
    return prepared[MODEL_COLUMNS]


telecom_train = _prepare_telecom(telecom_train)
telecom_test = _prepare_telecom(telecom_test)

## 15.e-f
## Use the remaining inputs (exclude Voice_mail_plan) and target variables
X = telecom_train[['Account_length', 'International_plan', 'total_charge', 'Customer_service_calls']]
Y = telecom_train['Churn']

In [2]:
## this is a workaround because I lost the kernel while running the loops

## function to write data_frame to a csv file object in the S3 bucket
def write_data_to_s3(data_file_object, data_frame):
    """Upload ``data_frame`` as csv text to ``data_file_object``.

    The frame is serialized with ``to_csv(index=False)`` (no index column) and the
    resulting string is passed to ``data_file_object.put`` as ``Body``.
    Returns None.
    """
    data_file_object.put(Body = data_frame.to_csv(index=False))
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_rf_data(rf_data_file):
    """Return the Random Forest results table stored in ``rf_data_file``.

    If the object does not exist yet (``load()`` fails with error code "404"), a
    fresh table is built from ``rf_dictionary`` with ``expand_grid``, written to the
    object, and then read back so both paths return data parsed from the csv.

    Parameters
    ----------
    rf_data_file : object exposing ``load()``, ``get()`` and ``put()``
        In this notebook an S3 object obtained from ``bucket.Object(...)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    botocore.exceptions.ClientError
        For any ``load()`` failure other than "404". Previously such errors were
        swallowed and the function silently returned None.
    """
    try:
        rf_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## not a "missing file" error: do not hide it from the caller
            raise

        ## file does not exist yet, create new file
        write_data_to_s3(rf_data_file, expand_grid(rf_dictionary))

    ## return the dataframe stored in the (possibly newly created) file
    return pd.read_csv(rf_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_boosting_data(boosting_data_file):
    """Return the AdaBoost / Gradient Boosting results table stored in ``boosting_data_file``.

    If the object does not exist yet (``load()`` fails with error code "404"), a
    fresh table is built from ``boosting_dictionary`` with ``expand_grid``, written
    to the object, and then read back so both paths return data parsed from the csv.

    Parameters
    ----------
    boosting_data_file : object exposing ``load()``, ``get()`` and ``put()``
        In this notebook an S3 object obtained from ``bucket.Object(...)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    botocore.exceptions.ClientError
        For any ``load()`` failure other than "404". Previously such errors were
        swallowed and the function silently returned None.
    """
    try:
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## not a "missing file" error: do not hide it from the caller
            raise

        ## file does not exist yet, create new file
        write_data_to_s3(boosting_data_file, expand_grid(boosting_dictionary))

    ## return the dataframe stored in the (possibly newly created) file
    return pd.read_csv(boosting_data_file.get().get('Body'))

In [5]:
## Resumable tuning loop for the Random Forest grid.
## The accumulated scores are written back to S3 after every repetition, so a lost
## kernel only costs the repetition that was in flight.
## NOTE: neither the split nor the forests receive a random_state, so the totals
## are not reproducible from one run to the next.
N_REPETITIONS = 100      ## total number of train/validation splits to evaluate
CHURN_THRESHOLD = 0.1    ## predict churn (1) when the predicted probability is >= this cutoff

## read Random Forest data stored in s3 file
rf_data_file = bucket.Object('rf_data.csv')
rf_results = read_rf_data(rf_data_file)

## total_loops keeps the number of repetitions already done; every row is updated
## together below, so the first row is read (position-based, works for any grid size)
loops_done = int(rf_results['total_loops'].iloc[0])

for i in range(loops_done, N_REPETITIONS):
    ## Split data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

    ## lists to store scores (one entry per parameter combination, in row order)
    accuracy_scores = list()
    recall_scores = list()

    ## Build random forest models for each parameter combination and store scores
    for j in range(rf_results.shape[0]):

        ## Building model
        ## Read each parameter from its own column and cast to int: taking the whole
        ## row with .loc[j] upcasts it to float, and a float max_depth is rejected by
        ## scikit-learn versions that validate parameter types
        md_rf = RandomForestClassifier(max_depth = int(rf_results.at[j, 'depth']),
                                       n_estimators = int(rf_results.at[j, 'n_tree'])).fit(X_train, Y_train)

        ## Predicting: probability of class 1 (churn), cut at CHURN_THRESHOLD
        pred_rf = md_rf.predict_proba(X_test)[:, 1]
        pred_rf = np.where(pred_rf < CHURN_THRESHOLD, 0, 1)

        ## Computing and storing scores to lists
        accuracy_scores.append(accuracy_score(Y_test, pred_rf))
        recall_scores.append(recall_score(Y_test, pred_rf))

    ## Accumulate only after every combination has been scored, so a kernel loss
    ## halfway through a repetition never leaves some rows updated and others not
    rf_results['total_loops'] = i + 1
    rf_results['total_accuracy'] = rf_results['total_accuracy'] + np.asarray(accuracy_scores)
    rf_results['total_recall'] = rf_results['total_recall'] + np.asarray(recall_scores)

    ## Writing data to s3
    write_data_to_s3(rf_data_file, rf_results)

rf_results

Unnamed: 0,n_tree,depth,total_accuracy,total_recall,total_loops
0,100,3,87.308989,86.75641,100
1,100,5,89.2603,86.74359,100
2,100,7,89.844569,86.641026,100
3,500,3,87.305243,86.74359,100
4,500,5,89.453184,86.730769,100
5,500,7,89.868914,86.615385,100
6,1000,3,87.308989,86.74359,100
7,1000,5,89.486891,86.74359,100
8,1000,7,89.848315,86.551282,100
9,1500,3,87.305243,86.730769,100
