In [99]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

bucket_object = bucket.Object('project_cleaned_data.csv')
## read file content to data-frame
diabetes_cleaned = pd.read_csv(bucket_object.get().get('Body'))
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [100]:
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
## read file content to data-frame
diabetes_extended = pd.read_csv(bucket_object.get().get('Body'))
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [101]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_rf_data(file_name):
    try:
        ## file object in s3 bucket
        rf_data_file = bucket.Object(file_name)
        
        rf_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(rf_dictionary)
            
            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(rf_data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(rf_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_boosting_data(file_name):
    try:
        ## file object in s3 bucket
        boosting_data_file = bucket.Object(file_name)
        
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(boosting_dictionary)
            
            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(boosting_data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(boosting_data_file.get().get('Body'))
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_data_from_s3(file_name, X = None):
    try:
        ## file object in s3 bucket
        data_file = bucket.Object(file_name)
        
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            if X is None:
                results = expand_grid(dictionary)

                ## will not work on extended data with 8 feature columns
                results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

                ## create columns for all types of cut-off values and scores
                for i in range(len(cut_off)):
                    for j in range(len(score_to_evaluate)):
                        col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                        results[col] = 0.0
            
            else:
                ## empty dataframe with first row has 0 for total loops
                empty_list = list()
                results = pd.DataFrame(empty_list, columns = X.columns)
                results.at[0, 'total_loops'] = 0
                   
            ## write brand new and empty file to s3
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(data_file.get().get('Body'))

In [102]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']

## List to store feature importances
all_feature_importances = list()

## read ensemble feature importances data stored in s3 file
data_file_name = 'project_ensemble_feature_importances.csv'
results = read_data_from_s3(data_file_name, X)

for loop_number in range(int(results.at[0, 'total_loops']), 1000):
    ## Split data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Build models and store feature importances
    md_rf = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_train, Y_train)
    all_feature_importances.append(md_rf.feature_importances_)
    
    md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_ada.feature_importances_)
    
    md_grad = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_grad.feature_importances_)
    
    all_feature_importances_df = pd.DataFrame(all_feature_importances, columns = X.columns)
    all_feature_importances_df['total_loops'] = loop_number + 1
    
    write_data_to_s3(data_file_name, all_feature_importances_df)
    
## Calculate the average importances of variables across 100 splits and 3 models
results = read_data_from_s3(data_file_name, X.columns)
print(np.mean(results, axis = 0))

Pregnancies                    0.067908
Glucose                        0.327742
BloodPressure                  0.051193
SkinThickness                  0.054225
Insulin                        0.136200
BMI                            0.110167
DiabetesPedigreeFunction       0.105079
Age                            0.147487
total_loops                 1000.000000
dtype: float64


In [103]:
## The clean data from project_cleaned_data.csv has missing value observations deleted

## This process only cleans missing value observation in the columns kept after dropping less important columns
bucket_object = bucket.Object('diabetes.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes_not_cleaned = pd.read_csv(file_content_stream)

## dropping columns less important
diabetes_important = diabetes_not_cleaned.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])

## Preprocessing - Clean missing data values
## Glucose missing values
diabetes_important = diabetes_important.loc[diabetes_important['Glucose'] != 0]

## SkinThickness missing values
diabetes_important = diabetes_important.loc[diabetes_important['Insulin'] != 0]

## BMI missing values
diabetes_important = diabetes_important.loc[diabetes_important['BMI'] != 0]

diabetes_important = diabetes_important.reset_index(drop = True)

diabetes_important

Unnamed: 0,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,89,94,28.1,0.167,21,0
1,137,168,43.1,2.288,33,1
2,78,88,31.0,0.248,26,1
3,197,543,30.5,0.158,53,1
4,189,846,30.1,0.398,59,1
...,...,...,...,...,...,...
387,181,510,43.3,0.222,26,1
388,128,110,36.5,1.057,37,1
389,88,16,28.4,0.766,22,0
390,101,180,32.9,0.171,63,0


In [104]:
## write cleaned data to s3 bucket
write_data_to_s3('project_cleaned_data_extended_after_feature_importances.csv', diabetes_important)

In [105]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

rf_dictionary = {'extended_data' : ['Y', 'N'], 'input_layer': [5, 6, 8], 'total_loops' : [0],
                 'n_tree': [100, 500, 1000, 1500, 2000], 'depth': [3, 5, 7]}
boosting_dictionary = {'extended_data' : ['Y', 'N'], 'input_layer': [5, 6, 8], 'total_loops' : [0],
                       'n_tree': [100, 500, 1000, 1500, 2000], 'depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}

## lists of cut-off values and types of score to evaluate models
cut_off = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]
score_to_evaluate = ['precision', 'recall', 'f1']


In [106]:
## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [107]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## read Random Forest data stored in s3 file
data_file_name = 'project_rf_data.csv'
rf_results_ = read_rf_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(rf_results_.at[0, 'total_loops'], 100):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(rf_results_.shape[0]):
        parameters = rf_results_.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 5:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important, Y, test_size = 0.2, stratify = Y)
            
            elif parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            elif parameters['input_layer'] == 8:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 5:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important_extended, Y_important_extended, test_size = 0.2, stratify = Y_important_extended)
                
            elif parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
        
        ## Building model
        md_rf = RandomForestClassifier(max_depth = parameters['depth'],
                                       n_estimators = int(parameters['n_tree'])).fit(X_train, Y_train)

        ## Predicting
        pred = md_rf.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, rf_results_, combo_number)
        
    rf_results_['total_loops'] = loop_number + 1
    
    ## Writing data to s3
    write_data_to_s3(data_file_name, rf_results_)
	

In [108]:
## Get number of loops already run
loops_run = rf_results_.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,14,0.618638,5,0.662890,40,0.693587,34,0.712829,9,0.717435,...,41,0.685717,40,0.665861,11,0.648308,43,0.618995,43,0.569343
1,32,0.617936,14,0.662693,14,0.692045,40,0.709865,33,0.716882,...,8,0.684180,11,0.665586,41,0.643648,11,0.616428,5,0.568850
2,35,0.615984,38,0.662306,34,0.690986,10,0.709686,30,0.716565,...,43,0.683426,43,0.665274,43,0.642135,41,0.615657,40,0.566069
3,8,0.615832,41,0.661814,35,0.690911,4,0.708205,6,0.716149,...,11,0.682434,41,0.665210,44,0.640492,8,0.612064,11,0.562108
4,38,0.615765,35,0.660997,5,0.690291,43,0.706970,34,0.715982,...,1,0.681671,8,0.664792,1,0.639712,40,0.611674,41,0.561166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,51,0.546040,69,0.590930,54,0.637729,51,0.666353,48,0.675689,...,15,0.643942,66,0.615132,45,0.568015,60,0.488070,45,0.394826
71,66,0.546038,66,0.590632,51,0.636424,54,0.665004,47,0.674073,...,27,0.643756,60,0.614223,60,0.568008,63,0.485459,63,0.374775
72,63,0.545409,63,0.589754,63,0.635688,60,0.660776,45,0.673322,...,45,0.640844,45,0.613370,72,0.565159,69,0.482761,69,0.374487
73,60,0.544471,60,0.586559,60,0.634519,72,0.658719,51,0.673239,...,48,0.640204,69,0.613312,69,0.561900,72,0.477412,72,0.369409


In [109]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## read AdaBoost data stored in s3 file
data_file_name = 'project_ada_data.csv'
ada_results_ = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(ada_results_.at[0, 'total_loops'], 100):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(ada_results_.shape[0]):
        parameters = ada_results_.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 5:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important, Y, test_size = 0.2, stratify = Y)
            
            elif parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            elif parameters['input_layer'] == 8:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 5:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important_extended, Y_important_extended, test_size = 0.2, stratify = Y_important_extended)
                
            elif parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
        
        ## Building model
        md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = parameters['depth']),
                                    n_estimators = int(parameters['n_tree']),
                                    learning_rate = parameters['learning_rate']).fit(X_train, Y_train)

        ## Predicting
        pred = md_ada.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, ada_results_, combo_number)
        
    ada_results_['total_loops'] = loop_number + 1
    
    ## Writing data to s3
    write_data_to_s3(data_file_name, ada_results_)
	

In [110]:
loops_run = ada_results_.at[0, 'total_loops']
score_to_check = 'f1'

cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(ada_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(ada_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,14,0.661372,2,0.665973,110,0.679619,110,0.693940,119,0.699470,...,11,0.685346,146,0.674419,92,0.647734,7,0.635223,7,0.635071
1,5,0.658468,101,0.662970,2,0.678742,2,0.687925,110,0.698240,...,2,0.684057,2,0.674095,2,0.645959,116,0.625429,116,0.623993
2,130,0.653620,11,0.660788,11,0.678522,20,0.687765,101,0.696251,...,101,0.682819,101,0.669158,137,0.643571,97,0.622499,98,0.621928
3,93,0.652364,5,0.659210,101,0.677330,101,0.687644,11,0.694079,...,146,0.680118,11,0.666587,146,0.643374,98,0.622414,97,0.621436
4,120,0.652100,14,0.657574,20,0.676473,11,0.686710,20,0.693985,...,110,0.678501,137,0.664545,182,0.643192,132,0.621832,125,0.620680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,117,0.507980,63,0.523525,163,0.541097,73,0.564535,72,0.585276,...,175,0.578962,219,0.573979,72,0.534844,73,0.426460,54,0.278670
221,108,0.506267,126,0.522650,63,0.540959,63,0.561895,81,0.582535,...,147,0.578817,147,0.572490,81,0.530521,82,0.419856,82,0.277427
222,27,0.505738,36,0.521876,82,0.539529,82,0.559123,82,0.580973,...,219,0.578484,89,0.571807,163,0.529797,63,0.415303,63,0.258051
223,36,0.504330,72,0.520183,72,0.536155,72,0.558565,159,0.578129,...,89,0.572203,171,0.571269,82,0.528430,72,0.408114,81,0.242043


In [111]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## read Gradient Boosting data stored in s3 file
data_file_name = 'project_grad_data.csv'
grad_results_ = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(grad_results_.at[0, 'total_loops'], 1):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(grad_results_.shape[0]):
        parameters = grad_results_.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 5:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important, Y, test_size = 0.2, stratify = Y)
            
            elif parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            elif parameters['input_layer'] == 8:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 5:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_important_extended, Y_important_extended, test_size = 0.2, stratify = Y_important_extended)
                
            elif parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
        
        ## Building model
        md_grad = GradientBoostingClassifier(max_depth = parameters['depth'],
                                             n_estimators = int(parameters['n_tree']),
                                             learning_rate = parameters['learning_rate']).fit(X_train, Y_train)
        ## Predicting
        pred = md_grad.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, grad_results_, combo_number)
        
    grad_results_['total_loops'] = loop_number + 1
    
    ## Writing data to s3
    write_data_to_s3(data_file_name, grad_results_)
    

In [112]:
## Get number of loops already run
loops_run = grad_results_.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(grad_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(grad_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,3,0.693781,13,0.696672,13,0.702905,38,0.707136,38,0.709995,...,10,0.677350,124,0.664992,124,0.663085,124,0.657957,124,0.656016
1,121,0.691927,103,0.695155,190,0.699512,190,0.705476,119,0.709038,...,128,0.675490,19,0.663250,22,0.656257,34,0.652243,34,0.649020
2,22,0.687308,3,0.693209,109,0.699151,100,0.704611,110,0.707341,...,19,0.672746,10,0.662478,34,0.655057,105,0.651180,105,0.648367
3,193,0.686711,22,0.693090,19,0.698764,10,0.701540,29,0.705589,...,38,0.671684,22,0.660353,19,0.654228,22,0.650174,121,0.642385
4,202,0.686056,28,0.690543,100,0.698269,119,0.701218,128,0.704360,...,109,0.671340,0,0.660014,130,0.652054,121,0.648996,22,0.641921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,200,0.495238,140,0.495238,143,0.495238,137,0.495238,182,0.494949,...,50,0.000000,182,0.000000,185,0.000000,143,0.000000,62,0.000000
221,95,0.495238,152,0.495238,14,0.495087,140,0.495238,5,0.494949,...,53,0.000000,47,0.000000,137,0.000000,140,0.000000,59,0.000000
222,94,0.495238,197,0.495238,107,0.495082,143,0.495238,137,0.494804,...,143,0.000000,98,0.000000,92,0.000000,137,0.000000,56,0.000000
223,185,0.495238,182,0.495238,17,0.494829,182,0.495238,92,0.494804,...,140,0.000000,188,0.000000,98,0.000000,98,0.000000,53,0.000000


## Reviewing

In [113]:
## Random Forest
## read Random Forest data stored in s3 file
data_file_name = 'project_rf_data.csv'
rf_results_ = read_rf_data(data_file_name)

## lists of cut-off values to review scores
review_cut_off = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]

## Get number of loops already run
loops_run = results.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = review_cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,5,0.066289,40,0.069359,34,0.071283,9,0.071744,33,0.071890,33,0.069941,41,0.068572
1,14,0.066269,14,0.069204,40,0.070987,33,0.071688,9,0.071546,43,0.069861,8,0.068418
2,38,0.066231,34,0.069099,10,0.070969,30,0.071656,34,0.071207,44,0.069828,43,0.068343
3,41,0.066181,35,0.069091,4,0.070820,6,0.071615,43,0.071092,35,0.069672,11,0.068243
4,35,0.066100,5,0.069029,43,0.070697,34,0.071598,36,0.071080,8,0.069658,1,0.068167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,69,0.059093,54,0.063773,51,0.066635,48,0.067569,59,0.066980,45,0.065408,15,0.064394
71,66,0.059063,51,0.063642,54,0.066500,47,0.067407,48,0.066409,57,0.065348,27,0.064376
72,63,0.058975,63,0.063569,60,0.066078,45,0.067332,51,0.066022,51,0.065139,45,0.064084
73,60,0.058656,60,0.063452,72,0.065872,51,0.067324,45,0.065905,48,0.064725,48,0.064020


In [114]:
## AdaBoost
## read AdaBoost data stored in s3 file
data_file_name = 'project_ada_data.csv'
ada_results_ = read_boosting_data(data_file_name)

## Get number of loops already run
loops_run = results.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = review_cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,5,0.066289,40,0.069359,34,0.071283,9,0.071744,33,0.071890,33,0.069941,41,0.068572
1,14,0.066269,14,0.069204,40,0.070987,33,0.071688,9,0.071546,43,0.069861,8,0.068418
2,38,0.066231,34,0.069099,10,0.070969,30,0.071656,34,0.071207,44,0.069828,43,0.068343
3,41,0.066181,35,0.069091,4,0.070820,6,0.071615,43,0.071092,35,0.069672,11,0.068243
4,35,0.066100,5,0.069029,43,0.070697,34,0.071598,36,0.071080,8,0.069658,1,0.068167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,69,0.059093,54,0.063773,51,0.066635,48,0.067569,59,0.066980,45,0.065408,15,0.064394
71,66,0.059063,51,0.063642,54,0.066500,47,0.067407,48,0.066409,57,0.065348,27,0.064376
72,63,0.058975,63,0.063569,60,0.066078,45,0.067332,51,0.066022,51,0.065139,45,0.064084
73,60,0.058656,60,0.063452,72,0.065872,51,0.067324,45,0.065905,48,0.064725,48,0.064020


In [115]:
## Gradient Boosting
## read Gradient Boosting data stored in s3 file
data_file_name = 'project_grad_data.csv'
grad_results_ = read_boosting_data(data_file_name)

## Get number of loops already run
loops_run = results.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = review_cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(rf_results_[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,5,0.066289,40,0.069359,34,0.071283,9,0.071744,33,0.071890,33,0.069941,41,0.068572
1,14,0.066269,14,0.069204,40,0.070987,33,0.071688,9,0.071546,43,0.069861,8,0.068418
2,38,0.066231,34,0.069099,10,0.070969,30,0.071656,34,0.071207,44,0.069828,43,0.068343
3,41,0.066181,35,0.069091,4,0.070820,6,0.071615,43,0.071092,35,0.069672,11,0.068243
4,35,0.066100,5,0.069029,43,0.070697,34,0.071598,36,0.071080,8,0.069658,1,0.068167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,69,0.059093,54,0.063773,51,0.066635,48,0.067569,59,0.066980,45,0.065408,15,0.064394
71,66,0.059063,51,0.063642,54,0.066500,47,0.067407,48,0.066409,57,0.065348,27,0.064376
72,63,0.058975,63,0.063569,60,0.066078,45,0.067332,51,0.066022,51,0.065139,45,0.064084
73,60,0.058656,60,0.063452,72,0.065872,51,0.067324,45,0.065905,48,0.064725,48,0.064020


## Reviewing

In [116]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read SVC data stored in s3 file
data_file_name = 'project_rf_data.csv'
results = read_data_from_s3(data_file_name)

## Get number of loops already run
loops_run = results.at[0, 'total_loops']

review_df = pd.DataFrame()

## Creating a dataframe with all average scores on each cut-off
for cut_off_value in range(len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_f1'
    next_f1_score_index = pd.DataFrame(results[column_name].sort_values(ascending = [False])).index
    score_columns = list()
    
    for score_to_check in score_to_evaluate:
        
        column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
        
        score_columns.append(column_name)

    parameter_df = results[['extended_data', 'input_layer', 'n_tree', 'depth']].loc[next_f1_score_index].reset_index(drop = True)
    score_df = results[score_columns].loc[next_f1_score_index].reset_index(drop = True) / loops_run
    
    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

## Rename some columns to save space
review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,0.35_precision,0.35_recall,0.35_f1
0,Y,5,1500,3,0.620882,0.852692,0.717435,N,5,500,3,0.646330,0.814231,0.718897
1,N,5,500,3,0.617126,0.858846,0.716882,Y,5,1500,3,0.649484,0.801154,0.715455
2,N,5,100,3,0.620290,0.854231,0.716565,N,5,500,5,0.648800,0.793462,0.712065
3,Y,5,1000,3,0.620393,0.851923,0.716149,N,5,2000,5,0.640990,0.804231,0.710915
4,N,5,500,5,0.626628,0.839231,0.715982,N,5,1000,3,0.639442,0.806538,0.710799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,N,6,500,3,0.576162,0.822308,0.675689,N,6,2000,7,0.614098,0.743846,0.669802
71,N,6,100,7,0.576575,0.816923,0.674073,N,6,500,3,0.605812,0.741538,0.664087
72,N,6,100,3,0.577453,0.811923,0.673322,N,6,1000,3,0.594814,0.748846,0.660220
73,N,6,1000,3,0.568713,0.829615,0.673239,N,6,100,3,0.611573,0.720385,0.659051


## The best random forest was built on cleaned dataset with 5 important features using 500 trees and depth of 3, cut-off value of 0.35

In [117]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read SVC data stored in s3 file
data_file_name = 'project_ada_data.csv'
results = read_data_from_s3(data_file_name)

## Get number of loops already run
loops_run = results.at[0, 'total_loops']

review_df = pd.DataFrame()

for cut_off_value in range(len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_f1'
    next_f1_score_index = pd.DataFrame(results[column_name].sort_values(ascending = [False])).index
    score_columns = list()
    
    for score_to_check in score_to_evaluate:
        
        column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
        
        score_columns.append(column_name)

    parameter_df = results[['extended_data', 'input_layer', 'n_tree', 'depth', 'learning_rate']].loc[next_f1_score_index].reset_index(drop = True)
    score_df = results[score_columns].loc[next_f1_score_index].reset_index(drop = True) / loops_run
    
    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

## Rename some columns to save space
review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,learning_rate,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,learning_rate.1,0.35_precision,0.35_recall,0.35_f1
0,N,5,1500,3,0.001,0.609877,0.826154,0.699470,Y,5,500,3,0.001,0.653036,0.762692,0.699181
1,N,5,1000,3,0.001,0.632009,0.787692,0.698240,N,5,1000,3,0.001,0.658170,0.750769,0.698140
2,N,5,500,3,0.001,0.643252,0.766154,0.696251,Y,5,1000,3,0.001,0.650045,0.761154,0.698020
3,Y,5,500,3,0.001,0.630886,0.780385,0.694079,N,5,1500,3,0.001,0.636102,0.779615,0.697468
4,Y,5,1000,3,0.001,0.620555,0.794615,0.693985,N,5,500,3,0.001,0.659178,0.743462,0.695327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,Y,6,1500,3,0.100,0.420907,0.963056,0.585276,N,6,2000,7,0.001,0.607192,0.576154,0.586630
221,Y,6,2000,3,0.100,0.418239,0.963056,0.582535,Y,6,1000,7,0.010,0.620044,0.563056,0.586514
222,Y,6,2000,3,0.010,0.420799,0.942778,0.580973,N,6,500,5,0.100,0.589344,0.591154,0.585545
223,N,6,1000,7,0.100,0.599191,0.566538,0.578129,N,6,1000,7,0.100,0.600794,0.561154,0.576051


## The best AdaBoost model was built on cleaned dataset with 5 important features using 500 trees, depth of 3, learning rate 0.001, cut-off values of 0.3 or 0.35

In [118]:
## Gradient Boosting
## read Random Forest data stored in s3 file
data_file_name = 'project_grad_data.csv'
grad_results_ = read_boosting_data(data_file_name)

## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read SVC data stored in s3 file
data_file_name = 'project_grad_data.csv'
results = read_data_from_s3(data_file_name)

loops_run = results.at[0, 'total_loops']

review_df = pd.DataFrame()

for cut_off_value in range(len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_f1'
    next_f1_score_index = pd.DataFrame(results[column_name].sort_values(ascending = [False])).index
    score_columns = list()
    
    for score_to_check in score_to_evaluate:
        
        column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
        
        score_columns.append(column_name)

    parameter_df = results[['extended_data', 'input_layer', 'n_tree', 'depth', 'learning_rate']].loc[next_f1_score_index].reset_index(drop = True)
    score_df = results[score_columns].loc[next_f1_score_index].reset_index(drop = True) / loops_run
    
    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,learning_rate,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,learning_rate.1,0.35_precision,0.35_recall,0.35_f1
0,Y,5,2000,3,0.001,0.640273,0.802308,0.709995,N,5,1500,3,0.001,0.663076,0.765769,0.708543
1,N,5,1500,3,0.001,0.632538,0.811923,0.709038,Y,5,2000,3,0.001,0.663625,0.760385,0.705724
2,N,5,1000,3,0.001,0.625960,0.819231,0.707341,N,5,100,3,0.010,0.648741,0.779231,0.705637
3,Y,5,1500,3,0.001,0.626533,0.811923,0.705589,Y,5,500,3,0.001,0.660503,0.757692,0.703005
4,N,5,2000,3,0.001,0.638332,0.794231,0.704360,Y,5,1500,3,0.001,0.648974,0.771538,0.702939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,N,8,100,3,0.001,0.328942,0.999231,0.494949,Y,5,2000,7,0.100,0.724625,0.427692,0.531099
221,Y,5,100,5,0.001,0.328942,0.999231,0.494949,N,8,2000,7,0.100,0.706826,0.415769,0.517036
222,N,6,100,3,0.001,0.328856,0.998846,0.494804,N,6,1500,7,0.100,0.668153,0.425769,0.514930
223,N,5,100,3,0.001,0.328856,0.998846,0.494804,Y,6,2000,7,0.100,0.667260,0.418611,0.510546


## The best Gradient Boosing model was built on extended dataset with 5 important features using 2000 trees, learning rate 0.001, and depth of 3, cut-off values of 0.3 or 0.35
## The second best Gradient Boosing model was built on cleaned dataset with 5 important features using 1000 or 1500 trees, learning rate 0.001, and depth of 3, cut-off values of 0.3 or 0.35