In [1]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## connect to S3 and open the project bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

## load the cleaned data set from the S3 object's body stream into a data-frame
bucket_object = bucket.Object('project_cleaned_data.csv')
diabetes_cleaned = pd.read_csv(bucket_object.get()['Body'])
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [2]:
## load the extended data set (columns kept after LASSO) from S3 into a data-frame
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
diabetes_extended = pd.read_csv(bucket_object.get()['Body'])
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [3]:
## function to write a data_frame as a csv file object in the S3 bucket
def write_data_to_s3(file_name, data_frame):
    """Store `data_frame` as a CSV object named `file_name` in the module-level `bucket`.

    The frame is serialised without its index; the S3 object's content is
    replaced by whatever is in `data_frame`.
    """
    ## serialise the frame (index left out) and upload it as the object's body
    csv_text = data_frame.to_csv(index = False)
    bucket.Object(file_name).put(Body = csv_text)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_rf_data(file_name):
    """Return the Random Forest results table stored as `file_name` in the S3 bucket.

    When the object does not exist yet (S3 answers with a 404), a fresh table
    is built from `rf_dictionary`: one row per parameter combination, with the
    extended-data / 8-input rows removed, and one zero-initialised column per
    (cut-off, score) pair. That table is written to S3 and then read back.

    Raises:
        botocore.exceptions.ClientError: for any S3 error other than a 404.
            These used to be swallowed, which made the function return None.
    """
    ## file object in s3 bucket
    rf_data_file = bucket.Object(file_name)

    try:
        rf_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## only "not found" means "create the file"; anything else is a real failure
            raise

        ## file does not exist yet, create new file
        results = expand_grid(rf_dictionary)

        ## will not work on extended data with 8 feature columns
        results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

        ## create columns for all types of cut-off values and scores
        for cut_off_value in cut_off:
            for score_name in score_to_evaluate:
                results[str(cut_off_value) + '_' + score_name] = 0.0

        write_data_to_s3(file_name, results)

    ## read back from S3 so new and existing files are returned the same way
    return pd.read_csv(rf_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_boosting_data(file_name):
    """Return the boosting results table stored as `file_name` in the S3 bucket.

    When the object does not exist yet (S3 answers with a 404), a fresh table
    is built from `boosting_dictionary`: one row per parameter combination,
    with the extended-data / 8-input rows removed, and one zero-initialised
    column per (cut-off, score) pair. That table is written to S3 and then
    read back.

    Raises:
        botocore.exceptions.ClientError: for any S3 error other than a 404.
            These used to be swallowed, which made the function return None.
    """
    ## file object in s3 bucket
    boosting_data_file = bucket.Object(file_name)

    try:
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## only "not found" means "create the file"; anything else is a real failure
            raise

        ## file does not exist yet, create new file
        results = expand_grid(boosting_dictionary)

        ## will not work on extended data with 8 feature columns
        results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

        ## create columns for all types of cut-off values and scores
        for cut_off_value in cut_off:
            for score_name in score_to_evaluate:
                results[str(cut_off_value) + '_' + score_name] = 0.0

        write_data_to_s3(file_name, results)

    ## read back from S3 so new and existing files are returned the same way
    return pd.read_csv(boosting_data_file.get().get('Body'))
    

## function to read a results table stored in s3 csv to dataframe, creating it when missing
def read_data_from_s3(file_name, X = None, dictionary = None):
    """Return the table stored as `file_name` in the S3 bucket, creating it if absent.

    Parameters:
        file_name: name of the CSV object in the bucket.
        X: optional data-frame. When given and the file is missing, the new
            file gets X's columns plus a single row whose 'total_loops' is 0.
        dictionary: optional parameter grid (name -> list of values). Used only
            when X is None and the file is missing: the new file then holds one
            row per combination (extended-data / 8-input rows removed) and one
            zero-initialised column per (cut-off, score) pair.

    Raises:
        ValueError: the file is missing and neither X nor dictionary was given.
            The original code referenced an undefined name here.
        botocore.exceptions.ClientError: for any S3 error other than a 404.
            These used to be swallowed, which made the function return None.
    """
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)

    try:
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## only "not found" means "create the file"; anything else is a real failure
            raise

        ## file does not exist yet, create new file
        if X is None:
            if dictionary is None:
                raise ValueError("read_data_from_s3: '" + str(file_name) + "' does not exist; "
                                 "pass X or dictionary so it can be created")

            results = expand_grid(dictionary)

            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

            ## create columns for all types of cut-off values and scores
            for cut_off_value in cut_off:
                for score_name in score_to_evaluate:
                    results[str(cut_off_value) + '_' + score_name] = 0.0

        else:
            ## empty dataframe with first row has 0 for total loops
            results = pd.DataFrame(list(), columns = X.columns)
            results.at[0, 'total_loops'] = 0

        ## write brand new and empty file to s3
        write_data_to_s3(file_name, results)

    ## read back from S3 so new and existing files are returned the same way
    return pd.read_csv(data_file.get().get('Body'))

In [4]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']

## read ensemble feature importances data stored in s3 file
data_file_name = 'project_ensemble_feature_importances.csv'
results = read_data_from_s3(data_file_name, X)

## number of splits already finished by an earlier (interrupted) run
loops_done = int(results.at[0, 'total_loops'])

## List to store feature importances.
## The file is rewritten from this list after every split, so on resume it must
## start with the rows already stored; otherwise they would be lost.
## With no finished loop the file only holds the placeholder row, which is skipped.
if loops_done > 0:
    all_feature_importances = list(results[X.columns].to_numpy())
else:
    all_feature_importances = list()

for loop_number in range(loops_done, 1000):
    ## Split data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Build models and store feature importances
    md_rf = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_train, Y_train)
    all_feature_importances.append(md_rf.feature_importances_)
    
    ## NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`; confirm against the installed version
    md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_ada.feature_importances_)
    
    md_grad = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_grad.feature_importances_)
    
    ## checkpoint: every stored row carries the number of splits finished so far
    all_feature_importances_df = pd.DataFrame(all_feature_importances, columns = X.columns)
    all_feature_importances_df['total_loops'] = loop_number + 1
    
    write_data_to_s3(data_file_name, all_feature_importances_df)
    
## Calculate the average importances of variables across 1000 splits and 3 models
results = read_data_from_s3(data_file_name, X)
print(np.mean(results, axis = 0))

Pregnancies                    0.067908
Glucose                        0.327742
BloodPressure                  0.051193
SkinThickness                  0.054225
Insulin                        0.136200
BMI                            0.110167
DiabetesPedigreeFunction       0.105079
Age                            0.147487
total_loops                 1000.000000
dtype: float64


In [5]:
## project_cleaned_data.csv was produced by deleting observations with missing values.
## Here the raw file is cleaned again, but only on the columns that remain after
## the less important columns are dropped.
bucket_object = bucket.Object('diabetes.csv')
file_object = bucket_object.get()
file_content_stream = file_object['Body']

## read file content to data-frame
diabetes_not_cleaned = pd.read_csv(file_content_stream)

## dropping columns less important
diabetes_important = diabetes_not_cleaned.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])

## Preprocessing - a 0 in Glucose, Insulin or BMI marks a missing value;
## keep only the observations where all three are present, then renumber the rows
diabetes_important = diabetes_important.loc[
    (diabetes_important[['Glucose', 'Insulin', 'BMI']] != 0).all(axis = 1)
].reset_index(drop = True)

diabetes_important

Unnamed: 0,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,89,94,28.1,0.167,21,0
1,137,168,43.1,2.288,33,1
2,78,88,31.0,0.248,26,1
3,197,543,30.5,0.158,53,1
4,189,846,30.1,0.398,59,1
...,...,...,...,...,...,...
387,181,510,43.3,0.222,26,1
388,128,110,36.5,1.057,37,1
389,88,16,28.4,0.766,22,0
390,101,180,32.9,0.171,63,0


In [6]:
## store the re-cleaned, reduced-feature data in the s3 bucket
write_data_to_s3(file_name = 'project_cleaned_data_extended_after_feature_importances.csv',
                 data_frame = diabetes_important)

In [7]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    """Return a data-frame with one row per combination of the values in `dictionary`.

    `dictionary` maps a column name to the list of values that column may
    take; the rows are the Cartesian product of those lists, in key order.
    """
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns = dictionary.keys())

## parameter grid for the Random Forest models
rf_dictionary = {
    'extended_data': ['Y', 'N'],
    'input_layer': [5, 6, 8],
    'total_loops': [0],
    'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
}

## parameter grid for the boosting models (same as above plus the learning rate)
boosting_dictionary = {
    'extended_data': ['Y', 'N'],
    'input_layer': [5, 6, 8],
    'total_loops': [0],
    'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
}

## lists of cut-off values and types of score to evaluate models
cut_off = [
    0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
    0.4, 0.45, 0.5, 0.55, 0.6, 0.65,
]
score_to_evaluate = ['precision', 'recall', 'f1']


In [8]:
## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    """Add this model's precision/recall/f1 at every cut-off to row `combo_number` of `results`.

    Parameters:
        pred: predicted probabilities of the positive class for the test rows.
        Y_test: true labels of the test rows.
        results: results data-frame; it is modified in place. It must already
            have a '<cut-off>_<score>' column for every pair drawn from the
            module-level `cut_off` and `score_to_evaluate` lists.
        combo_number: index label of the row to update.

    The scores are accumulated (added to the stored value), not averaged.
    """
    ## score name -> sklearn metric; names not listed here are ignored
    scorers = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}

    for current_cut_off in cut_off:

        ## classify labels: probabilities below the cut-off become 0, the rest 1
        pred_labels = np.where(pred < current_cut_off, 0, 1)

        for current_score in score_to_evaluate:
            scorer = scorers.get(current_score)
            if scorer is None:
                continue

            ## update the appropriate score
            ## zero_division = 0 is passed to every metric so an undefined score
            ## counts as 0 without a warning (previously only precision did this)
            score_column = str(current_cut_off) + '_' + current_score
            results.at[combo_number, score_column] += scorer(Y_test, pred_labels, zero_division = 0)

In [None]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (extended_data, input_layer) -> (inputs, target) used for that combination.
## An explicit lookup fails loudly on an unexpected combination instead of
## silently reusing the split of the previous combination.
datasets = {('N', 5): (X_important, Y),                            ## cleaned data, reduced features
            ('N', 6): (X_lasso, Y),                                ## cleaned data, reduced features
            ('N', 8): (X, Y),                                      ## cleaned data, all features
            ('Y', 5): (X_important_extended, Y_important_extended),  ## extended data, reduced features
            ('Y', 6): (X_extended, Y_extended)}                    ## extended data, reduced features

## read Random Forest data stored in s3 file
data_file_name = 'project_rf_data.csv'
results = read_rf_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(int(results.at[0, 'total_loops']), 100):
    
    ## Build a Random Forest model for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        ## pick the data set for this combination and draw a fresh stratified split
        inputs, target = datasets[(parameters['extended_data'], int(parameters['input_layer']))]
        X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)
        
        ## Building model (depth is cast like n_tree so the model always gets a plain int)
        md_rf = RandomForestClassifier(max_depth = int(parameters['depth']),
                                       n_estimators = int(parameters['n_tree'])).fit(X_train, Y_train)

        ## Predicting
        pred = md_rf.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    
    ## Writing data to s3 after every full pass so an interrupted run can resume
    write_data_to_s3(data_file_name, results)
    

In [None]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (extended_data, input_layer) -> (inputs, target) used for that combination.
## An explicit lookup fails loudly on an unexpected combination instead of
## silently reusing the split of the previous combination.
datasets = {('N', 5): (X_important, Y),                            ## cleaned data, reduced features
            ('N', 6): (X_lasso, Y),                                ## cleaned data, reduced features
            ('N', 8): (X, Y),                                      ## cleaned data, all features
            ('Y', 5): (X_important_extended, Y_important_extended),  ## extended data, reduced features
            ('Y', 6): (X_extended, Y_extended)}                    ## extended data, reduced features

## read AdaBoost data stored in s3 file
data_file_name = 'project_ada_data.csv'
results = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(int(results.at[0, 'total_loops']), 100):
    
    ## Build an AdaBoost model for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        ## pick the data set for this combination and draw a fresh stratified split
        inputs, target = datasets[(parameters['extended_data'], int(parameters['input_layer']))]
        X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)
        
        ## Building model (depth is cast like n_tree so the tree always gets a plain int)
        ## NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`; confirm against the installed version
        md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = int(parameters['depth'])),
                                    n_estimators = int(parameters['n_tree']),
                                    learning_rate = parameters['learning_rate']).fit(X_train, Y_train)

        ## Predicting
        pred = md_ada.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    
    ## Writing data to s3 after every full pass so an interrupted run can resume
    write_data_to_s3(data_file_name, results)
    

In [None]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (extended_data, input_layer) -> (inputs, target) used for that combination.
## An explicit lookup fails loudly on an unexpected combination instead of
## silently reusing the split of the previous combination.
datasets = {('N', 5): (X_important, Y),                            ## cleaned data, reduced features
            ('N', 6): (X_lasso, Y),                                ## cleaned data, reduced features
            ('N', 8): (X, Y),                                      ## cleaned data, all features
            ('Y', 5): (X_important_extended, Y_important_extended),  ## extended data, reduced features
            ('Y', 6): (X_extended, Y_extended)}                    ## extended data, reduced features

## read Gradient Boosting data stored in s3 file
data_file_name = 'project_grad_data.csv'
results = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(int(results.at[0, 'total_loops']), 100):
    
    ## Build a Gradient Boosting model for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        ## pick the data set for this combination and draw a fresh stratified split
        inputs, target = datasets[(parameters['extended_data'], int(parameters['input_layer']))]
        X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)
        
        ## Building model (depth is cast like n_tree so the model always gets a plain int)
        md_grad = GradientBoostingClassifier(max_depth = int(parameters['depth']),
                                             n_estimators = int(parameters['n_tree']),
                                             learning_rate = parameters['learning_rate']).fit(X_train, Y_train)
        ## Predicting
        pred = md_grad.predict_proba(X_test)[:, 1]
        
        update_result_scores(pred, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    
    ## Writing data to s3 after every full pass so an interrupted run can resume
    write_data_to_s3(data_file_name, results)
    