In [4]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.6.2-cp36-cp36m-manylinux2010_x86_64.whl (458.3 MB)
[K     |████████████████████████████████| 458.3 MB 12 kB/s s eta 0:00:01     |█████████████                   | 186.9 MB 92.3 MB/s eta 0:00:03██████████████▎           | 291.0 MB 82.0 MB/s eta 0:00:03     |█████████████████████████▉      | 369.4 MB 82.0 MB/s eta 0:00:02
[?25hCollecting clang~=5.0
  Downloading clang-5.0.tar.gz (30 kB)
Collecting six~=1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting tensorboard<2.7,>=2.6.0
  Downloading tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 59.9 MB/s eta 0:00:01
Collecting typing-extensions~=3.7.4
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Collecting gast==0.4.0
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras-preprocessing~=1.1.2
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████

In [49]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

bucket_object = bucket.Object('project_cleaned_data.csv')
## read file content to data-frame
diabetes_cleaned = pd.read_csv(bucket_object.get().get('Body'))
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [50]:
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
## read file content to data-frame
diabetes_extended = pd.read_csv(bucket_object.get().get('Body'))
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [51]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

dictionary = {'extended_data' : ['Y', 'N'], 'input_layer': [6, 8], 'mid_layer_1': [2, 3, 4], 'mid_layer_2': [2, 3, 4], 'total_loops' : [0],
                 'batch_size' : [20, 40, 60], 'mlp' : ['mlp1_tanh', 'mlp1_relu', 'mlp2_tanh', 'mlp2_relu', 'mlp2_tanh_relu', 'mlp2_relu_tanh']}

## lists of cut-off values and types of score to evaluate models
cut_off = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
score_to_evaluate = ['precision', 'recall', 'f1']

In [52]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_data_from_s3(file_name):
    try:
        ## file object in s3 bucket
        data_file = bucket.Object(file_name)
        
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(dictionary)
            
            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            ## write brand new and empty file to s3
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(data_file.get().get('Body'))

In [53]:
def mlp1_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, batch_size):
    ## Multilayer perceptron 1 mid layer tanh
    ## Define mlp structure
    mlp_md1 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md1.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md1.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability 
    predict_md1 = mlp_md1.predict(X_test)[:,1]
    
    return predict_md1
    

def mlp1_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, batch_size):
    ## Multilayer perceptron 1 mid layer relu
    ## Define mlp structure
    mlp_md2 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md2.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability
    predict_md2 = mlp_md2.predict(X_test)[:,1]

    return predict_md2


def mlp2_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2, batch_size):
    ## Multilayer perceptron 2 mid layer, both tanh
    ## Define mlp structure
    mlp_md1 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'),
          tf.keras.layers.Dense(mid_layer_2, activation = 'tanh'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md1.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md1.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability 
    predict_md1 = mlp_md1.predict(X_test)[:,1]
    
    return predict_md1
    

def mlp2_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2, batch_size):
    ## Multilayer perceptron 2 layers, both relu
    ## Define mlp structure
    mlp_md2 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'),
          tf.keras.layers.Dense(mid_layer_2, activation = 'relu'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md2.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability
    predict_md2 = mlp_md2.predict(X_test)[:,1]

    return predict_md2


def mlp2_tanh_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2, batch_size):
    ## Multilayer perceptron 2 layers, tanh and relu
    ## Define mlp structure
    mlp_md1 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'),
          tf.keras.layers.Dense(mid_layer_2, activation = 'relu'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md1.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md1.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability 
    predict_md1 = mlp_md1.predict(X_test)[:,1]
    
    return predict_md1
    

def mlp2_relu_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2, batch_size):
    ## Multilayer perceptron 2 layers, relu and tanh
    ## Define mlp structure
    mlp_md2 = tf.keras.models.Sequential([
          tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'),
          tf.keras.layers.Dense(mid_layer_2, activation = 'tanh'),
          tf.keras.layers.Dense(2, activation = 'softmax')
    ])

    ## Compile and fit model to data
    mlp_md2.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    mlp_md2.fit(X_train, tf.keras.utils.to_categorical(Y_train), epochs = 100, batch_size = batch_size, verbose = 0)

    ## Predict probability
    predict_md2 = mlp_md2.predict(X_test)[:,1]

    return predict_md2

In [54]:
## build the appropriate model and update the result dataset after each model is built
def update_results(X_train, X_test, Y_train, Y_test, results, combo_number):
    parameters = results.loc[combo_number]
    
    if parameters['mlp'] == 'mlp1_tanh':
        pred = mlp1_tanh_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['mlp'] == 'mlp1_relu':
        pred = mlp1_relu_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['mlp'] == 'mlp2_tanh':
        pred = mlp2_tanh_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['mid_layer_2'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['mlp'] == 'mlp2_relu':
        pred = mlp2_relu_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['mid_layer_2'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['mlp'] == 'mlp2_tanh_relu':
        pred = mlp2_tanh_relu_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['mid_layer_2'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['mlp'] == 'mlp2_relu_tanh':
        pred = mlp2_relu_tanh_predict(X_train, X_test, Y_train, parameters['input_layer'], parameters['mid_layer_1'], parameters['mid_layer_2'], parameters['batch_size'])
        update_result_scores(pred, Y_test, results, combo_number)

## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [55]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read MLP data stored in s3 file
data_file_name = 'project_mlp_result_with_batch.csv'
results = read_data_from_s3(data_file_name)

scaler = MinMaxScaler()

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(results.at[1, 'total_loops'], 100):
    
    ## Build MLP models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            else:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
                
        ## scale input variables to 0-1 scale
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(data_file_name, results)


In [56]:
## Get number of loops already run
loops_run = results.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## Create a dataframe to store values on a cut-off and append values for other cut-offs
cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

## Appending cut=offs
for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.2_f1,index.1,0.25_f1,index.2,0.3_f1,index.3,0.35_f1,index.4,0.4_f1,index.5,0.45_f1,index.6,0.5_f1
0,128,0.639295,146,0.659928,146,0.672734,146,0.674097,146,0.672721,146,0.667687,146,0.656245
1,146,0.638058,128,0.658484,128,0.669551,92,0.673442,126,0.669026,92,0.643405,92,0.622597
2,92,0.627010,92,0.651750,92,0.665469,126,0.671641,128,0.658789,128,0.641986,128,0.618988
3,148,0.613729,308,0.639769,126,0.657583,128,0.669071,92,0.658009,126,0.638853,108,0.594501
4,74,0.611754,148,0.638848,144,0.653765,108,0.662456,144,0.653821,108,0.627745,126,0.593631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,169,0.494791,393,0.494942,175,0.498497,333,0.279579,177,0.158100,376,0.084284,340,0.043403
482,267,0.494663,301,0.494862,213,0.495894,9,0.265993,171,0.133392,171,0.081072,249,0.040958
483,211,0.494600,355,0.494493,211,0.495685,177,0.263322,195,0.131534,195,0.069908,195,0.040151
484,391,0.494486,265,0.492863,393,0.494867,171,0.227999,393,0.118358,339,0.058393,376,0.031683


## 0.35 looks to be the best cut-off value

In [57]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read MLP data stored in s3 file
data_file_name = 'project_mlp_result_with_batch.csv'
results = read_data_from_s3(data_file_name)

## number of loops already run
loops_run = results.at[0, 'total_loops']

review_df = pd.DataFrame()

## Displaying all average score for the cut-offs chosen to review
for cut_off_value in range(len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_f1'
    next_f1_score_index = pd.DataFrame(results[column_name].sort_values(ascending = [False])).index
    score_columns = list()
    
    for score_to_check in score_to_evaluate:
        
        column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
        
        score_columns.append(column_name)

    parameter_df = results[['extended_data', 'input_layer', 'mid_layer_1', 'mid_layer_2', 'batch_size', 'mlp']].loc[next_f1_score_index].reset_index(drop = True)
    score_df = results[score_columns].loc[next_f1_score_index].reset_index(drop = True) / loops_run
    
    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input', 'mid_layer_1':'mid1', 'mid_layer_2':'mid2', 'batch_size':'batch'})
review_df

Unnamed: 0,ext,input,mid1,mid2,batch,mlp,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,mid1.1,mid2.1,batch.1,mlp.1,0.35_precision,0.35_recall,0.35_f1
0,Y,6,4,4,20,mlp2_tanh,0.555694,0.867222,0.672734,Y,6,4,4,20,mlp2_tanh,0.581090,0.820278,0.674097
1,Y,6,4,3,20,mlp2_tanh,0.563047,0.847500,0.669551,Y,6,3,4,20,mlp2_tanh,0.589568,0.809722,0.673442
2,Y,6,3,4,20,mlp2_tanh,0.549435,0.859444,0.665469,Y,6,4,3,20,mlp1_tanh,0.581929,0.815556,0.671641
3,Y,6,4,3,20,mlp1_tanh,0.533583,0.878611,0.657583,Y,6,4,3,20,mlp2_tanh,0.592055,0.793889,0.669071
4,Y,6,4,4,20,mlp1_tanh,0.534700,0.860556,0.653765,Y,6,4,2,20,mlp1_tanh,0.574651,0.801944,0.662456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,N,6,2,2,60,mlp1_relu,0.350748,0.903077,0.498497,N,8,2,2,40,mlp2_relu,0.287668,0.326538,0.279579
482,N,6,2,4,60,mlp2_relu,0.338265,0.946923,0.495894,Y,6,2,2,40,mlp2_relu,0.291768,0.303611,0.265993
483,N,6,2,4,60,mlp1_relu,0.341074,0.931154,0.495685,N,6,2,2,60,mlp2_relu,0.257620,0.326154,0.263322
484,N,8,3,2,60,mlp2_relu,0.336992,0.953846,0.494867,N,6,2,2,40,mlp2_relu,0.243525,0.253077,0.227999


In [None]:
## Excluding these results from the project