In [2]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## connect to the S3 bucket that holds the project data
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

## load the cleaned diabetes csv object into a data-frame
bucket_object = bucket.Object('project_cleaned_data.csv')
csv_body = bucket_object.get().get('Body')
diabetes_cleaned = pd.read_csv(csv_body)

## display the loaded data
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [3]:
## load the extended (post-LASSO) diabetes csv object into a data-frame
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
csv_body = bucket_object.get().get('Body')
diabetes_extended = pd.read_csv(csv_body)

## display the loaded data
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [4]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    """Build a data-frame with one row per combination of the given values.

    Parameters
    ----------
    dictionary : dict
        Maps a column name to the list of values that column may take.

    Returns
    -------
    pandas.DataFrame
        The Cartesian product of all value lists, one column per key,
        with the columns in the key order of ``dictionary``.
    """
    combinations = list(product(*dictionary.values()))
    column_names = list(dictionary)
    return pd.DataFrame(combinations, columns = column_names)

## parameter grid: expand_grid turns every combination of these values into one results row
dictionary = {
    'extended_data' : ['Y', 'N'],
    'input_layer': [6, 8],
    'total_loops' : [0],
    'svc' : ['rbf', 'poly', 'linear', 'sigmoid'],
}

## probability cut-off values used to turn predicted probabilities into labels
cut_off = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

## types of score recorded for every cut-off value
score_to_evaluate = ['precision', 'recall', 'f1']

In [5]:
## function to write data_frame to a csv file object in the S3 bucket
def write_data_to_s3(file_name, data_frame):
    """Store ``data_frame`` as a csv object named ``file_name`` in ``bucket``.

    The frame is serialised without its index and uploaded with ``put``,
    using ``file_name`` as the object key.
    """
    ## serialise the data-frame to csv text, leaving the index out
    csv_text = data_frame.to_csv(index=False)

    ## upload the text as the body of the csv file object in the s3 bucket
    bucket.Object(file_name).put(Body = csv_text)
    

## function to read the results csv stored in s3 into a dataframe (the file is created first if it does not exist yet)
def read_data_from_s3(file_name):
    """Load the results table stored as a csv object in the S3 bucket.

    If the object does not exist yet (the metadata request fails with a
    404), a new table is built from the module-level ``dictionary``,
    ``cut_off`` and ``score_to_evaluate``, written to S3 and then read back.

    Parameters
    ----------
    file_name : str
        Key of the csv object inside ``bucket``.

    Returns
    -------
    pandas.DataFrame
        The stored results, or the newly created table with all scores 0.0.

    Raises
    ------
    botocore.exceptions.ClientError
        For any S3 error other than a missing object.
    """
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)

    try:
        ## fails with a ClientError carrying code "404" when the object is missing
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## any other S3 failure must reach the caller instead of turning into a None result
            raise

        ## file does not exist yet, create new file
        results = expand_grid(dictionary)

        ## the combination of extended data with 8 feature columns is not modelled, remove those rows
        unsupported = (results['extended_data'] == 'Y') & (results['input_layer'] == 8)
        results = results.drop(results[unsupported].index)

        ## create columns for all types of cut-off values and scores
        for current_cut_off in cut_off:
            for current_score in score_to_evaluate:
                results[str(current_cut_off) + '_' + current_score] = 0.0

        ## write brand new and empty file to s3
        write_data_to_s3(file_name, results)

    ## existing or newly created, the returned dataframe is always parsed from the stored csv
    return pd.read_csv(data_file.get().get('Body'))

In [6]:
def svc_rbf_predict(X_train, X_test, Y_train):
    """Fit an SVC with kernel = 'rbf' and return probabilities for X_test.

    The model is trained on ``X_train`` / ``Y_train`` with probability
    estimates enabled. The return value is column 1 of ``predict_proba``
    for ``X_test`` (presumably the probability of Outcome == 1; confirm
    against the label encoding).
    """
    ## build and fit the classifier
    classifier = SVC(kernel = 'rbf', probability = True)
    classifier.fit(X_train, Y_train)

    ## keep only the second probability column for the test rows
    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]
    
def svc_poly_predict(X_train, X_test, Y_train):
    """Fit an SVC with kernel = 'poly' and return probabilities for X_test.

    The model is trained on ``X_train`` / ``Y_train`` with probability
    estimates enabled. The return value is column 1 of ``predict_proba``
    for ``X_test`` (presumably the probability of Outcome == 1; confirm
    against the label encoding).
    """
    ## build and fit the classifier
    classifier = SVC(kernel = 'poly', probability = True)
    classifier.fit(X_train, Y_train)

    ## keep only the second probability column for the test rows
    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]
    
def svc_linear_predict(X_train, X_test, Y_train):
    """Fit an SVC with kernel = 'linear' and return probabilities for X_test.

    The model is trained on ``X_train`` / ``Y_train`` with probability
    estimates enabled. The return value is column 1 of ``predict_proba``
    for ``X_test`` (presumably the probability of Outcome == 1; confirm
    against the label encoding).
    """
    ## build and fit the classifier
    classifier = SVC(kernel = 'linear', probability = True)
    classifier.fit(X_train, Y_train)

    ## keep only the second probability column for the test rows
    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]
    
def svc_sigmoid_predict(X_train, X_test, Y_train):
    """Fit an SVC with kernel = 'sigmoid' and return probabilities for X_test.

    The model is trained on ``X_train`` / ``Y_train`` with probability
    estimates enabled. The return value is column 1 of ``predict_proba``
    for ``X_test`` (presumably the probability of Outcome == 1; confirm
    against the label encoding).
    """
    ## build and fit the classifier
    classifier = SVC(kernel = 'sigmoid', probability = True)
    classifier.fit(X_train, Y_train)

    ## keep only the second probability column for the test rows
    class_probabilities = classifier.predict_proba(X_test)
    return class_probabilities[:, 1]

In [7]:
## build the appropriate model and update the result dataset after each model is built
def update_results(X_train, X_test, Y_train, Y_test, results, combo_number):
    """Fit the SVC named in one results row and add its scores to that row.

    The kernel is read from the 'svc' column of row ``combo_number`` of
    ``results``. The matching ``svc_*_predict`` helper produces probabilities
    for ``X_test``, which ``update_result_scores`` then accumulates into
    ``results`` in place. A kernel name other than 'rbf', 'poly', 'linear'
    or 'sigmoid' leaves ``results`` unchanged.
    """
    kernel = results.loc[combo_number]['svc']

    ## one prediction helper per supported kernel
    predictors = {
        'rbf': svc_rbf_predict,
        'poly': svc_poly_predict,
        'linear': svc_linear_predict,
        'sigmoid': svc_sigmoid_predict,
    }

    predict = predictors.get(kernel)
    if predict is None:
        return

    pred = predict(X_train, X_test, Y_train)
    update_result_scores(pred, Y_test, results, combo_number)

## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    """Add the scores of one fitted model to its row of ``results``.

    For every value in the module-level ``cut_off`` list, ``pred`` is turned
    into labels (0 where ``pred`` is below the cut-off, 1 otherwise). For
    every name in ``score_to_evaluate`` the matching metric is computed
    against ``Y_test`` and added to the '<cut_off>_<score>' cell of row
    ``combo_number``. ``results`` is modified in place; nothing is returned.

    Parameters
    ----------
    pred : array-like
        Predicted probabilities (presumably from one of the svc_*_predict helpers).
    Y_test : array-like
        True labels of the test rows.
    results : pandas.DataFrame
        Running totals; must already contain every '<cut_off>_<score>' column.
    combo_number : int
        Index label of the row to update.
    """
    ## metric function for every supported score name
    scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    for current_cut_off in cut_off:

        ## classify labels
        pred_labels = np.where(pred < current_cut_off, 0, 1)

        for current_score in score_to_evaluate:
            scorer = scorers.get(current_score)
            if scorer is None:
                ## unknown score names are ignored
                continue

            ## update the appropriate score
            score_column = str(current_cut_off) + '_' + current_score

            ## zero_division = 0 for every metric: an undefined score counts as 0 without a warning
            results.at[combo_number, score_column] += scorer(Y_test, pred_labels, zero_division = 0)

In [8]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
## (the counter is written to every row, so it is read from the first row)
for loop_number in range(int(results.at[0, 'total_loops']), 100):

    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        if parameters['extended_data'] == 'N':
            ## cleaned data: reduced number of features for input_layer 6, all features otherwise
            features = X_lasso if parameters['input_layer'] == 6 else X
            target = Y

        elif parameters['input_layer'] == 6:
            ## extended data with reduced number of features
            features, target = X_extended, Y_extended

        else:
            ## no data exists for extended data with all features; skip instead of scoring a stale split
            continue

        ## no random_state is passed, so every loop draws a new random stratified split
        X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.2, stratify = target)

        ## scale input variables to 0-1 scale: the scaler is fitted on the training split only
        ## and the same min/max are applied to the test split, so both share one scale
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1
    ## Writing data to s3 after every loop so an interrupted run can resume from the stored counter
    write_data_to_s3(data_file_name, results)


In [9]:
## number of loops already run; the stored scores are totals over that many loops
loops_run = results.at[0, 'total_loops']

## type of score to check
score_to_check = 'f1'

## one (row index, average score) pair of columns per cut-off, each sorted from best to worst
ranked_frames = []
for threshold in cut_off:

    column_name = str(threshold) + '_' + score_to_check
    average_scores = results[column_name].sort_values(ascending = False) / loops_run

    ranked_frames.append(average_scores.to_frame().reset_index())

## place the rankings of all cut-offs side by side
all_f1_scores = pd.concat(ranked_frames, axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,index.5,0.35_f1,index.6,0.4_f1,index.7,0.45_f1,index.8,0.5_f1
0,7,0.60956,2,0.632833,2,0.658268,6,0.671173,6,0.680764,6,0.68305,2,0.674438,3,0.669096,3,0.670555
1,3,0.603189,6,0.627277,6,0.653615,2,0.670093,2,0.676281,2,0.680895,6,0.671745,2,0.66785,7,0.661314
2,11,0.599217,7,0.625588,10,0.642383,0,0.665369,0,0.672751,10,0.673265,10,0.670702,7,0.662689,2,0.65976
3,2,0.596576,3,0.62178,0,0.641736,8,0.660828,8,0.672164,0,0.671365,0,0.668998,6,0.661253,10,0.650129
4,6,0.588659,10,0.615758,7,0.638466,10,0.660296,10,0.667697,8,0.668186,3,0.666254,0,0.659008,6,0.648298
5,10,0.579814,11,0.609302,8,0.635763,4,0.654465,4,0.66123,7,0.6634,7,0.661159,10,0.658933,0,0.646637
6,0,0.510046,0,0.584231,3,0.634676,7,0.647461,7,0.655443,3,0.662024,8,0.655144,8,0.64433,8,0.627787
7,4,0.509678,8,0.579931,4,0.633644,3,0.644683,3,0.652748,4,0.659228,4,0.652029,4,0.638412,11,0.609623
8,8,0.509548,4,0.578127,11,0.613921,1,0.639447,1,0.641007,1,0.623737,11,0.617555,11,0.613138,4,0.606811
9,1,0.499589,1,0.508486,1,0.581304,11,0.617054,9,0.622147,11,0.621182,1,0.593399,1,0.572708,1,0.550452


In [10]:
## 2nd run
## list of cut-off values to evaluate models (replaces the module-level list read by
## read_data_from_s3 and update_result_scores)
cut_off = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]

## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_2nd_run.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
## (the counter is written to every row, so it is read from the first row)
for loop_number in range(int(results.at[0, 'total_loops']), 100):

    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        if parameters['extended_data'] == 'N':
            ## cleaned data: reduced number of features for input_layer 6, all features otherwise
            features = X_lasso if parameters['input_layer'] == 6 else X
            target = Y

        elif parameters['input_layer'] == 6:
            ## extended data with reduced number of features
            features, target = X_extended, Y_extended

        else:
            ## no data exists for extended data with all features; skip instead of scoring a stale split
            continue

        ## no random_state is passed, so every loop draws a new random stratified split
        X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.2, stratify = target)

        ## scale input variables to 0-1 scale: the scaler is fitted on the training split only
        ## and the same min/max are applied to the test split, so both share one scale
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1
    ## Writing data to s3 after every loop so an interrupted run can resume from the stored counter
    write_data_to_s3(data_file_name, results)

## number of loops already run; the stored scores are totals over that many loops
loops_run = results.at[1, 'total_loops']

## type of score to check
score_to_check = 'f1'

## one (row index, average score) pair of columns per cut-off, each sorted from best to worst
ranked_frames = []
for threshold in cut_off:

    column_name = str(threshold) + '_' + score_to_check
    average_scores = results[column_name].sort_values(ascending = False) / loops_run

    ranked_frames.append(average_scores.to_frame().reset_index())

## place the rankings of all cut-offs side by side
all_f1_scores = pd.concat(ranked_frames, axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,...,index.5,0.5_f1,index.6,0.55_f1,index.7,0.6_f1,index.8,0.65_f1,index.9,0.7_f1
0,6,0.627485,6,0.655862,6,0.675448,6,0.684298,6,0.685334,...,6,0.672102,3,0.667547,3,0.66643,3,0.66424,3,0.652821
1,2,0.626885,2,0.650898,10,0.669279,10,0.681876,8,0.684369,...,2,0.66686,7,0.656815,7,0.65558,7,0.641233,7,0.621009
2,3,0.626143,10,0.648457,0,0.666644,8,0.679768,10,0.683003,...,3,0.664243,6,0.654212,6,0.642353,2,0.624445,2,0.60612
3,7,0.62367,0,0.643704,4,0.664958,2,0.677075,2,0.680431,...,10,0.654639,2,0.651903,2,0.639452,6,0.623668,6,0.600562
4,10,0.618262,4,0.638034,2,0.664768,0,0.673822,0,0.67353,...,7,0.653489,10,0.642741,10,0.625672,10,0.608726,10,0.580813
5,11,0.602881,3,0.637595,8,0.664229,4,0.673092,4,0.666541,...,0,0.649563,0,0.610491,11,0.597326,11,0.583764,11,0.564218
6,0,0.581729,8,0.636979,3,0.645387,3,0.65092,3,0.654431,...,8,0.64032,11,0.607514,0,0.567348,0,0.504176,0,0.440807
7,4,0.576654,7,0.632672,7,0.641872,7,0.648959,7,0.652009,...,4,0.618767,8,0.604277,8,0.558393,8,0.494553,9,0.437269
8,8,0.57438,11,0.61189,1,0.636236,1,0.639285,1,0.627834,...,11,0.615342,4,0.567907,4,0.514276,4,0.463171,8,0.430162
9,1,0.516435,1,0.593338,9,0.62442,9,0.630813,9,0.623441,...,9,0.558445,9,0.526576,1,0.495014,9,0.461925,1,0.426095


In [11]:
## 3rd run
## list of cut-off values to evaluate models (replaces the module-level list read by
## read_data_from_s3 and update_result_scores)
cut_off = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55]

## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_3rd_run.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
## (the counter is written to every row, so it is read from the first row)
for loop_number in range(int(results.at[0, 'total_loops']), 100):

    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        if parameters['extended_data'] == 'N':
            ## cleaned data: reduced number of features for input_layer 6, all features otherwise
            features = X_lasso if parameters['input_layer'] == 6 else X
            target = Y

        elif parameters['input_layer'] == 6:
            ## extended data with reduced number of features
            features, target = X_extended, Y_extended

        else:
            ## no data exists for extended data with all features; skip instead of scoring a stale split
            continue

        ## no random_state is passed, so every loop draws a new random stratified split
        X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.2, stratify = target)

        ## scale input variables to 0-1 scale: the scaler is fitted on the training split only
        ## and the same min/max are applied to the test split, so both share one scale
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1
    ## Writing data to s3 after every loop so an interrupted run can resume from the stored counter
    write_data_to_s3(data_file_name, results)

## number of loops already run; the stored scores are totals over that many loops
loops_run = results.at[1, 'total_loops']

## type of score to check
score_to_check = 'f1'

## one (row index, average score) pair of columns per cut-off, each sorted from best to worst
ranked_frames = []
for threshold in cut_off:

    column_name = str(threshold) + '_' + score_to_check
    average_scores = results[column_name].sort_values(ascending = False) / loops_run

    ranked_frames.append(average_scores.to_frame().reset_index())

## place the rankings of all cut-offs side by side
all_f1_scores = pd.concat(ranked_frames, axis = 1)

all_f1_scores

Unnamed: 0,index,0.2_f1,index.1,0.25_f1,index.2,0.3_f1,index.3,0.35_f1,index.4,0.4_f1,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1
0,6,0.656095,6,0.673095,10,0.681905,10,0.684751,10,0.681801,2,0.673503,3,0.66616,3,0.665026
1,10,0.654569,4,0.67302,6,0.679079,6,0.682475,2,0.679312,10,0.668789,2,0.664337,7,0.656345
2,0,0.650666,0,0.671822,2,0.677611,2,0.679747,6,0.677511,6,0.665602,7,0.656136,2,0.655403
3,2,0.650414,10,0.669034,8,0.674489,4,0.67505,4,0.66382,3,0.662784,10,0.653857,10,0.641227
4,4,0.648672,8,0.668463,4,0.673418,8,0.672224,0,0.660283,4,0.655844,6,0.651922,6,0.639622
5,8,0.640274,2,0.667637,0,0.669655,0,0.668583,3,0.657051,7,0.652202,4,0.629281,11,0.615048
6,3,0.635699,3,0.643386,3,0.650168,3,0.654183,8,0.654883,0,0.648238,0,0.624193,4,0.592382
7,7,0.630391,1,0.640178,7,0.645311,7,0.650391,7,0.651055,8,0.64249,8,0.62149,0,0.591311
8,11,0.624318,7,0.639039,11,0.635388,11,0.632109,11,0.63082,11,0.626137,11,0.620241,8,0.584216
9,1,0.592075,11,0.632129,1,0.633279,1,0.619936,1,0.596561,1,0.581525,1,0.557362,1,0.527459


In [12]:
## Cut-off values picked for a closer look
review_cut_off = [0.3, 0.35, 0.4]

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_3rd_run.csv'
results = read_data_from_s3(data_file_name)

## number of loops already run
loops_run = results.at[0, 'total_loops']

parameter_columns = ['extended_data', 'input_layer', 'svc']
review_df = pd.DataFrame()

## For every reviewed cut-off: model parameters and average scores, ordered by f1 from best to worst
for threshold in review_cut_off:

    ## row labels of results sorted by the total f1 score at this cut-off
    f1_ranking = results[str(threshold) + '_f1'].sort_values(ascending = False).index

    ## names of all score columns recorded for this cut-off
    score_columns = [str(threshold) + '_' + score_to_check for score_to_check in score_to_evaluate]

    parameter_df = results.loc[f1_ranking, parameter_columns].reset_index(drop = True)
    score_df = results.loc[f1_ranking, score_columns].reset_index(drop = True) / loops_run

    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

## shorter headers for the parameter columns
short_names = {'extended_data':'ext', 'input_layer':'input', 'svc':'kernel'}
review_df = review_df.rename(columns = short_names)
review_df

Unnamed: 0,ext,input,kernel,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,kernel.1,0.35_precision,0.35_recall,0.35_f1,ext.2,input.2,kernel.2,0.4_precision,0.4_recall,0.4_f1
0,N,8,linear,0.579429,0.847692,0.681905,N,8,linear,0.605928,0.807308,0.684751,N,8,linear,0.632391,0.759615,0.681801
1,N,6,linear,0.582629,0.831538,0.679079,N,6,linear,0.609617,0.793846,0.682475,Y,6,linear,0.617919,0.769167,0.679312
2,Y,6,linear,0.571829,0.846111,0.677611,Y,6,linear,0.595318,0.8075,0.679747,N,6,linear,0.636369,0.744615,0.677511
3,N,8,rbf,0.579953,0.821923,0.674489,N,6,rbf,0.609001,0.779231,0.67505,N,6,rbf,0.631271,0.723846,0.66382
4,N,6,rbf,0.581767,0.819615,0.673418,N,8,rbf,0.607574,0.771154,0.672224,Y,6,rbf,0.647703,0.689444,0.660283
5,Y,6,rbf,0.588682,0.793611,0.669655,Y,6,rbf,0.616243,0.746944,0.668583,Y,6,sigmoid,0.542737,0.865278,0.657051
6,Y,6,sigmoid,0.517475,0.903611,0.650168,Y,6,sigmoid,0.529989,0.885278,0.654183,N,8,rbf,0.621951,0.708846,0.654883
7,N,6,sigmoid,0.511,0.888846,0.645311,N,6,sigmoid,0.52479,0.868846,0.650391,N,6,sigmoid,0.537357,0.840769,0.651055
8,N,8,sigmoid,0.512249,0.86,0.635388,N,8,sigmoid,0.522787,0.824615,0.632109,N,8,sigmoid,0.53779,0.789615,0.63082
9,Y,6,poly,0.601173,0.686667,0.633279,Y,6,poly,0.640899,0.620278,0.619936,Y,6,poly,0.671659,0.558056,0.596561


## Support Vector Machine classifiers with a linear kernel look best on the cleaned dataset at a cut-off value of 0.35, both with the full set of features and with the reduced set of features selected using LASSO