In [1]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

bucket_object = bucket.Object('project_cleaned_data.csv')
## read file content to data-frame
diabetes_cleaned = pd.read_csv(bucket_object.get().get('Body'))
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [2]:
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
## read file content to data-frame
diabetes_extended = pd.read_csv(bucket_object.get().get('Body'))
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [3]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

dictionary = {'extended_data' : ['Y', 'N'], 'input_layer': [6, 8], 'total_loops' : [0], 'svc' : ['rbf', 'poly', 'linear', 'sigmoid']}

## lists of cut-off values and types of score to evaluate models
cut_off = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
score_to_evaluate = ['precision', 'recall', 'f1']

In [4]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_data_from_s3(file_name):
    try:
        ## file object in s3 bucket
        data_file = bucket.Object(file_name)
        
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(dictionary)
            
            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                   
            ## write brand new and empty file to s3
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(data_file.get().get('Body'))

In [5]:
def svc_rbf_predict(X_train, X_test, Y_train):
    ## Building the svc with kernel = 'rbf'
    md_svc_rbf = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)

    ## Making predictions on the test dataset
    pred_svc_rbf = md_svc_rbf.predict_proba(X_test)[:,1]
    
    return pred_svc_rbf
    
def svc_poly_predict(X_train, X_test, Y_train):
    ## Building the svc with kernel = 'poly'
    md_svc_poly = SVC(kernel = 'poly', probability = True).fit(X_train, Y_train)

    ## Making predictions on the test dataset
    pred_svc_poly = md_svc_poly.predict_proba(X_test)[:,1]
    
    return pred_svc_poly
    
def svc_linear_predict(X_train, X_test, Y_train):
    ## Building the svc with kernel = 'rbf'
    md_svc_linear = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)

    ## Making predictions on the test dataset
    pred_svc_linear = md_svc_linear.predict_proba(X_test)[:,1]
    
    return pred_svc_linear
    
def svc_sigmoid_predict(X_train, X_test, Y_train):
    ## Building the svc with kernel = 'rbf'
    md_svc_sigmoid = SVC(kernel = 'sigmoid', probability = True).fit(X_train, Y_train)

    ## Making predictions on the test dataset
    pred_svc_sigmoid = md_svc_sigmoid.predict_proba(X_test)[:,1]
    
    return pred_svc_sigmoid

In [8]:
## build the appropriate model and update the result dataset after each model is built
def update_results(X_train, X_test, Y_train, Y_test, results, combo_number):
    parameters = results.loc[combo_number]
    
    if parameters['svc'] == 'rbf':
        pred = svc_rbf_predict(X_train, X_test, Y_train)
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['svc'] == 'poly':
        pred = svc_poly_predict(X_train, X_test, Y_train)
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['svc'] == 'linear':
        pred = svc_linear_predict(X_train, X_test, Y_train)
        update_result_scores(pred, Y_test, results, combo_number)
    
    elif parameters['svc'] == 'sigmoid':
        pred = svc_sigmoid_predict(X_train, X_test, Y_train)
        update_result_scores(pred, Y_test, results, combo_number)

## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [9]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(results.at[1, 'total_loops'], 100):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            else:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
                
        ## scale input variables to 0-1 scale
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(data_file_name, results)


In [12]:
loops_run = results.at[1, 'total_loops']
score_to_check = 'f1'

cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,index.5,0.35_f1,index.6,0.4_f1,index.7,0.45_f1,index.8,0.5_f1
0,2,0.596751,2,0.642837,2,0.661649,9,0.676333,4,0.686567,2,0.67538,7,0.672906,7,0.676066,7,0.680674
1,3,0.5887,6,0.626545,1,0.661588,5,0.676244,9,0.684976,6,0.674792,6,0.665101,3,0.652845,3,0.653036
2,7,0.586177,10,0.620778,4,0.660058,10,0.674294,2,0.681516,9,0.672678,2,0.660875,6,0.647882,2,0.63222
3,6,0.577903,3,0.617307,0,0.659633,4,0.674215,6,0.680059,7,0.66788,3,0.654229,2,0.646967,6,0.628884
4,10,0.570936,1,0.616475,10,0.65763,6,0.673403,10,0.679497,4,0.667804,4,0.651749,4,0.627891,4,0.613643
5,1,0.540671,7,0.614914,6,0.65749,2,0.671874,5,0.672272,5,0.664739,5,0.647895,5,0.621841,0,0.609735
6,5,0.533774,4,0.614611,5,0.653198,1,0.661128,8,0.661904,10,0.662014,9,0.644279,0,0.620786,5,0.605549
7,8,0.527187,0,0.608978,9,0.644481,0,0.660714,0,0.65582,8,0.66015,8,0.639873,10,0.611121,1,0.605546
8,9,0.524094,8,0.606081,8,0.642245,8,0.658106,1,0.65015,0,0.65183,0,0.638232,1,0.607889,10,0.599376
9,0,0.515529,5,0.600392,3,0.640995,7,0.636043,7,0.648757,3,0.645909,10,0.634823,8,0.607627,8,0.589227


In [13]:
## 2nd run
## lists of cut-off values and types of score to evaluate models
cut_off = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]

## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_2nd_run.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(results.at[1, 'total_loops'], 100):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            else:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
                
        ## scale input variables to 0-1 scale
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(data_file_name, results)

loops_run = results.at[1, 'total_loops']
score_to_check = 'f1'

cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

In [16]:
## 3rd run
## lists of cut-off values and types of score to evaluate models
cut_off = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55]

## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_3rd_run.csv'
results = read_data_from_s3(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(results.at[1, 'total_loops'], 100):
    
    ## Build SVC models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]
        
        if parameters['extended_data'] == 'N':
            
            if parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)
                
            else:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
                
        else:
        
            if parameters['input_layer'] == 6:
                ## extended data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)
                
        ## scale input variables to 0-1 scale
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        
        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)
        
    results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(data_file_name, results)

loops_run = results.at[1, 'total_loops']
score_to_check = 'f1'

cut_off_value = cut_off[0]
column_name = str(cut_off_value) + '_' + score_to_check

all_f1_scores = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()

for cut_off_value in range(1, len(cut_off)):
    
    column_name = str(cut_off[cut_off_value]) + '_' + score_to_check
    next_f1_score = pd.DataFrame(results[column_name].sort_values(ascending = [False]) / loops_run).reset_index()
    
    all_f1_scores = pd.concat([all_f1_scores, next_f1_score], axis = 1)

all_f1_scores

Unnamed: 0,index,0.2_f1,index.1,0.25_f1,index.2,0.3_f1,index.3,0.35_f1,index.4,0.4_f1,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1
0,2,0.66919,5,0.679775,2,0.686833,2,0.68064,7,0.675659,7,0.677439,7,0.677698,7,0.660888
1,1,0.669149,2,0.679061,6,0.68481,6,0.671703,2,0.661917,3,0.652039,3,0.652971,3,0.636878
2,6,0.659645,6,0.675883,4,0.68054,9,0.670013,6,0.655395,2,0.650618,2,0.635144,2,0.618299
3,0,0.659372,4,0.671913,10,0.680243,10,0.668645,3,0.654759,6,0.635923,10,0.618454,4,0.601032
4,5,0.658267,10,0.671809,9,0.67973,7,0.66802,10,0.649777,10,0.632099,6,0.616227,1,0.599811
5,10,0.657326,9,0.67091,5,0.678521,4,0.666129,4,0.648803,4,0.626161,1,0.611996,10,0.596755
6,4,0.651746,1,0.664285,8,0.661663,5,0.66588,9,0.646225,5,0.624666,4,0.610613,6,0.596118
7,9,0.642199,0,0.658952,1,0.659111,8,0.658728,5,0.642342,1,0.621515,5,0.606918,0,0.592323
8,3,0.637893,8,0.657248,0,0.656249,0,0.651644,8,0.635666,0,0.618098,0,0.602746,5,0.583266
9,8,0.637684,7,0.632836,7,0.64598,1,0.651202,0,0.635125,9,0.605669,8,0.586088,8,0.572841


In [42]:
## Reviewing cut off from above dataframe
review_cut_off = [0.25, 0.3, 0.35]

## read SVC data stored in s3 file
data_file_name = 'project_svc_result_3rd_run.csv'
results = read_data_from_s3(data_file_name)

loops_run = results.at[1, 'total_loops']

review_df = pd.DataFrame()

for cut_off_value in range(len(review_cut_off)):
    
    column_name = str(review_cut_off[cut_off_value]) + '_f1'
    next_f1_score_index = pd.DataFrame(results[column_name].sort_values(ascending = [False])).index
    score_columns = list()
    
    for score_to_check in score_to_evaluate:
        
        column_name = str(review_cut_off[cut_off_value]) + '_' + score_to_check
        
        score_columns.append(column_name)

    parameter_df = results[['extended_data', 'input_layer', 'svc']].loc[next_f1_score_index].reset_index(drop = True)
    score_df = results[score_columns].loc[next_f1_score_index].reset_index(drop = True) / loops_run
    
    review_df = pd.concat([review_df, parameter_df, score_df], axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,svc,0.25_precision,0.25_recall,0.25_f1,ext.1,input.1,svc.1,0.3_precision,0.3_recall,0.3_f1,ext.2,input.2,svc.2,0.35_precision,0.35_recall,0.35_f1
0,N,6,poly,0.58969,0.809385,0.679775,Y,6,linear,0.615905,0.781389,0.686833,Y,6,linear,0.647377,0.722778,0.68064
1,Y,6,linear,0.575565,0.832167,0.679061,N,6,linear,0.609202,0.788846,0.68481,N,6,linear,0.635785,0.719538,0.671703
2,N,6,linear,0.566114,0.844538,0.675883,N,6,rbf,0.648542,0.723231,0.68054,N,8,poly,0.669292,0.677769,0.670013
3,N,6,rbf,0.600638,0.769154,0.671913,N,8,linear,0.602892,0.786,0.680243,N,8,linear,0.633526,0.714462,0.668645
4,N,8,linear,0.558744,0.847,0.671809,N,8,poly,0.639181,0.732923,0.67973,N,6,sigmoid,0.573692,0.805538,0.66802
5,N,8,poly,0.573367,0.814769,0.67091,N,6,poly,0.639162,0.729692,0.678521,N,6,rbf,0.673233,0.667538,0.666129
6,Y,6,poly,0.578081,0.786,0.664285,N,8,rbf,0.608526,0.732154,0.661663,N,6,poly,0.670976,0.668846,0.66588
7,Y,6,rbf,0.591436,0.749722,0.658952,Y,6,poly,0.626831,0.701056,0.659111,N,8,rbf,0.643637,0.681615,0.658728
8,N,8,rbf,0.557711,0.806,0.657248,Y,6,rbf,0.634398,0.685111,0.656249,Y,6,rbf,0.672281,0.638444,0.651644
9,N,6,sigmoid,0.509872,0.838923,0.632836,N,6,sigmoid,0.535086,0.82,0.64598,Y,6,poly,0.661682,0.647278,0.651202
