In [63]:
## import packages and modules
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')
bucket_object = bucket.Object('project_cleaned_data.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes = pd.read_csv(file_content_stream)
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [64]:
## defining input and target variables
X = diabetes.drop(columns = ['Outcome'])
Y = diabetes['Outcome']

In [65]:
## List to store coefficients
lasso_coef = list()

for i in range(1000):

    # Splitting the data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Estimate lambda using cv
    lasso_cv = LassoCV(normalize = True, cv = 20).fit(X_train, Y_train)
    
    ## Building LASSO model
    lasso_md = Lasso(alpha = lasso_cv.alpha_, normalize = True).fit(X_train, Y_train)
    
    lasso_coef.append(lasso_md.coef_)

lasso_coef_df = pd.DataFrame(lasso_coef)
lasso_coef_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.010155,0.005077,0.0,0.000415,-0.0,0.009536,0.135296,0.006483
1,0.011337,0.006271,4.2e-05,0.0,0.0,0.011084,0.102664,0.003495
2,0.025795,0.006385,0.00156,0.0,0.000188,0.011271,0.221333,0.002103
3,0.010325,0.006166,-0.0,0.002747,0.0,0.00484,0.125756,0.005772
4,0.004818,0.005709,0.0,0.002792,0.0,0.007325,0.128592,0.007229


In [70]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)

lasso_coef_df_with_col_name = pd.DataFrame(lasso_coef, columns = X.columns)
write_data_to_s3('project_lasso_coef.csv', lasso_coef_df_with_col_name)

In [71]:
## get the count of 0 in each column
print((lasso_coef_df == 0).sum())

0     63
1      0
2    692
3    159
4    865
5      0
6      1
7      0
dtype: int64


In [56]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,89,66,23,94,28.1,0.167,21
1,0,137,40,35,168,43.1,2.288,33
2,3,78,50,32,88,31.0,0.248,26
3,2,197,70,45,543,30.5,0.158,53
4,1,189,60,23,846,30.1,0.398,59


In [73]:
X_lasso = diabetes[['Pregnancies', 'Glucose', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
X_lasso

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,1,89,23,28.1,0.167,21
1,0,137,35,43.1,2.288,33
2,3,78,32,31.0,0.248,26
3,2,197,45,30.5,0.158,53
4,1,189,23,30.1,0.398,59
...,...,...,...,...,...,...
387,0,181,44,43.3,0.222,26
388,1,128,39,36.5,1.057,37
389,2,88,26,28.4,0.766,22
390,10,101,48,32.9,0.171,63


In [75]:
## The clean data from project_cleaned_data.csv has missing value observations deleted

## This process only cleans missing value observation in the columns kept after LASSO
bucket_object = bucket.Object('diabetes.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes_not_cleaned = pd.read_csv(file_content_stream)

## Preprocessing - Clean missing data values
## Glucose missing values
diabetes_cleaned = diabetes_not_cleaned.loc[diabetes_not_cleaned['Glucose'] != 0]
diabetes_cleaned

## SkinThickness missing values
diabetes_cleaned = diabetes_cleaned.loc[diabetes_cleaned['SkinThickness'] != 0]
diabetes_cleaned

## BMI missing values
diabetes_cleaned = diabetes_cleaned.loc[diabetes_cleaned['BMI'] != 0]
diabetes_cleaned

## write cleaned data to s3 bucket
write_data_to_s3('project_cleaned_data_extended_after_LASSO.csv', diabetes_cleaned)

diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


In [83]:
diabetes_cleaned = diabetes_cleaned.reset_index(drop = True)

X_lasso_extended = diabetes_cleaned.drop(columns = ['Outcome'])
Y_lasso_extended = diabetes_cleaned['Outcome']
X_lasso_extended

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,3,78,50,32,88,31.0,0.248,26
...,...,...,...,...,...,...,...,...
529,9,170,74,31,0,44.0,0.403,43
530,10,101,76,48,180,32.9,0.171,63
531,2,122,70,27,0,36.8,0.340,27
532,5,121,72,23,112,26.2,0.245,30


In [134]:
## lists to store results
accuracy_scores = list()
recall_scores = list()
lasso_accuracy_scores = list()
lasso_recall_scores = list()

kfold = KFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

cut_off_values = [0.1, 0.15, 0.2, 0.25, 0.3]

    ## Running on cleaned data
for train_ix, test_ix in kfold.split(X):
    ## split data into train and validation folds
    X_train, X_test = X.loc[train_ix], X.loc[test_ix]
    X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
    Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
    X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
    X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)

    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)
    logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)

    ## predict the likelihood
    pred = logit_md1.predict_proba(X_test)[:,1]
    pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]

    accuracy = list()
    recall = list()
    lasso_accuracy = list()
    lasso_recall = list()
    for cut_off in cut_off_values:

        ## label likelihood
        pred_labels = np.where(pred < cut_off, 0, 1)
        pred_labels_lasso = np.where(pred_lasso < cut_off, 0, 1)

        ## compute and store score
        accuracy.append(accuracy_score(Y_test, pred_labels))
        recall.append(recall_score(Y_test, pred_labels))
        lasso_accuracy.append(accuracy_score(Y_test, pred_labels_lasso))
        lasso_recall.append(recall_score(Y_test, pred_labels_lasso))

    ## compute and store score
    accuracy_scores.append(accuracy)
    recall_scores.append(recall)
    lasso_accuracy_scores.append(lasso_accuracy)
    lasso_recall_scores.append(lasso_recall)

## compute average scores
avg_accuracy_score = np.mean(accuracy_scores, axis = 0)
avg_recall_score = np.mean(recall_scores, axis = 0)
avg_accuracy_score_lasso = np.mean(lasso_accuracy_scores, axis = 0)
avg_recall_score_lasso = np.mean(lasso_recall_scores, axis = 0)

In [135]:
## Runing on cleaned data extended
extended_accuracy_scores = list()
extended_recall_scores = list()
    
for train_ix, test_ix in kfold.split(X_lasso_extended):
    ## split data into train and validation folds
    X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
    Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)

    ## predict the likelihood
    pred = logit_md1.predict_proba(X_test)[:,1]

    extended_accuracy = list()
    extended_recall = list()
    for cut_off in cut_off_values:

        ## label likelihood
        pred_labels = np.where(pred < cut_off, 0, 1)
        
        extended_accuracy.append(accuracy_score(Y_test, pred_labels))
        extended_recall.append(recall_score(Y_test, pred_labels))
        

    ## compute and store score
    extended_accuracy_scores.append(extended_accuracy)
    extended_recall_scores.append(extended_recall)

## compute average scores
avg_accuracy_score_extended = np.mean(extended_accuracy_scores, axis = 0)
avg_recall_score_extended = np.mean(extended_recall_scores, axis = 0)

In [136]:
print('Cut-off values:', cut_off_values)
print('Logistic with all features')
print('Accuracy:', avg_accuracy_score)
print('Recall:  ', avg_recall_score)
print()
print('Logistic LASSO')
print('Accuracy:', avg_accuracy_score_lasso)
print('Recall:  ', avg_recall_score_lasso)
print()
print('Logistic with exteneded data')
print('Accuracy:', avg_accuracy_score_extended)
print('Recall:  ', avg_recall_score_extended)

Cut-off values: [0.1, 0.15, 0.2, 0.25, 0.3]
Logistic with all features
Accuracy: [0.49736842 0.56434211 0.61552632 0.65894737 0.68434211]
Recall:   [0.99       0.95736111 0.92055556 0.90041667 0.87402778]

Logistic LASSO
Accuracy: [0.48473684 0.55368421 0.6        0.64118421 0.66921053]
Recall:   [1.         0.96152778 0.92055556 0.90527778 0.87402778]

Logistic with exteneded data
Accuracy: [0.4960114  0.57834758 0.63276353 0.6775641  0.71673789]
Recall:   [0.98660839 0.98105284 0.96331779 0.94121406 0.89650808]


In [137]:
cut_off = 0.3

## lists to store results
accuracy_scores = list()
recall_scores = list()
lasso_accuracy_scores = list()
lasso_recall_scores = list()

kfold = KFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

## Running on cleaned data
for train_ix, test_ix in kfold.split(X):
    ## split data into train and validation folds
    X_train, X_test = X.loc[train_ix], X.loc[test_ix]
    X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
    Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
    X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
    X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)
    
    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)
    logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)
    
    ## predict the likelihood
    pred = logit_md1.predict_proba(X_test)[:,1]
    pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]
    
    ## label likelihood
    pred = np.where(pred < cut_off, 0, 1)
    pred_lasso = np.where(pred_lasso < cut_off, 0, 1)
    
    ## compute and store F1 score
    accuracy_scores.append(accuracy_score(Y_test, pred))
    recall_scores.append(recall_score(Y_test, pred))
    lasso_accuracy_scores.append(accuracy_score(Y_test, pred_lasso))
    lasso_recall_scores.append(recall_score(Y_test, pred_lasso))

## compute average scores
avg_accuracy_score = np.mean(accuracy_scores)
avg_recall_score = np.mean(recall_scores)
avg_accuracy_score_lasso = np.mean(lasso_accuracy_scores)
avg_recall_score_lasso = np.mean(lasso_recall_scores)

In [108]:
## Runing on cleaned data extended
extended_accuracy_scores = list()
extended_recall_scores = list()
    
for train_ix, test_ix in kfold.split(X_lasso_extended):
    ## split data into train and validation folds
    X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
    Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)

    ## predict the likelihood
    pred = logit_md1.predict_proba(X_test)[:,1]

    ## label likelihood
    pred = np.where(pred < cut_off, 0, 1)        

    ## compute and store score
    extended_accuracy_scores.append(accuracy_score(Y_test, pred))
    extended_recall_scores.append(recall_score(Y_test, pred))

## compute average scores
avg_accuracy_score_extended = np.mean(extended_accuracy_scores)
avg_recall_score_extended = np.mean(extended_recall_scores)

In [109]:
print('Cut-off values:', 0.25)
print('Logistic with all features')
print('Accuracy:', avg_accuracy_score)
print('Recall:  ', avg_recall_score)
print()
print('Logistic LASSO')
print('Accuracy:', avg_accuracy_score_lasso)
print('Recall:  ', avg_recall_score_lasso)
print()
print('Logistic with exteneded data')
print('Accuracy:', avg_accuracy_score_extended)
print('Recall:  ', avg_recall_score_extended)

Cut-off values: 0.25
Logistic with all features
Accuracy: 0.693421052631579
Recall:   0.9346031746031747

Logistic LASSO
Accuracy: 0.6702631578947368
Recall:   0.9103174603174604

Logistic with exteneded data
Accuracy: 0.6849002849002848
Recall:   0.8745833333333335
