In [1]:
## import packages and modules
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')
bucket_object = bucket.Object('project_cleaned_data.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes = pd.read_csv(file_content_stream)
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [2]:
## defining input and target variables
X = diabetes.drop(columns = ['Outcome'])
Y = diabetes['Outcome']

In [3]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_data_from_s3(file_name):
    try:
        ## file object in s3 bucket
        data_file = bucket.Object(file_name)
        
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            lasso_coef = list()
            results = pd.DataFrame(lasso_coef, columns = X.columns)
            results.at[0, 'total_loops'] = 0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(data_file.get().get('Body'))

In [4]:
## List to store coefficients
lasso_coef = list()

file_name = 'project_lasso_coef.csv'
results = read_data_from_s3(file_name)

for loop_number in range(int(results.at[0, 'total_loops']), 1000):
    
    # Splitting the data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Estimate lambda using cv
    lasso_cv = LassoCV(normalize = True, cv = 20).fit(X_train, Y_train)
    
    ## Building LASSO model
    lasso_md = Lasso(alpha = lasso_cv.alpha_, normalize = True).fit(X_train, Y_train)
    
    lasso_coef.append(lasso_md.coef_)
    lasso_coef_df = pd.DataFrame(lasso_coef, columns = X.columns)
    lasso_coef_df['total_loops'] = loop_number + 1
    
    write_data_to_s3(file_name, lasso_coef_df)

lasso_coef_df = read_data_from_s3(file_name)
lasso_coef_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,total_loops
0,0.00888,0.005662,0.0,0.002969,-0.0,0.00796,0.121045,0.005041,1000
1,0.014686,0.005755,0.0,0.002318,0.0,0.008935,0.170847,0.006046,1000
2,0.005665,0.005004,0.0,0.000927,-0.0,0.008257,0.152042,0.00832,1000
3,0.010149,0.005789,0.000398,0.000572,-0.0,0.009409,0.12591,0.005683,1000
4,0.019601,0.005939,-0.0,0.002652,-0.0,0.008242,0.131973,0.003155,1000


In [5]:
## get the count of 0 in each column
print((lasso_coef_df == 0).sum())

Pregnancies                  43
Glucose                       0
BloodPressure               639
SkinThickness               126
Insulin                     789
BMI                           0
DiabetesPedigreeFunction      1
Age                           1
total_loops                   0
dtype: int64


In [6]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,89,66,23,94,28.1,0.167,21
1,0,137,40,35,168,43.1,2.288,33
2,3,78,50,32,88,31.0,0.248,26
3,2,197,70,45,543,30.5,0.158,53
4,1,189,60,23,846,30.1,0.398,59


In [7]:
X_lasso = diabetes[['Pregnancies', 'Glucose', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
X_lasso

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,1,89,23,28.1,0.167,21
1,0,137,35,43.1,2.288,33
2,3,78,32,31.0,0.248,26
3,2,197,45,30.5,0.158,53
4,1,189,23,30.1,0.398,59
...,...,...,...,...,...,...
387,0,181,44,43.3,0.222,26
388,1,128,39,36.5,1.057,37
389,2,88,26,28.4,0.766,22
390,10,101,48,32.9,0.171,63


In [8]:
## The clean data from project_cleaned_data.csv has missing value observations deleted

## This process only cleans missing value observation in the columns kept after LASSO
bucket_object = bucket.Object('diabetes.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes_not_cleaned = pd.read_csv(file_content_stream)

## dropping columns after LASSO
diabetes_lasso = diabetes_not_cleaned.drop(columns = ['BloodPressure', 'Insulin'])

## Preprocessing - Clean missing data values
## Glucose missing values
diabetes_cleaned = diabetes_lasso.loc[diabetes_lasso['Glucose'] != 0]
diabetes_cleaned

## SkinThickness missing values
diabetes_cleaned = diabetes_cleaned.loc[diabetes_cleaned['SkinThickness'] != 0]
diabetes_cleaned

## BMI missing values
diabetes_cleaned = diabetes_cleaned.loc[diabetes_cleaned['BMI'] != 0]
diabetes_cleaned

diabetes_cleaned = diabetes_cleaned.reset_index(drop = True)

diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [9]:
## write cleaned data to s3 bucket
write_data_to_s3('project_cleaned_data_extended_after_LASSO.csv', diabetes_cleaned)

In [10]:
X_lasso_extended = diabetes_cleaned.drop(columns = ['Outcome'])
Y_lasso_extended = diabetes_cleaned['Outcome']
X_lasso_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,6,148,35,33.6,0.627,50
1,1,85,29,26.6,0.351,31
2,1,89,23,28.1,0.167,21
3,0,137,35,43.1,2.288,33
4,3,78,32,31.0,0.248,26
...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43
530,10,101,48,32.9,0.171,63
531,2,122,27,36.8,0.340,27
532,5,121,23,26.2,0.245,30


In [49]:
## lists to store results
precision_scores = list()
recall_scores = list()
f1_scores = list()
lasso_precision_scores = list()
lasso_recall_scores = list()
lasso_f1_scores = list()

## Runing on cleaned data extended
extended_precision_scores = list()
extended_recall_scores = list()
extended_f1_scores = list()

kfold = StratifiedKFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

cut_off_values = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]

for i in range(50):
    ## Running on cleaned data
    for train_ix, test_ix in kfold.split(X, Y):
        ## split data into train and validation folds
        X_train, X_test = X.loc[train_ix], X.loc[test_ix]
        X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
        Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
        X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
        X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)
        logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]
        pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]

        precision = list()
        recall = list()
        f1s = list()
        lasso_precision = list()
        lasso_recall = list()
        lasso_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)
            pred_labels_lasso = np.where(pred_lasso < cut_off, 0, 1)

            ## compute and store score
            precision.append(precision_score(Y_test, pred_labels))
            recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            f1s.append(f1_score(Y_test, pred_labels, zero_division = 0))
            lasso_precision.append(precision_score(Y_test, pred_labels_lasso))
            lasso_recall.append(recall_score(Y_test, pred_labels_lasso, zero_division = 0))
            lasso_f1.append(f1_score(Y_test, pred_labels_lasso))

        ## compute and store score
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1s)
        lasso_precision_scores.append(lasso_precision)
        lasso_recall_scores.append(lasso_recall)
        lasso_f1_scores.append(lasso_f1)

    for train_ix, test_ix in kfold.split(X_lasso_extended, Y_lasso_extended):
        ## split data into train and validation folds
        X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
        Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]

        extended_precision = list()
        extended_recall = list()
        extended_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)

            extended_precision.append(precision_score(Y_test, pred_labels))
            extended_recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            extended_f1.append(f1_score(Y_test, pred_labels, zero_division = 0))


        ## compute and store score
        extended_precision_scores.append(extended_precision)
        extended_recall_scores.append(extended_recall)
        extended_f1_scores.append(extended_f1)

## compute average scores
avg_precision_score = np.mean(precision_scores, axis = 0)
avg_recall_score = np.mean(recall_scores, axis = 0)
avg_f1_score = np.mean(f1_scores, axis = 0)
avg_precision_score_lasso = np.mean(lasso_precision_scores, axis = 0)
avg_recall_score_lasso = np.mean(lasso_recall_scores, axis = 0)
avg_f1_score_lasso = np.mean(lasso_f1_scores, axis = 0)

## compute average scores
avg_precision_score_extended = np.mean(extended_precision_scores, axis = 0)
avg_recall_score_extended = np.mean(extended_recall_scores, axis = 0)
avg_f1_score_extended = np.mean(extended_f1_scores, axis = 0)



print('Cut-off values:', cut_off_values)
print('Logistic with all features')
print('Precision:', avg_precision_score)
print('Recall:   ', avg_recall_score)
print('F1 score: ', avg_f1_score)
print()
print('Logistic LASSO')
print('Precision:', avg_precision_score_lasso)
print('Recall:   ', avg_recall_score_lasso)
print('F1 score:', avg_f1_score_lasso)
print()
print('Logistic with exteneded data')
print('Precision:', avg_precision_score_extended)
print('Recall:   ', avg_recall_score_extended)
print('F1 score: ', avg_f1_score_extended)

Cut-off values: [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]
Logistic with all features
Precision: [0.38983683 0.42997625 0.46419359 0.49581232 0.52459059 0.54976102
 0.57774468 0.59993754 0.61951608 0.64027304 0.65579906 0.67698005]
Recall:    [0.98569048 0.97390476 0.95971429 0.94066667 0.91438095 0.88664286
 0.85609524 0.81609524 0.77747619 0.73521429 0.68742857 0.63940476]
F1 score:  [0.55732403 0.59404675 0.62221334 0.64469713 0.66050553 0.67187074
 0.68190379 0.68226643 0.67949968 0.67339229 0.659711   0.64509116]

Logistic LASSO
Precision: [0.3885559  0.42932562 0.4643482  0.49606545 0.5263602  0.55396506
 0.58335505 0.60265271 0.62485795 0.6480414  0.66594938 0.69187885]
Recall:    [0.98833333 0.97433333 0.96195238 0.94278571 0.91519048 0.88685714
 0.85154762 0.81195238 0.77130952 0.72785714 0.67954762 0.62847619]
F1 score: [0.55644715 0.59346231 0.62277513 0.64513056 0.66203461 0.67452543
 0.68325337 0.68172031 0.67953857 0.67342292 0.66000671 0.64421226]

In [50]:
## lists to store results
precision_scores = list()
recall_scores = list()
f1_scores = list()
lasso_precision_scores = list()
lasso_recall_scores = list()
lasso_f1_scores = list()

## Runing on cleaned data extended
extended_precision_scores = list()
extended_recall_scores = list()
extended_f1_scores = list()

kfold = StratifiedKFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

cut_off_values = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55]

for i in range(100):
    ## Running on cleaned data
    for train_ix, test_ix in kfold.split(X, Y):
        ## split data into train and validation folds
        X_train, X_test = X.loc[train_ix], X.loc[test_ix]
        X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
        Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
        X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
        X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)
        logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]
        pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]

        precision = list()
        recall = list()
        f1s = list()
        lasso_precision = list()
        lasso_recall = list()
        lasso_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)
            pred_labels_lasso = np.where(pred_lasso < cut_off, 0, 1)

            ## compute and store score
            precision.append(precision_score(Y_test, pred_labels))
            recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            f1s.append(f1_score(Y_test, pred_labels, zero_division = 0))
            lasso_precision.append(precision_score(Y_test, pred_labels_lasso))
            lasso_recall.append(recall_score(Y_test, pred_labels_lasso, zero_division = 0))
            lasso_f1.append(f1_score(Y_test, pred_labels_lasso))

        ## compute and store score
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1s)
        lasso_precision_scores.append(lasso_precision)
        lasso_recall_scores.append(lasso_recall)
        lasso_f1_scores.append(lasso_f1)

    for train_ix, test_ix in kfold.split(X_lasso_extended, Y_lasso_extended):
        ## split data into train and validation folds
        X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
        Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]

        extended_precision = list()
        extended_recall = list()
        extended_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)

            extended_precision.append(precision_score(Y_test, pred_labels))
            extended_recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            extended_f1.append(f1_score(Y_test, pred_labels, zero_division = 0))


        ## compute and store score
        extended_precision_scores.append(extended_precision)
        extended_recall_scores.append(extended_recall)
        extended_f1_scores.append(extended_f1)

## compute average scores
avg_precision_score = np.mean(precision_scores, axis = 0)
avg_recall_score = np.mean(recall_scores, axis = 0)
avg_f1_score = np.mean(f1_scores, axis = 0)
avg_precision_score_lasso = np.mean(lasso_precision_scores, axis = 0)
avg_recall_score_lasso = np.mean(lasso_recall_scores, axis = 0)
avg_f1_score_lasso = np.mean(lasso_f1_scores, axis = 0)

## compute average scores
avg_precision_score_extended = np.mean(extended_precision_scores, axis = 0)
avg_recall_score_extended = np.mean(extended_recall_scores, axis = 0)
avg_f1_score_extended = np.mean(extended_f1_scores, axis = 0)



print('Cut-off values:', cut_off_values)
print('Logistic with all features')
print('Precision:', avg_precision_score)
print('Recall:   ', avg_recall_score)
print('F1 score: ', avg_f1_score)
print()
print('Logistic LASSO')
print('Precision:', avg_precision_score_lasso)
print('Recall:   ', avg_recall_score_lasso)
print('F1 score:', avg_f1_score_lasso)
print()
print('Logistic with exteneded data')
print('Precision:', avg_precision_score_extended)
print('Recall:   ', avg_recall_score_extended)
print('F1 score: ', avg_f1_score_extended)

Cut-off values: [0.3, 0.35, 0.4, 0.45, 0.5, 0.55]
Logistic with all features
Precision: [0.52004324 0.54668851 0.57332968 0.59721876 0.62044361 0.64111421]
Recall:    [0.91246429 0.88485714 0.85355952 0.8192381  0.7807619  0.73577381]
F1 score:  [0.65731634 0.66960286 0.67846687 0.68226798 0.68162062 0.67396454]

Logistic LASSO
Precision: [0.52373126 0.55171645 0.57794395 0.6028595  0.625502   0.64950181]
Recall:    [0.91435714 0.88511905 0.85209524 0.81529762 0.77453571 0.72847619]
F1 score: [0.66024022 0.6729073  0.68051023 0.68344835 0.68126965 0.6742049 ]

Logistic with exteneded data
Precision: [0.50631118 0.53263517 0.55774582 0.58194607 0.60355393 0.62394782]
Recall:    [0.92915972 0.90295833 0.87413194 0.84225694 0.80610417 0.76795833]
F1 score:  [0.65076486 0.66434445 0.67432761 0.68069113 0.68177425 0.67923516]


In [52]:
## lists to store results
precision_scores = list()
recall_scores = list()
f1_scores = list()
lasso_precision_scores = list()
lasso_recall_scores = list()
lasso_f1_scores = list()

## Runing on cleaned data extended
extended_precision_scores = list()
extended_recall_scores = list()
extended_f1_scores = list()

kfold = StratifiedKFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

cut_off_values = [0.4, 0.45, 0.5]

for i in range(100):
    ## Running on cleaned data
    for train_ix, test_ix in kfold.split(X, Y):
        ## split data into train and validation folds
        X_train, X_test = X.loc[train_ix], X.loc[test_ix]
        X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
        Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
        X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
        X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)
        logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]
        pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]

        precision = list()
        recall = list()
        f1s = list()
        lasso_precision = list()
        lasso_recall = list()
        lasso_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)
            pred_labels_lasso = np.where(pred_lasso < cut_off, 0, 1)

            ## compute and store score
            precision.append(precision_score(Y_test, pred_labels))
            recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            f1s.append(f1_score(Y_test, pred_labels, zero_division = 0))
            lasso_precision.append(precision_score(Y_test, pred_labels_lasso))
            lasso_recall.append(recall_score(Y_test, pred_labels_lasso, zero_division = 0))
            lasso_f1.append(f1_score(Y_test, pred_labels_lasso))

        ## compute and store score
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1s)
        lasso_precision_scores.append(lasso_precision)
        lasso_recall_scores.append(lasso_recall)
        lasso_f1_scores.append(lasso_f1)

    for train_ix, test_ix in kfold.split(X_lasso_extended, Y_lasso_extended):
        ## split data into train and validation folds
        X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
        Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
        X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

        ## build logistic models
        logit_md = LogisticRegression().fit(X_train, Y_train)

        ## predict the likelihood
        pred = logit_md.predict_proba(X_test)[:,1]

        extended_precision = list()
        extended_recall = list()
        extended_f1 = list()
        for cut_off in cut_off_values:

            ## label likelihood
            pred_labels = np.where(pred < cut_off, 0, 1)

            extended_precision.append(precision_score(Y_test, pred_labels))
            extended_recall.append(recall_score(Y_test, pred_labels, zero_division = 0))
            extended_f1.append(f1_score(Y_test, pred_labels, zero_division = 0))


        ## compute and store score
        extended_precision_scores.append(extended_precision)
        extended_recall_scores.append(extended_recall)
        extended_f1_scores.append(extended_f1)

## compute average scores
avg_precision_score = np.mean(precision_scores, axis = 0)
avg_recall_score = np.mean(recall_scores, axis = 0)
avg_f1_score = np.mean(f1_scores, axis = 0)
avg_precision_score_lasso = np.mean(lasso_precision_scores, axis = 0)
avg_recall_score_lasso = np.mean(lasso_recall_scores, axis = 0)
avg_f1_score_lasso = np.mean(lasso_f1_scores, axis = 0)

## compute average scores
avg_precision_score_extended = np.mean(extended_precision_scores, axis = 0)
avg_recall_score_extended = np.mean(extended_recall_scores, axis = 0)
avg_f1_score_extended = np.mean(extended_f1_scores, axis = 0)



print('Cut-off values:', cut_off_values)
print('Logistic with all features')
print('Precision:', avg_precision_score)
print('Recall:   ', avg_recall_score)
print('F1 score: ', avg_f1_score)
print()
print('Logistic LASSO')
print('Precision:', avg_precision_score_lasso)
print('Recall:   ', avg_recall_score_lasso)
print('F1 score:', avg_f1_score_lasso)
print()
print('Logistic with exteneded data')
print('Precision:', avg_precision_score_extended)
print('Recall:   ', avg_recall_score_extended)
print('F1 score: ', avg_f1_score_extended)

Cut-off values: [0.4, 0.45, 0.5]
Logistic with all features
Precision: [0.57578834 0.59956262 0.62002972]
Recall:    [0.8540119  0.82057143 0.78014286]
F1 score:  [0.67996997 0.68393827 0.68073847]

Logistic LASSO
Precision: [0.57979227 0.60461662 0.62758102]
Recall:    [0.85164286 0.81470238 0.77385714]
F1 score: [0.681503   0.68430977 0.68151565]

Logistic with exteneded data
Precision: [0.55706592 0.58072734 0.60153341]
Recall:    [0.87514583 0.84190972 0.80265278]
F1 score:  [0.67433163 0.6796894  0.67923558]


In [55]:
cut_off = 0.45

## lists to store results
precision_scores = list()
recall_scores = list()
lasso_precision_scores = list()
lasso_recall_scores = list()

kfold = StratifiedKFold(n_splits = 20, shuffle = True)
scaler = MinMaxScaler()

## Running on cleaned data
for train_ix, test_ix in kfold.split(X, Y):
    ## split data into train and validation folds
    X_train, X_test = X.loc[train_ix], X.loc[test_ix]
    X_train_lasso, X_test_lasso = X_lasso.loc[train_ix], X_lasso.loc[test_ix]
    Y_train, Y_test = Y.loc[train_ix], Y.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)
    X_train_lasso = pd.DataFrame(scaler.fit_transform(X_train_lasso), columns = X_train_lasso.columns)
    X_test_lasso = pd.DataFrame(scaler.fit_transform(X_test_lasso), columns = X_test_lasso.columns)
    
    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)
    logit_lasso_md = LogisticRegression().fit(X_train_lasso, Y_train)
    
    ## predict the likelihood
    pred = logit_md.predict_proba(X_test)[:,1]
    pred_lasso = logit_lasso_md.predict_proba(X_test_lasso)[:,1]
    
    ## label likelihood
    pred = np.where(pred < cut_off, 0, 1)
    pred_lasso = np.where(pred_lasso < cut_off, 0, 1)
    
    ## compute and store F1 score
    precision_scores.append(precision_score(Y_test, pred, zero_division = 0))
    recall_scores.append(recall_score(Y_test, pred, zero_division = 0))
    lasso_precision_scores.append(precision_score(Y_test, pred_lasso, zero_division = 0))
    lasso_recall_scores.append(recall_score(Y_test, pred_lasso, zero_division = 0))

## compute average scores
avg_precision_score = np.mean(precision_scores)
avg_recall_score = np.mean(recall_scores)
avg_precision_score_lasso = np.mean(lasso_precision_scores)
avg_recall_score_lasso = np.mean(lasso_recall_scores)

## Runing on cleaned data extended
extended_precision_scores = list()
extended_recall_scores = list()
    
for train_ix, test_ix in kfold.split(X_lasso_extended, Y_lasso_extended):
    ## split data into train and validation folds
    X_train, X_test = X_lasso_extended.loc[train_ix], X_lasso_extended.loc[test_ix]
    Y_train, Y_test = Y_lasso_extended.loc[train_ix], Y_lasso_extended.loc[test_ix]

    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)

    ## predict the likelihood
    pred = logit_md.predict_proba(X_test)[:,1]

    ## label likelihood
    pred = np.where(pred < cut_off, 0, 1)        

    ## compute and store score
    extended_precision_scores.append(precision_score(Y_test, pred, zero_division = 0))
    extended_recall_scores.append(recall_score(Y_test, pred, zero_division = 0))

## compute average scores
avg_precision_score_extended = np.mean(extended_precision_scores)
avg_recall_score_extended = np.mean(extended_recall_scores)

print('Cut-off values:', cut_off)
print('Logistic with all features')
print('Precision:', avg_precision_score)
print('Recall:  ', avg_recall_score)
print()
print('Logistic LASSO')
print('Precision:', avg_precision_score_lasso)
print('Recall:  ', avg_recall_score_lasso)
print()
print('Logistic with exteneded data')
print('Precision:', avg_precision_score_extended)
print('Recall:  ', avg_recall_score_extended)

Cut-off values: 0.45
Logistic with all features
Precision: 0.5991287878787878
Recall:   0.8238095238095238

Logistic LASSO
Precision: 0.6007539682539681
Recall:   0.8321428571428571

Logistic with exteneded data
Precision: 0.5895005239858181
Recall:   0.8486111111111111


## Logistic Regression with reduced number of features using LASSO looks good with cut off value of 0.45

In [59]:
Y_lasso_extended.value_counts()[1]/Y_lasso_extended.count()

0.3333333333333333