In [1]:
#E1: LOOCV
#E2: C
#E3: B
#E4: It is a process error used to reduce error and avoid overfitting 

In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading data file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# removing NA 
heart = heart.dropna()

In [10]:
# defining input and target variables 
x = heart.drop(columns = ['TenYearCHD'], axis = 1)
y = heart['TenYearCHD']

# defining the folds
kf = KFold(n_splits = 5, shuffle = True)

# defining lists to store results
md1_results = list()
md2_results = list()

for train_idx, val_idx in kf.split(x):
    # splitting the data
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    ## model 1 ##
    x1 = x_train[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
    x_val_1 = x_val[['age', 'currentSmoker', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']]
    
    #transforming the input data
    scaler = MinMaxScaler()
    x1 = scaler.fit_transform(x1)
    x_val_1 = scaler.fit_transform(x_val_1)
    
    # building logistic model 
    md1 = LogisticRegression().fit(x1, y_train)
    
    #predicting on validation set
    pred1 = md1.predict_proba(x_val_1)[:,1]
    
    # changing the likeihood to lables
    md1_labels = np.where(pred1 < .25, 0, 1)
    
    #storing F1 score 
    md1_results.append(f1_score(y_val, md1_labels))
    
    ## model 2 ##
    x2 = x_train[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
    x_val_2 = x_val[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate', 'glucose']]
    
    #transforming the input data
    scaler = MinMaxScaler()
    x2 = scaler.fit_transform(x2)
    x_val_2 = scaler.fit_transform(x_val_2)
    
    # building logistic model 
    md2 = LogisticRegression().fit(x2, y_train)
    
    #predicting on validation set
    pred2 = md2.predict_proba(x_val_2)[:,1]
    
    # changing the likeihood to lables
    md2_labels = np.where(pred2 < .25, 0, 1)
    
    #storing F1 score 
    md2_results.append(f1_score(y_val, md2_labels))

In [11]:
md1_results

[0.36012861736334406,
 0.40924092409240925,
 0.3255813953488372,
 0.3785488958990536,
 0.4037854889589905]

In [12]:
md2_results

[0.34899328859060397,
 0.36363636363636365,
 0.3446153846153846,
 0.30115830115830117,
 0.3793103448275862]

In [13]:
print('the average F1-score for model 1 is: ', np.mean(md1_results))
print('the average F1-score for model 2 is: ', np.mean(md2_results))

the average F1-score for model 1 is:  0.3754570643325269
the average F1-score for model 2 is:  0.34754273656564794


In [None]:
# from the results above we can see that model 1 performed better at predicting TenYearCHD because it's F1 score is larger.