In [9]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, recall_score
from sklearn.impute import SimpleImputer, KNNImputer

#defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

#defining the file to be read from s3 bucket
file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datafile
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [27]:
#defining input and target
x=heart[['age','currentSmoker','totChol','BMI','heartRate']]
y=heart['TenYearCHD']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = .2,stratify=y)

In [28]:
x_train.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3354.0,3376.0,3389.0
mean,49.588791,0.492625,236.345259,25.770136,75.833579
std,8.561802,0.500019,43.731579,4.067904,11.969216
min,32.0,0.0,107.0,15.54,45.0
25%,42.0,0.0,206.0,23.03,68.0
50%,49.0,0.0,233.0,25.36,75.0
75%,56.0,1.0,262.0,28.06,83.0
max,70.0,1.0,600.0,56.8,143.0


In [29]:
x_test.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,834.0,843.0,848.0
mean,49.569575,0.5,238.235012,25.929644,76.060142
std,8.618498,0.500295,47.886545,4.128605,12.258719
min,33.0,0.0,113.0,16.59,44.0
25%,42.0,0.0,205.25,23.235,68.0
50%,49.0,0.5,234.0,25.51,75.0
75%,56.0,1.0,265.0,27.96,82.0
max,69.0,1.0,696.0,44.55,140.0


In [30]:
#defining the imputer
imputer = KNNImputer(n_neighbors=5,weights='distance')
imputer.fit(x_train)

#imputing missing values
x_train_imp = pd.DataFrame(imputer.transform(x_train), columns=x_train.columns)
x_test_imp = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns)

In [31]:
x_train_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3390.0,3390.0,3390.0
mean,49.588791,0.492625,236.373729,25.771272,75.832145
std,8.561802,0.500019,43.54998,4.063071,11.967742
min,32.0,0.0,107.0,15.54,45.0
25%,42.0,0.0,206.0,23.040173,68.0
50%,49.0,0.0,234.0,25.36,75.0
75%,56.0,1.0,262.0,28.06,83.0
max,70.0,1.0,600.0,56.8,143.0


In [32]:
x_test_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,848.0,848.0,848.0
mean,49.569575,0.5,237.977213,25.928927,76.060142
std,8.618498,0.500295,47.743708,4.118116,12.258719
min,33.0,0.0,113.0,16.59,44.0
25%,42.0,0.0,205.114429,23.24,68.0
50%,49.0,0.5,234.0,25.51,75.0
75%,56.0,1.0,264.25,27.96,82.0
max,69.0,1.0,696.0,44.55,140.0


**Random Forest**

In [33]:
#building the model
rf_md=RandomForestClassifier(n_estimators=500,max_depth=5).fit(x_train_imp,y_train)

#predicting on test
rf_pred=rf_md.predict_proba(x_test_imp)[:,1]

#changing likelihoods to labels
rf_label=np.where(rf_pred<0.1,0,1)

print(classification_report(y_test,rf_label))

              precision    recall  f1-score   support

           0       0.94      0.37      0.53       719
           1       0.20      0.88      0.33       129

    accuracy                           0.45       848
   macro avg       0.57      0.62      0.43       848
weighted avg       0.83      0.45      0.50       848



**AdaBoost**

In [34]:
#building the model
ada_md=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5),n_estimators=500,learning_rate=.01).fit(x_train_imp,y_train)

#predicting on test
ada_pred=ada_md.predict_proba(x_test_imp)[:,1]

#changing likelihoods to labels
ada_label=np.where(ada_pred<0.1,0,1)

print(classification_report(y_test,ada_label))

              precision    recall  f1-score   support

           0       0.88      0.20      0.33       719
           1       0.16      0.85      0.27       129

    accuracy                           0.30       848
   macro avg       0.52      0.53      0.30       848
weighted avg       0.77      0.30      0.32       848



Based on the results (using recall), I would chose the Random Forest model to predict 10YearCHD