In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, recall_score
from sklearn.impute import SimpleImputer

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
#defining input and target
x=heart[['age','currentSmoker','totChol','BMI','heartRate']]
y=heart['TenYearCHD']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = .2,stratify=y)

In [3]:
x_train.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3351.0,3373.0,3389.0
mean,49.664602,0.490265,237.219636,25.828915,76.004721
std,8.604206,0.499979,45.28166,4.1016,12.185572
min,32.0,0.0,107.0,16.48,45.0
25%,42.0,0.0,206.0,23.07,68.0
50%,49.0,0.0,234.0,25.43,75.0
75%,56.0,1.0,264.0,28.04,83.0
max,70.0,1.0,696.0,56.8,143.0


In [4]:
x_test.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,837.0,846.0,848.0
mean,49.266509,0.509434,234.727599,25.694728,75.376179
std,8.440343,0.500206,41.674116,3.993869,11.362078
min,33.0,0.0,133.0,15.54,44.0
25%,42.0,0.0,205.0,23.0525,68.0
50%,48.0,1.0,233.0,25.23,75.0
75%,56.0,1.0,261.0,28.055,82.0
max,68.0,1.0,453.0,40.58,122.0


In [5]:
#defining the imputer
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
imputer.fit(x_train)

#imputing missing values
x_train_imp = pd.DataFrame(imputer.transform(x_train), columns=x_train.columns)
x_test_imp = pd.DataFrame(imputer.transform(x_test), columns=x_train.columns)

In [6]:
x_train_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,3390.0,3390.0,3390.0,3390.0,3390.0
mean,49.664602,0.490265,237.182596,25.826914,76.004425
std,8.604206,0.499979,45.021669,4.091397,12.183787
min,32.0,0.0,107.0,16.48,45.0
25%,42.0,0.0,206.0,23.08,68.0
50%,49.0,0.0,234.0,25.43,75.0
75%,56.0,1.0,263.0,28.02,83.0
max,70.0,1.0,696.0,56.8,143.0


In [7]:
x_test_imp.describe()

Unnamed: 0,age,currentSmoker,totChol,BMI,heartRate
count,848.0,848.0,848.0,848.0,848.0
mean,49.266509,0.509434,234.71816,25.694104,75.376179
std,8.440343,0.500206,41.402702,3.989171,11.362078
min,33.0,0.0,133.0,15.54,44.0
25%,42.0,0.0,206.0,23.0575,68.0
50%,48.0,1.0,233.0,25.23,75.0
75%,56.0,1.0,260.0,28.045,82.0
max,68.0,1.0,453.0,40.58,122.0


In [8]:
scaler=MinMaxScaler()
x_train_imp = pd.DataFrame(scaler.fit_transform(x_train_imp), columns = x_train_imp.columns)
x_test_imp = pd.DataFrame(scaler.fit_transform(x_test_imp), columns = x_test_imp.columns)

**Logistic Regression**

In [10]:
#building the model
logit_md=LogisticRegression().fit(x_train_imp,y_train)

#predicting on test
logit_pred=logit_md.predict_proba(x_test_imp)[:,1]

#changing likelihoods to labels
logit_label=np.where(logit_pred<0.1,0,1)

print(classification_report(y_test,logit_label))

              precision    recall  f1-score   support

           0       0.96      0.25      0.39       719
           1       0.18      0.95      0.31       129

    accuracy                           0.35       848
   macro avg       0.57      0.60      0.35       848
weighted avg       0.84      0.35      0.38       848



**Random Forest**

In [13]:
#building the model
rf_md=RandomForestClassifier(n_estimators=500,max_depth=5).fit(x_train_imp,y_train)

#predicting on test
rf_pred=rf_md.predict_proba(x_test_imp)[:,1]

#changing likelihoods to labels
rf_label=np.where(rf_pred<0.1,0,1)

print(classification_report(y_test,rf_label))

              precision    recall  f1-score   support

           0       0.96      0.17      0.29       719
           1       0.17      0.96      0.29       129

    accuracy                           0.29       848
   macro avg       0.57      0.57      0.29       848
weighted avg       0.84      0.29      0.29       848



Based on my results, I'd choose Random Forest model because the recall is higher for predicting 10YearCHD