In [1]:
# (a)
import boto3
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

# Defining s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabrielferreira-data-455-bucket'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
key_file = 'framingham.csv'

bucket_object = bucket.Object(key_file)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading csv
heart = pd.read_csv(file_content_stream)

# (b)
heart = heart.dropna()
heart.head(1)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0


In [10]:
# Defining input and target variables
X = heart[['male', 'age', 'currentSmoker', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

# Spliting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [11]:
# Adaboost
md1 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500).fit(X_train, Y_train)

# Extracting features importances
md1.feature_importances_

array([0.01690406, 0.10776187, 0.01206625, 0.17811399, 0.18870547,
       0.2422261 ,        nan, 0.14098364])

In [12]:
## Adaboost with top 5 important variables
X_train_ada = X_train[['totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
X_test_ada = X_test[['totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]

In [14]:
md1_top5 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500).fit(X_train_ada, Y_train)

# Predicting on test dataset
pred1 = md1_top5.predict_proba(X_test_ada)[:, 1]
pred1 = np.where(pred1 >= 0.1, 1, 0)

# Computing recall
recall_score(Y_test, pred1)

1.0

In [16]:
## Randoom Forest
md2 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

# Extracting features importances
md2.feature_importances_

array([0.04932816, 0.29242247, 0.00737819, 0.07601317, 0.2985037 ,
       0.08441218, 0.02973172, 0.16221041])

In [18]:
## Adaboost with top 5 important variables
X_train_rf = X_train[['age', 'totChol', 'BMI', 'sysBP', 'glucose']]
X_test_rf = X_test[['age', 'totChol', 'BMI', 'sysBP', 'glucose']]

md2_top5 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_rf, Y_train)

# Predicting on test dataset
pred2 = md2_top5.predict_proba(X_test_rf)[:, 1]
pred2 = np.where(pred2 >= 0.1, 1, 0)

# Computing recall
recall_score(Y_test, pred2)

0.875

#### Giving this particular split, I would use Adaboost model to predict the likelyhood of TenYearCHD