In [None]:
# E1: A) 1/100
#     B) The weight would increase/ get larger
# E2: it is an ensemble learning algorithm because it utilizes weak learners grown sequentially with different weights to maximize the efficiency
# E3: e^(t) = (.08 + .12)/(.16 + .64 + .08 + .12) = .2, a^(t) = log((1 - .2)/.2) = .602
#     (note: .602 is the base-10 log of 4; with the natural log, a^(t) = ln(4) = 1.386)
#     obs 1: .16e^(.602*-1) = .087
#     obs 2: .64e^(.602*-1) = .350
#     obs 3: .08e^(.602*1) = .146
#     obs 4: .12e^(.602*1) = .219
# E4: hyper-parameters to tweak would be n_estimators and learning_rate
# E5: E
# E6: F
# E7: AdaBoost assigns higher weights to misclassified observations so that the next learner prioritizes classifying them correctly.
#     gradient boosting uses the residual errors made by the previous predictor to fit a new predictor that minimizes the loss function

In [6]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score, accuracy_score

# location of the data file in S3
bucket_name = 'grant-gonnerman-data-445'
file_key = 'framingham.csv'

# resolving the bucket, then the object stored under file_key
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

# fetching the object; the 'Body' entry of the response is what pandas reads
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# parsing the csv into a data-frame and previewing the first rows
heart = pd.read_csv(filepath_or_buffer = file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
# keeping only the rows that have no missing value in any column
heart = heart.dropna(axis = 0, how = 'any')

In [13]:
# defining input and target variables
x = heart[['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
y = heart['TenYearCHD']

# likelihood cutoff shared by the three models:
# predicted likelihoods below it are labelled 0, everything else is labelled 1
cutoff = 0.1

# test-set scores of each model, one entry per train/test split
rf_recall = list()
rf_accuracy = list()
ada_recall = list()
ada_accuracy = list()
gb_recall = list()
gb_accuracy = list()

# NOTE(review): range(1, 100) yields 99 repetitions; use range(1, 101) if 100 were intended
for i in range(1, 100):
    # i doubles as the seed of the split and of every model, so that
    # Restart & Run All reproduces the same averages
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = i)

    # building RF model
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(x_train, y_train)
    # predicting the likelihood of class 1 on test
    rf_pred = rf_md.predict_proba(x_test)[:, 1]
    # changing likelihoods to labels
    rf_labels = np.where(rf_pred < cutoff, 0, 1)
    # computing recall and accuracy
    rf_recall.append(recall_score(y_test, rf_labels))
    rf_accuracy.append(accuracy_score(y_test, rf_labels))

    # building Ada Boost model
    # the base tree is passed positionally: its keyword was renamed from
    # base_estimator to estimator across scikit-learn versions, and the
    # positional form is accepted by both
    ada_md = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = .001, random_state = i).fit(x_train, y_train)
    # predicting the likelihood of class 1 on test
    ada_pred = ada_md.predict_proba(x_test)[:, 1]
    # changing likelihoods to labels
    ada_label = np.where(ada_pred < cutoff, 0, 1)
    # computing recall and accuracy
    ada_recall.append(recall_score(y_test, ada_label))
    ada_accuracy.append(accuracy_score(y_test, ada_label))

    # building the gradient boosting model
    # NOTE(review): a recall of exactly 1.0 for this model would mean every test
    # observation was labelled 1 at this cutoff -- presumably the very small
    # learning_rate keeps all likelihoods above it; confirm before trusting the score
    gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = .001, random_state = i).fit(x_train, y_train)
    # predicting the likelihood of class 1 on test
    gb_pred = gb_md.predict_proba(x_test)[:, 1]
    # changing likelihoods to labels
    gb_label = np.where(gb_pred < cutoff, 0, 1)
    # computing recall and accuracy
    gb_recall.append(recall_score(y_test, gb_label))
    gb_accuracy.append(accuracy_score(y_test, gb_label))

In [14]:
# reporting the mean of every score list collected over the train/test splits
score_summaries = (
    ('average recall of RF model', rf_recall),
    ('average accuracy of RF model', rf_accuracy),
    ('average recall of Ada Boost model', ada_recall),
    ('average accuracy of Ada Boost model', ada_accuracy),
    ('average recall of Gradient Boosting model', gb_recall),
    ('average accuracy of Gradient Boosting model', gb_accuracy),
)
for summary_label, split_scores in score_summaries:
    print(summary_label, np.mean(split_scores))

average recall of RF model 0.8516414141414141
average accuracy of RF model 0.4517304189435338
average recall of Ada Boost model 0.9321789321789322
average accuracy of Ada Boost model 0.2892035105149859
average recall of Gradient Boosting model 1.0
average accuracy of Gradient Boosting model 0.15300546448087426


In [None]:
# the models meet the minimum requirement of 80% recall, but none meet the requirement for accuracy
# to attempt bringing the accuracy above the 80% threshold I would try tweaking the hyper-parameters,
# specifically the number of estimators, max depth, and learning rate, to see if different values will yield better results