In [2]:
## Question 
# (a)
import boto3
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR, SVC

# Defining s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabrielferreira-data-455-bucket'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
key_file = 'framingham.csv'

bucket_object = bucket.Object(key_file)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading csv
heart = pd.read_csv(file_content_stream)

# (b)
# Removing missing values
heart = heart.dropna()
heart.head(1)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0


In [11]:
# Defining target and predictor variables
X = heart[['age', 'totChol', 'sysBP', 'BMI', 'heartRate', 'glucose']]
Y = heart['TenYearCHD']

# (i) Splitting the dataset into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# Scaling the inputs to 0-1
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [12]:
#### (iii) Creating AdaBoost Model
AB_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    
# Predicting on test dataset
AB_pred = AB_md.predict_proba(X_test)[:,1]
AB_label = np.where(AB_pred > 0.15, 1, 0)
    
# Computing Recall Score
print("AdaBoost Recal Score: ", round(recall_score(Y_test, AB_label), 2))
    
# Computing Accuracy Score
print("AdaBoost Accuracy  Score: ", round(accuracy_score(Y_test, AB_label), 2))

AdaBoost Recal Score:  0.84
AdaBoost Accuracy  Score:  0.23


In [13]:
#### Creating Gradiet Boost Model
GB_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    
# Predicting on test dataset
GB_pred = GB_md.predict_proba(X_test)[:,1]
GB_label = np.where(GB_pred > 0.15, 1, 0)
    
# Computing Recall Score
print("Gradiet Boost Recal Score: ", round(recall_score(Y_test, GB_label), 2))
    
# Computing Accuracy Score
print("Gradiet Boost accuracy Score: ", round(accuracy_score(Y_test, GB_label), 2))

Gradiet Boost Recal Score:  0.79
Gradiet Boost accuracy Score:  0.47


In [14]:
#### (ii) Creating Support Vector Machine
SVM_md = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)
    
# Predicting on test dataset
SVM_pred = SVM_md.predict_proba(X_test)[:,1]
SVM_label = np.where(SVM_pred > 0.15, 1, 0)
    
# Computing Recall Score
print("Support Vector Machine Recal Score: ", round(recall_score(Y_test, SVM_label), 2))
    
# Computing Accuracy
print("Support Vector Machine Accuracy Score: ", round(accuracy_score(Y_test, SVM_label), 2))

Support Vector Machine Recal Score:  0.62
Support Vector Machine Accuracy Score:  0.55


In [19]:
## Ensembeling of likelyhood
X_rf = pd.concat([pd.DataFrame(AB_pred), pd.DataFrame(GB_pred), pd.DataFrame(SVM_pred), Y_test.reset_index(drop = True)], axis = 1)

# Defining input and target
X_rf_1 = pd.concat([pd.DataFrame(AB_pred), pd.DataFrame(GB_pred), pd.DataFrame(SVM_pred)], axis = 1)

# Building the Random Forest Model
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_rf_1, Y_test)

# Extracting the ensemble likelyhood
RF_preds = RF_md.predict_proba(X_rf_1)[:,1]

# Final results
final_results = pd.concat([pd.DataFrame(AB_pred), pd.DataFrame(GB_pred), pd.DataFrame(SVM_pred), pd.DataFrame(RF_preds), Y_test.reset_index(drop = True)], axis = 1)
final_results.head()

Unnamed: 0,0,0.1,0.2,0.3,TenYearCHD
0,0.210704,0.042467,0.151246,0.200309,1
1,0.626346,0.276724,0.167198,0.290197,1
2,0.355289,0.166887,0.148724,0.121459,0
3,0.479004,0.33081,0.258932,0.249578,0
4,0.266034,0.055741,0.152184,0.097598,0


In [20]:
# Labeling predictions
RF_label = np.where(RF_preds > 0.15, 1, 0)

# Computing Recall Score
print("Random Forest Recal Score: ", round(recall_score(Y_test, RF_label), 2))
    
# Computing Accuracy
print("Random Forest Accuracy Score: ", round(accuracy_score(Y_test, RF_label), 2))

Random Forest Recal Score:  0.7
Random Forest Accuracy Score:  0.69
