In [None]:
# E1: No
# E2: Yes
# E3: D
# E4: D
# E5: outlook: sunny = (2/5)*(1-(2/5))+(3/5)*(1-(3/5)) = .48, overcast = 0, rainy = (3/5)*(1-(3/5))+(2/5)*(1-(2/5)) = .48, TOTAL = ((5/14)*.48)+((4/14)*0)+((5/14)*.48) = .3428
#     temp: hot = .5, mild = (4/6)*(1-(4/6))+(2/6)*(1-(2/6)) = .44, cool = (3/4)*(1-(3/4))+(1/4)*(1-(1/4)) = .375,         TOTAL = ((4/14)*.5)+((6/14)*.44)+((4/14)*.375) = .438
#     windy: false = (6/8)*(1-(6/8))+(2/8)*(1-(2/8)) = .375, true = .5                                                     TOTAL = ((8/14)*.375)+((6/14)*.5) = .428
#     humidity: high = (3/7)*(1-(3/7))+(4/7)*(1-(4/7)) = .489, normal = (6/7)*(1-(6/7))+(1/7)*(1-(1/7)) = .244             TOTAL = ((7/14)*.489)+((7/14)*.244) = .366
#     the outlook attribute gives the highest information gain (the best split) because its weighted gini impurity (.3428) is the smallest of the four attributes

In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score

# location of the data file in S3
bucket_name, file_key = 'grant-gonnerman-data-445', 'framingham.csv'

# resolving the bucket and the handle of the object stored under file_key
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

# fetching the object; the 'Body' entry of the response is what gets handed to pandas
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# parsing the CSV content into a data-frame and previewing the first rows
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [2]:
# keeping only the complete records: a row is discarded if any of its values is missing
heart = heart.dropna(axis = 0, how = 'any')

In [3]:
# Ranking the predictors of TenYearCHD: a random forest is fitted on many random
# train/test splits and the fitted feature importances are averaged over the runs.
rf_importances = list()

# defining input and target variables
x = heart.drop(columns = 'TenYearCHD')
y = heart['TenYearCHD']

# range(1, 100) yields 99 runs (i = 1..99); use range(100) if 100 runs were intended.
# The run number is also used as the seed so that Restart & Run All reproduces
# exactly the same splits and forests.
for i in range(1, 100):
    # splitting the data (stratified on y so train and test keep the same class balance)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = i)

    # building RF model
    rf_md = RandomForestClassifier(n_estimators = 500, random_state = i).fit(x_train, y_train)

    # extracting feature importances (one value per column of x_train)
    rf_importances.append(rf_md.feature_importances_)

# one row per run, one column per feature; columns are labelled with the feature
# names themselves (x.columns), not with a data-frame wrapped in a list
rf_results = pd.DataFrame(rf_importances, columns = x.columns)

# averaging each feature over the runs; an explicit column-wise mean is used because
# np.mean(data-frame) reduces to a single scalar on recent pandas versions
avg_importances = pd.DataFrame({'Importance': rf_results.mean(axis = 0)})
avg_importances = avg_importances.sort_values(by = 'Importance', ascending = False)
avg_importances

Unnamed: 0,Importance
sysBP,0.135149
BMI,0.127458
age,0.12452
totChol,0.122112
glucose,0.120041
diaBP,0.118767
heartRate,0.095636
cigsPerDay,0.050412
education,0.037007
male,0.021161


In [4]:
# Comparing three random forests that differ only in max_depth (3, 5 and 7) by
# their average recall over repeated stratified train/test splits.
rf1_results = list()
rf2_results = list()
rf3_results = list()

# defining top 5 previous input and target variables
x = heart[['sysBP', 'BMI', 'age', 'totChol', 'glucose']]
y = heart['TenYearCHD']


def _recall_at_depth(max_depth, x_train, x_test, y_train, y_test, seed, cutoff = 0.1):
    """Fit a 500-tree random forest and return its recall on the test split.

    Parameters
    ----------
    max_depth : maximum depth passed to RandomForestClassifier.
    x_train, x_test, y_train, y_test : one train/test split of the inputs and target.
    seed : random_state of the forest, so the fit is reproducible.
    cutoff : predicted likelihoods of class 1 below this value are labelled 0,
        everything else is labelled 1.

    Returns
    -------
    The recall_score of the thresholded predictions against y_test.
    """
    # building the RF model
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = max_depth, random_state = seed).fit(x_train, y_train)
    # predicting the likelihood of the positive class
    rf_pred = rf_md.predict_proba(x_test)[:, 1]
    # changing likelihoods to labels
    rf_labels = np.where(rf_pred < cutoff, 0, 1)
    # computing recall
    return recall_score(y_test, rf_labels)


# range(1, 100) yields 99 runs (i = 1..99); use range(100) if 100 runs were intended.
# The run number seeds both the split and the forests, so the averages below are
# reproducible and all three models are scored on the same split in every run.
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = i)

    # first, second and third RF model: max_depth of 3, 5 and 7
    rf1_results.append(_recall_at_depth(3, x_train, x_test, y_train, y_test, seed = i))
    rf2_results.append(_recall_at_depth(5, x_train, x_test, y_train, y_test, seed = i))
    rf3_results.append(_recall_at_depth(7, x_train, x_test, y_train, y_test, seed = i))

print('average recall of model 1:', np.mean(rf1_results))
print('average recall of model 2:', np.mean(rf2_results))
print('average recall of model 3:', np.mean(rf3_results))

average recall of model 1: 0.8397366522366522
average recall of model 2: 0.8227813852813852
average recall of model 3: 0.8060064935064934


In [None]:
# based on the results I would use model 1 (max_depth = 3) to predict TenYearCHD because it has the highest average recall over all iterations