In [1]:
## 1. If a decision tree is under-fitting the training dataset, scaling the input features
##    is not a good idea. A tree splits on thresholds, and scaling does not change the relative
##    position of each feature's values within its range, so the equivalent thresholds produce
##    the same splits and the same tree. To reduce under-fitting, the tree structure itself
##    has to change, i.e. the splitting values and/or the depth of the tree.

In [2]:
## 2. If the decision tree is over-fitting the training dataset, it is a good idea to try decreasing max_depth.
##    A high max_depth lets the tree keep splitting to increase node purity, which increases the risk of over-fitting.
##    Decreasing max_depth reduces that risk.

In [3]:
## 3. d. a and b

In [4]:
## 4. d. a and c

In [None]:
## 5. Calculating Gini impurity of each split

## Outlook:       2/5 * 3/5 + 0         + 3/5 * 2/5 = 0.48
## Temperature:   2/4 * 2/4 + 4/6 * 2/6 + 3/4 * 1/4 = 0.66
## Windy:         6/8 * 2/8 + 3/6 * 3/6             = 0.44
## Humidity:      3/7 * 4/7 + 6/7 * 1/7             = 0.37

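## With the unweighted sums above, Humidity has the smallest value. However, the Gini index of a split
## weights each child node by its share of the 14 observations: sum over children of (n_k / 14) * 2 * p * (1 - p).
## Weighted values: Outlook 0.34, Temperature 0.44, Windy 0.43, Humidity 0.37.
## Splitting using Outlook therefore has the lowest Gini impurity index, so this split has the highest information gain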

In [16]:
## 6.a
import boto3
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

## Naming the S3 bucket and the csv file stored in it
bucket_name = 'danhtran358-data-445-bucket'
file_key = 'framingham.csv'

## Opening a handle on the bucket and on the csv object inside it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## Requesting the object; the response carries the file content under 'Body'
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the csv content into a data-frame and previewing the first rows
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [17]:
## 6.b
## Keeping only the rows that have no missing value in any column
heart = heart.dropna(axis = 0, how = 'any')

In [18]:
## 6.c
## Defining input and target variables
X = heart.drop(columns = ['TenYearCHD'])
Y = heart['TenYearCHD']

## Collecting the feature importances of one random forest per random split
importances = list()

for i in range(100):
    # Splitting the data (80% train - 20% test, stratified on the target).
    # Seeding with the iteration number gives 100 different but reproducible splits
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = i)
    
    # Seeding the forest as well, so that re-running the cell reproduces the same importances
    md_imp = RandomForestClassifier(n_estimators = 500, random_state = i).fit(X_train, Y_train)
    
    importances.append(md_imp.feature_importances_)
    
## One row per repetition, one column per input feature
importances = pd.DataFrame(importances, columns = X.columns)

## Average importance of each feature over the 100 repetitions
importances.mean(axis = 0)

male               0.021152
age                0.124537
education          0.036868
currentSmoker      0.012583
cigsPerDay         0.050475
BPMeds             0.007038
prevalentStroke    0.003400
prevalentHyp       0.018310
diabetes           0.006550
totChol            0.121941
sysBP              0.135373
diaBP              0.118899
BMI                0.127376
heartRate          0.095935
glucose            0.119561
dtype: float64

In [19]:
## 6.d
## Defining input and target variables
## (the five features with the highest average importance in 6.c)
X = heart[['age', 'totChol', 'sysBP', 'BMI', 'glucose']]
Y = heart['TenYearCHD']

## Predicted probabilities below the cut-off are labelled 0, all others 1
cutoff = 0.1

## Tree depths to compare: random forest 1, 2 and 3 respectively
depths = (3, 5, 7)

## Recall scores on the test set, one list per depth
recalls = {depth: list() for depth in depths}

for i in range(100):
    # Splitting the data; the seed makes each of the 100 splits reproducible
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = i)
    
    # Fitting and scoring the three forests on the very same split
    for depth in depths:
        md = RandomForestClassifier(n_estimators = 500, max_depth = depth, random_state = i).fit(X_train, Y_train)
        
        # Probability of the positive class, turned into a label with the cut-off
        pred = md.predict_proba(X_test)[:, 1]
        pred = np.where(pred < cutoff, 0, 1)
        recalls[depth].append(recall_score(Y_test, pred))

## Keeping the per-model lists available under their usual names
recall_md1, recall_md2, recall_md3 = (recalls[depth] for depth in depths)

print('Random forest 1', np.mean(recall_md1))
print('Random forest 2', np.mean(recall_md2))
print('Random forest 3', np.mean(recall_md3))

Random forest 1 0.8418749999999998
Random forest 2 0.8271428571428572
Random forest 3 0.8130357142857144


In [None]:
## I would use the first random forest (max_depth = 3) because it has the highest average recall score