# Gradient Boosted Trees
---

In [58]:
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

data_df = pd.read_csv("NHANES_data_stroke_train.csv")

strokes = data_df[data_df["stroke"] == 1]
noStrokes = data_df[data_df["stroke"] == 2]

noStrokes = noStrokes.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputer.fit(strokes)
imputed_data = imputer.transform(strokes)
imputed_df = pd.DataFrame(imputed_data, columns=strokes.columns)

undersample_noStrokes = noStrokes.sample(frac=0.097)

print(len(undersample_noStrokes))
print(len(strokes))

# combine datasets
# Ignore index to concatenate to the appropriate axis
data = pd.concat([imputed_df, undersample_noStrokes], ignore_index=True)

X = data[['Sex', 'Age', 'Race', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]
y = data["stroke"]

avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) 
    # n_estimators = 100 <--- # of trees
    
    # Create a Gradient Boosting classifier with regularization
    gbt = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        min_samples_split=5,
        min_samples_leaf=2,
        subsample=0.8,
        random_state=42,
        verbose=0,
    )
    
    # Fit the model to the data
    gbt.fit(X_train, y_train)
    
    #print("accuracy for train:", clf.score(X_train, y_train)*100)
    acc = gbt.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

136
135
accuracy for test ......  79.41176470588235
accuracy for test ......  73.52941176470588
accuracy for test ......  70.58823529411765
accuracy for test ......  77.94117647058823
accuracy for test ......  66.17647058823529
accuracy for test ......  80.88235294117648
accuracy for test ......  61.76470588235294
accuracy for test ......  75.0
accuracy for test ......  89.70588235294117
accuracy for test ......  77.94117647058823
* Average accuracy *:  75.29411764705883


# Random Forest
---

In [71]:
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split


data_df = pd.read_csv("NHANES_data_stroke_train.csv")

strokes = data_df[data_df["stroke"] == 1]
noStrokes = data_df[data_df["stroke"] == 2]

noStrokes = noStrokes.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputer.fit(strokes)
imputed_data = imputer.transform(strokes)
imputed_df = pd.DataFrame(imputed_data, columns=strokes.columns)

undersample_noStrokes = noStrokes.sample(frac=0.097)

print(len(undersample_noStrokes))
print(len(strokes))

# combine datasets
# Ignore index to concatenate to the appropriate axis
data = pd.concat([imputed_df, undersample_noStrokes], ignore_index=True)

X = data[['Sex', 'Age', 'Race', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]
y = data["stroke"]

avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, stratify=y) 
    # n_estimators = 100 <--- # of trees
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, criterion='gini', random_state=25)
    clf.fit(X_train, y_train)

    #print("accuracy for train:", clf.score(X_train, y_train)*100)
    acc = clf.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

136
135
accuracy for test ......  70.58823529411765
accuracy for test ......  66.17647058823529
accuracy for test ......  75.0
accuracy for test ......  70.58823529411765
accuracy for test ......  72.05882352941177
accuracy for test ......  61.76470588235294
accuracy for test ......  72.05882352941177
accuracy for test ......  79.41176470588235
accuracy for test ......  75.0
accuracy for test ......  72.05882352941177
* Average accuracy *:  71.47058823529412
