In [1]:
# A random forest is a meta estimator that fits a number of decision tree classifiers on 
# various sub-samples of the dataset and uses averaging to improve the predictive accuracy 
# and control over-fitting. The sub-sample size is controlled with the max_samples parameter 
# if bootstrap=True (default), otherwise the whole dataset is used to build each tree.

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, 
# oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None)[source]

# to run: $
# jupyter nbconvert --to notebook --inplace --execute Project1.ipynb

import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from random import randint

#input dataset
fruit_df = pd.read_csv("NHANES_data_stroke_train.csv")

#under sample the non-stroke
MI_positive = fruit_df[fruit_df['stroke'] == 1]
MI_negitive = fruit_df[fruit_df['stroke'] == 2].sample(frac=.03411675511751327)
fruit_df = pd.concat([MI_positive, MI_negitive])

fruit_featureNames = ["Income","Sex","Age","Systolic","BMI","HDL","TCHOL","kidneys_eGFR"]
X = fruit_df[fruit_featureNames]
y = fruit_df["stroke"]

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
X = preprocessing_pipeline.fit_transform(X)

#add features


avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    # n_estimators = 100 <--- # of trees
    clf = RandomForestClassifier(n_estimators=500, max_depth=12, criterion='gini')
    clf.fit(X_train, y_train)

    #print("accuracy for train:", clf.score(X_train, y_train)*100)
    acc = clf.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    # print(f"accuracy for test {i + 1}...... ", acc)

    print(f"* Average accuracy for test {i} *: {sum(avgAccuracy)/len(avgAccuracy)}")

* Average accuracy for test 0 *: 67.64705882352942


* Average accuracy for test 1 *: 63.235294117647065


* Average accuracy for test 2 *: 63.235294117647065


* Average accuracy for test 3 *: 66.1764705882353


* Average accuracy for test 4 *: 64.70588235294119


* Average accuracy for test 5 *: 65.19607843137256


* Average accuracy for test 6 *: 63.655462184873954


* Average accuracy for test 7 *: 63.602941176470594


* Average accuracy for test 8 *: 63.72549019607844


* Average accuracy for test 9 *: 63.52941176470589


In [3]:
# feature importance
print("accuracy for train:", clf.score(X_train, y_train)*100)

# ranked based on the average impurity decrease across all the decision trees in the forest
feature_importances = clf.feature_importances_

# Create a DataFrame to display the feature importances
feature_importance_df = pd.DataFrame({'Feature': fruit_featureNames, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\n", feature_importance_df)

accuracy for train: 100.0

         Feature  Importance
7  kidneys_eGFR    0.188595
3      Systolic    0.161606
2           Age    0.152292
6         TCHOL    0.144194
4           BMI    0.114164
0        Income    0.111462
5           HDL    0.110881
1           Sex    0.016806
