In [1]:
# A random forest is a meta estimator that fits a number of decision tree classifiers on 
# various sub-samples of the dataset and uses averaging to improve the predictive accuracy 
# and control over-fitting. The sub-sample size is controlled with the max_samples parameter 
# if bootstrap=True (default), otherwise the whole dataset is used to build each tree.

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, 
# oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None)[source]

# to run: $
# jupyter nbconvert --to notebook --inplace --execute Project1.ipynb

import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define the RandomForestClassifier with tuned hyperparameters
clf = RandomForestClassifier(n_estimators=2000, max_depth=2, criterion='gini', min_samples_split=2, min_samples_leaf=2)

##############################  TRAINING  ##############################################
# Load dataset
input_df = pd.read_csv("NHANES_data_stroke_train.csv")

# Under sample the non-stroke
# Due to the large number of MI_positive, drop any with missing values, MI_negative will be imputed later
MI_positive = input_df[input_df['stroke'] == 1]
MI_negative = input_df[input_df['stroke'] == 2]
MI_negative.dropna()
MI_negative = MI_negative.sample(n=len(MI_positive), replace=False)
input_df = pd.concat([MI_positive, MI_negative])

# define attributes
featurenames = ["Income","Sex","Age","Race","Edu","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes","CurrentSmoker","isActive","isInsured"]
X = input_df[featurenames]
y = input_df["stroke"]

# impute and scale the data
X = preprocessing_pipeline.fit_transform(X)

avgAccuracy = []
while True:
    # split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    # train the model
    clf.fit(X_train, y_train)

    # Print train and test accuracies
    print("Accuracy for train:", clf.score(X_train, y_train)*100)
    acc = clf.score(X_test, y_test)*100
    avgAccuracy.append(acc)
    print(f"Accuracy for test:", acc)

    if acc >= 76:
        break

# Print average accuracy across all tests
print("* Average accuracy for all tests *:", np.mean(avgAccuracy))

##############################  PREDICTION  ##############################################
# load data set
new_data = pd.read_csv("NHANES_data_stroke_test4Students.csv")

# No stroke column so get rid of it
new_data = new_data.drop(columns=['stroke'])

# get attributes
X_new = new_data[featurenames]

# imputer
X_new = preprocessing_pipeline.fit_transform(X_new)

# Make predictions on the new data, run model
new_probabilities = clf.predict_proba(X_new)[:, 0]  # for output
new_predictions = clf.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('RFPred.csv', index=False)


Accuracy for train: 85.14851485148515
Accuracy for test: 66.17647058823529
Accuracy for train: 82.17821782178217
Accuracy for test: 67.64705882352942
Accuracy for train: 81.68316831683168
Accuracy for test: 66.17647058823529
Accuracy for train: 82.67326732673267
Accuracy for test: 66.17647058823529
Accuracy for train: 85.64356435643565
Accuracy for test: 75.0
Accuracy for train: 82.17821782178217
Accuracy for test: 80.88235294117648
* Average accuracy for all tests *: 70.34313725490196


KeyError: 'SEQN'

In [17]:
# feature importance
print("accuracy for train:", clf.score(X_train, y_train)*100)

# ranked based on the average impurity decrease across all the decision trees in the forest
feature_importances = clf.feature_importances_

# Create a DataFrame to display the feature importances
feature_importance_df = pd.DataFrame({'Feature': featurenames, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\n", feature_importance_df)

accuracy for train: 80.19801980198021

           Feature  Importance
13   kidneys_eGFR    0.237421
2             Age    0.202963
9             HDL    0.086621
6        Systolic    0.079742
12          TCHOL    0.076883
8             BMI    0.067023
0          Income    0.059013
3            Race    0.057709
11            LDL    0.035392
10           Trig    0.033668
14       Diabetes    0.019218
5       Diastolic    0.016505
7           Pulse    0.015389
15  CurrentSmoker    0.004574
4             Edu    0.004267
16       isActive    0.002290
17      isInsured    0.001003
1             Sex    0.000320
