In [1]:
# A random forest is a meta estimator that fits a number of decision tree classifiers on 
# various sub-samples of the dataset and uses averaging to improve the predictive accuracy 
# and control over-fitting. The sub-sample size is controlled with the max_samples parameter 
# if bootstrap=True (default), otherwise the whole dataset is used to build each tree.

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, 
# oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None)

# To execute this notebook in place from a shell:
#   $ jupyter nbconvert --to notebook --inplace --execute Project1.ipynb

import pandas as pd
import numpy as np
# Print every float inside a NumPy array with exactly three digits after the decimal point.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

##############################  TRAINING  ##############################################
# Trains and evaluates a random forest on an under-sampled subset of the stroke data.
# Names left behind for later cells: featurenames, preprocessing_pipeline, clf, X, y,
# avgAccuracy, and X_train / X_test / y_train / y_test from the last repeat.

# One seed drives every random step so that "Restart Kernel -> Run All" reproduces the numbers.
SEED = 42
# Fraction of non-stroke rows kept when under-sampling.
# NOTE(review): presumably chosen so both classes end up roughly the same size -- confirm.
NEGATIVE_SAMPLE_FRAC = .03411675511751327
# Number of independent stratified train/test repeats that are averaged.
N_REPEATS = 10

# Load dataset
input_df = pd.read_csv("NHANES_data_stroke_train.csv")

# Under sample the non-stroke rows (stroke == 2); every stroke row (stroke == 1) is kept
MI_positive = input_df[input_df['stroke'] == 1]
MI_negative = input_df[input_df['stroke'] == 2].sample(frac=NEGATIVE_SAMPLE_FRAC, random_state=SEED)
input_df = pd.concat([MI_positive, MI_negative])

featurenames = ["Income","Sex","Age","Race","Edu","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes","CurrentSmoker","isActive","isInsured"]

X_raw = input_df[featurenames]  # un-imputed, un-scaled features
y = input_df["stroke"]

# Define the preprocessing pipeline: mean-impute missing values, then standardise each column
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

avgAccuracy = []  # held-out accuracy (in percent) of each repeat
for i in range(N_REPEATS):
    # A distinct seed per repeat; a single fixed seed would produce the same split every time.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_raw, y, stratify=y, random_state=SEED + i
    )

    # Fit the imputer/scaler on the training rows only, so that no statistics of the
    # held-out rows leak into the model that is evaluated on them.
    X_train = preprocessing_pipeline.fit_transform(X_train_raw)
    X_test = preprocessing_pipeline.transform(X_test_raw)

    # Define the RandomForestClassifier with tuned hyperparameters
    clf = RandomForestClassifier(
        n_estimators=500,
        max_depth=2,
        criterion='gini',
        min_samples_split=2,
        min_samples_leaf=2,
        random_state=SEED + i,
    )
    clf.fit(X_train, y_train)

    # Print train and test accuracies
    print("Accuracy for train:", clf.score(X_train, y_train)*100)
    acc = clf.score(X_test, y_test)*100
    avgAccuracy.append(acc)
    print(f"Accuracy for test {i + 1}:", acc)

# Print average accuracy across all tests
print("* Average accuracy for all tests *:", np.mean(avgAccuracy))

# X remains the preprocessed feature matrix of all sampled rows. It is transformed with the
# pipeline state from the last repeat, i.e. the same state that produced clf's training data.
X = preprocessing_pipeline.transform(X_raw)

##############################  PREDICTION  ##############################################
# Scores the rows of DEMO.csv with the fitted `clf` and writes one stroke probability per
# participant to RFPred.csv. Relies on `featurenames`, `preprocessing_pipeline` and `clf`
# from the training section.

# Load the data set to score
new_data = pd.read_csv("DEMO.csv")

# The label is not a model input; drop it if the file carries one.
# errors='ignore' keeps this working for files that have no 'stroke' column at all.
new_data = new_data.drop(columns=['stroke'], errors='ignore')

# Select the model inputs in the same column order used for training
X_new = new_data[featurenames]

# Apply the imputation means and scaling learned during training. Calling fit_transform here
# would refit on the new file and feed the model features on a different scale.
X_new = preprocessing_pipeline.transform(X_new)

# predict_proba columns follow clf.classes_; look up the stroke class (label 1) explicitly
# rather than assuming it is the first column.
stroke_column = list(clf.classes_).index(1)
new_probabilities = clf.predict_proba(X_new)[:, stroke_column]  # for output
new_predictions = clf.predict(X_new)  # unused, just for checking the ratio of predicted classes

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['SEQN']
new_output_df = pd.DataFrame({'SEQN': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('RFPred.csv', index=False)


Accuracy for train: 80.6930693069307
Accuracy for test 1: 79.41176470588235
Accuracy for train: 80.19801980198021
Accuracy for test 2: 77.94117647058823


In [5]:
# Feature importance of the forest currently bound to `clf`.
# Uses `clf`, `X_train`, `y_train` and `featurenames` left behind by the training cell.
train_accuracy_pct = clf.score(X_train, y_train)*100
print("accuracy for train:", train_accuracy_pct)

# Impurity-based importances: one value per entry of `featurenames`, averaged over all trees
feature_importances = clf.feature_importances_

# Pair each feature name with its importance and order from most to least important
feature_importance_df = (
    pd.DataFrame({'Feature': featurenames, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print("\n", feature_importance_df)

accuracy for train: 100.0

         Feature  Importance
7  kidneys_eGFR    0.184314
2           Age    0.170259
0        Income    0.159905
3      Systolic    0.128499
6         TCHOL    0.123687
4           BMI    0.113949
5           HDL    0.103187
1           Sex    0.016200
