In [3]:
# George McCannon
# run command:$ jupyter nbconvert --to notebook --inplace --execute Project2.ipynb

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [4]:
# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define the RandomForestClassifier with tuned hyperparameters
clf_rf = RandomForestClassifier(n_estimators=5000, max_depth=3, criterion='gini', min_samples_split=2, min_samples_leaf=2)

# Define the Gradient Boosting model
clf_gbt = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.05,
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    subsample=0.8,
    verbose=0,
)

# define the SVM model; kernals could be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’.              
clf_svm = svm.SVC(kernel="rbf", C=.5, probability=True)

In [10]:
# Load dataset fortraining the mdoel
input_df = pd.read_csv("NHANES_data_stroke_train.csv")

# Under sample the non-stroke
# Due to the large number of MI_negative, drop any with missing values, MI_positive will be imputed later
MI_positive = input_df[input_df['stroke'] == 1]
MI_negative = input_df[input_df['stroke'] == 2]
MI_negative = MI_negative.dropna()
MI_negative = MI_negative.sample(n=len(MI_positive), replace=False)
input_df = pd.concat([MI_positive, MI_negative])

# define attributes
# defaults: ["Income","Sex","Age","Race","Edu","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes","CurrentSmoker","isActive","isInsured"]
featurenames = ["Income","Age","Race","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes"]
X = input_df[featurenames]
y = input_df["stroke"]

# impute and scale the data
X = preprocessing_pipeline.fit_transform(X)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [11]:
# load the data set to make the predictions on
new_data = pd.read_csv("NHANES_data_stroke_test4Students.csv")

# No stroke column so get rid of it
new_data = new_data.drop(columns=['stroke'])

# get attributes
X_new = new_data[featurenames]

# imputer
X_new = preprocessing_pipeline.fit_transform(X_new)

In [12]:
# Random Forest Model
# train the model
clf_rf.fit(X_train, y_train)

# display accuracy information
print("Accuracy for training:", clf_rf.score(X_train, y_train)*100)
print("Accuracy for testing:", clf_rf.score(X_test, y_test)*100)

# Make predictions on the new data, run model
new_probabilities = clf_rf.predict_proba(X_new)[:, 0]  # for output
new_predictions = clf_rf.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('RFpred.csv', index=False)

Accuracy for training: 83.66336633663366
Accuracy for testing: 79.41176470588235


In [13]:
# Graident Boosted Trees Model
# train the model
clf_gbt.fit(X_train, y_train)

# display accuracy information
print("Accuracy for training:", clf_gbt.score(X_train, y_train)*100)
print("Accuracy for testing:", clf_gbt.score(X_test, y_test)*100)

# Make predictions on the new data, run model
new_probabilities = clf_gbt.predict_proba(X_new)[:, 0]  # for output
new_predictions = clf_gbt.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('GBTpred.csv', index=False)

Accuracy for training: 84.65346534653465
Accuracy for testing: 79.41176470588235


In [14]:
# Graident Boosted Trees Model
# train the model
clf_svm.fit(X_train, y_train)

# display accuracy information
print("Accuracy for training:", clf_svm.score(X_train, y_train)*100)
print("Accuracy for testing:", clf_svm.score(X_test, y_test)*100)

# Make predictions on the new data, run model
new_probabilities = clf_svm.predict_proba(X_new)[:, 0]  # for output
new_predictions = clf_svm.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('SVMpred.csv', index=False)

Accuracy for training: 82.67326732673267
Accuracy for testing: 80.88235294117648
