In [2]:
# Gradient boosting is a machine learning technique based on boosting in a functional space, where the target 
# is pseudo-residuals rather than the typical residuals used in traditional boosting. It gives a prediction 
# model in the form of an ensemble of weak prediction models, i.e., models that make very few assumptions about 
# the data, which are typically simple decision trees.[wiki]

# Gradient Boosted Decision Trees is a generalization of boosting to arbitrary differentiable loss functions
# https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosted-trees

# class sklearn.ensemble.GradientBoostingClassifier(*, loss='log_loss', learning_rate=0.1, n_estimators=100, 
# subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, 
# max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)

import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale the features
])

# Define the Gradient Boosting model
gbt = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.05,
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    subsample=0.8,
    verbose=0,
)

##############################  TRAINING  ##############################################
#input dataset
input_df = pd.read_csv("NHANES_data_stroke_train.csv")

# Under sample the non-stroke
# Due to the large number of MI_positive, drop any with missing values, MI_negative will be imputed later
MI_positive = input_df[input_df['stroke'] == 1]
MI_negative = input_df[input_df['stroke'] == 2]
MI_negative = MI_negative.dropna()
MI_negative = MI_negative.sample(n=len(MI_positive), replace=False)
input_df = pd.concat([MI_positive, MI_negative])

# attributes
featurenames = ["Income","Age","Race","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes"]
X = input_df[featurenames]
y = input_df["stroke"]

# impute and scale the data
X = preprocessing_pipeline.fit_transform(X)

avgAccuracy = []
for i in range(10):
    # split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) 
    
    # train the model
    gbt.fit(X_train, y_train)
    
    # print accuracy info
    print("accuracy for train:", gbt.score(X_train, y_train)*100)
    acc = gbt.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

##############################  PREDICTION  ##############################################
# load data set
new_data = pd.read_csv("NHANES_data_stroke_test4Students.csv")

# No stroke column so get rid of it
new_data = new_data.drop(columns=['stroke'])

# get attributes
X_new = new_data[featurenames]

# imputer
X_new = preprocessing_pipeline.fit_transform(X_new)

# Make predictions on the new data, run model
new_probabilities = gbt.predict_proba(X_new)[:, 0]  # for output
new_predictions = gbt.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('GBTpred.csv', index=False)

accuracy for train: 88.11881188118812
accuracy for test ......  73.52941176470588
accuracy for train: 83.16831683168317
accuracy for test ......  79.41176470588235
accuracy for train: 87.12871287128714
accuracy for test ......  73.52941176470588
accuracy for train: 87.62376237623762
accuracy for test ......  69.11764705882352
accuracy for train: 85.64356435643565
accuracy for test ......  79.41176470588235
accuracy for train: 84.65346534653465
accuracy for test ......  80.88235294117648
accuracy for train: 86.63366336633663
accuracy for test ......  64.70588235294117
accuracy for train: 84.15841584158416
accuracy for test ......  66.17647058823529
accuracy for train: 87.12871287128714
accuracy for test ......  73.52941176470588
accuracy for train: 85.14851485148515
accuracy for test ......  70.58823529411765
* Average accuracy *:  73.08823529411765


In [3]:
# feature importance

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) 
# n_estimators = 100 <--- # of trees

gbt = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbt.fit(X_train, y_train)
    
print("accuracy for train:", gbt.score(X_train, y_train)*100)

# contribution of a feature in each tree is determined by the improvement it brings to the loss function.
# importance of a feature is calculated by summing up the impurity reductions 
feature_importances = gbt.feature_importances_

# Create a DataFrame to display the feature importances
feature_importance_df = pd.DataFrame({'Feature': featurenames, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\n", feature_importance_df)


accuracy for train: 100.0

           Feature  Importance
13   kidneys_eGFR    0.274765
2             Age    0.152181
0          Income    0.106107
12          TCHOL    0.071210
6        Systolic    0.066189
11            LDL    0.063173
8             BMI    0.047464
5       Diastolic    0.044583
3            Race    0.042969
9             HDL    0.033641
10           Trig    0.032668
7           Pulse    0.029736
14       Diabetes    0.020088
4             Edu    0.015151
1             Sex    0.000047
17      isInsured    0.000027
15  CurrentSmoker    0.000000
16       isActive    0.000000
