# Gradient Boosted Trees
---

In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

data_df = pd.read_csv("NHANES_data_stroke_train.csv")

strokes = data_df[data_df["stroke"] == 1]
noStrokes = data_df[data_df["stroke"] == 2]

noStrokes = noStrokes.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputer.fit(strokes)
imputed_data = imputer.transform(strokes)
imputed_df = pd.DataFrame(imputed_data, columns=strokes.columns)

undersample_noStrokes = noStrokes.sample(frac=0.097)

print(len(undersample_noStrokes))
print(len(strokes))

# combine datasets
# Ignore index to concatenate to the appropriate axis
data = pd.concat([imputed_df, undersample_noStrokes], ignore_index=True)

X = data[['Sex', 'Age', 'Race', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]
y = data["stroke"]

avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) 
    # n_estimators = 100 <--- # of trees
    
    # Create a Gradient Boosting classifier with regularization
    gbt = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        min_samples_split=5,
        min_samples_leaf=2,
        subsample=0.8,
        random_state=42,
        verbose=0,
    )
    
    # Fit the model to the data
    gbt.fit(X_train, y_train)
    
    #print("accuracy for train:", clf.score(X_train, y_train)*100)
    acc = gbt.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


136
135
accuracy for test ......  67.64705882352942
accuracy for test ......  66.17647058823529
accuracy for test ......  70.58823529411765
accuracy for test ......  67.64705882352942
accuracy for test ......  70.58823529411765
accuracy for test ......  64.70588235294117
accuracy for test ......  69.11764705882352
accuracy for test ......  72.05882352941177
accuracy for test ......  70.58823529411765
accuracy for test ......  82.35294117647058
* Average accuracy *:  70.14705882352942


# Random Forest
---

In [71]:
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split


data_df = pd.read_csv("NHANES_data_stroke_train.csv")

strokes = data_df[data_df["stroke"] == 1]
noStrokes = data_df[data_df["stroke"] == 2]

noStrokes = noStrokes.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputer.fit(strokes)
imputed_data = imputer.transform(strokes)
imputed_df = pd.DataFrame(imputed_data, columns=strokes.columns)

undersample_noStrokes = noStrokes.sample(frac=0.097)

print(len(undersample_noStrokes))
print(len(strokes))

# combine datasets
# Ignore index to concatenate to the appropriate axis
data = pd.concat([imputed_df, undersample_noStrokes], ignore_index=True)

X = data[['Sex', 'Age', 'Race', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]
y = data["stroke"]

avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, stratify=y) 
    # n_estimators = 100 <--- # of trees
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, criterion='gini', random_state=25)
    clf.fit(X_train, y_train)

    #print("accuracy for train:", clf.score(X_train, y_train)*100)
    acc = clf.score(X_test, y_test)*100
    avgAccuracy = avgAccuracy+[acc]
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

136
135
accuracy for test ......  70.58823529411765
accuracy for test ......  66.17647058823529
accuracy for test ......  75.0
accuracy for test ......  70.58823529411765
accuracy for test ......  72.05882352941177
accuracy for test ......  61.76470588235294
accuracy for test ......  72.05882352941177
accuracy for test ......  79.41176470588235
accuracy for test ......  75.0
accuracy for test ......  72.05882352941177
* Average accuracy *:  71.47058823529412


# SVM
---

In [1]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

data_df = pd.read_csv("NHANES_data_stroke_train.csv")

strokes = data_df[data_df["stroke"] == 1]
noStrokes = data_df[data_df["stroke"] == 2]

noStrokes = noStrokes.dropna()

# Fill missing data in MIs using the imputer method considering 5 neighbors
imputer = KNNImputer(n_neighbors=5)
imputer.fit(strokes)
imputed_data = imputer.transform(strokes)
imputed_df = pd.DataFrame(imputed_data, columns=strokes.columns)

undersample_noStrokes = noStrokes.sample(frac=0.097)

print(len(undersample_noStrokes))
print(len(strokes))

# combine datasets
# Ignore index to concatenate to the appropriate axis
data = pd.concat([imputed_df, undersample_noStrokes], ignore_index=True)

X = data[['Sex', 'Age', 'Race', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR', 'Diabetes', 'CurrentSmoker', 'isActive']]
y = data["stroke"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =  \
        train_test_split(X, y, test_size=.30, random_state=42)

# kernals could be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’.              
clf = svm.SVC(kernel="linear", C=1000, probability=True)

clf.fit(X_train, y_train)
print("accuracy train : ", clf.score(X_train, y_train))
print("accuracy test : ", clf.score(X_test, y_test))

print("predicted probabilities:\n", clf.predict_proba(X_test))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


136
135
accuracy train :  0.7936507936507936
accuracy test :  0.6829268292682927
predicted probabilities:
 [[0.634 0.366]
 [0.051 0.949]
 [0.155 0.845]
 [0.940 0.060]
 [0.053 0.947]
 [0.789 0.211]
 [0.050 0.950]
 [0.393 0.607]
 [0.041 0.959]
 [0.017 0.983]
 [0.627 0.373]
 [0.356 0.644]
 [0.013 0.987]
 [0.759 0.241]
 [0.088 0.912]
 [0.236 0.764]
 [0.087 0.913]
 [0.143 0.857]
 [0.544 0.456]
 [0.738 0.262]
 [0.459 0.541]
 [0.381 0.619]
 [0.955 0.045]
 [0.848 0.152]
 [0.161 0.839]
 [0.568 0.432]
 [0.607 0.393]
 [0.810 0.190]
 [0.314 0.686]
 [0.460 0.540]
 [0.653 0.347]
 [0.534 0.466]
 [0.245 0.755]
 [0.088 0.912]
 [0.204 0.796]
 [0.850 0.150]
 [0.701 0.299]
 [0.581 0.419]
 [0.546 0.454]
 [0.416 0.584]
 [0.917 0.083]
 [0.772 0.228]
 [0.040 0.960]
 [0.108 0.892]
 [0.898 0.102]
 [0.252 0.748]
 [0.500 0.500]
 [0.092 0.908]
 [0.811 0.189]
 [0.031 0.969]
 [0.080 0.920]
 [0.034 0.966]
 [0.022 0.978]
 [0.841 0.159]
 [0.706 0.294]
 [0.153 0.847]
 [0.337 0.663]
 [0.122 0.878]
 [0.846 0.154]
 [0.403 