In [2]:
# SVM
# Support vector machines (SVMs) are a set of supervised learning methods used for 
# classification, regression and outliers detection.

# The advantages of support vector machines are:
    # Effective in high dimensional spaces.
    # Still effective in cases where number of dimensions is greater than the number of samples.
    # Uses a subset of training points in the decision function (called support vectors), 
    # so it is also memory efficient.
    # Versatile: different Kernel functions can be specified for the decision function. 
    # Common kernels are provided, but it is also possible to specify custom kernels.

# The disadvantages of support vector machines include:
    # If the number of features is much greater than the number of samples, avoid over-fitting 
    # in choosing Kernel functions and regularization term is crucial.
    # SVMs do not directly provide probability estimates, these are calculated using an 
    # expensive five-fold cross-validation (see Scores and probabilities, below).

# class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', 
# coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, 
# class_weight=None, verbose=False, max_iter=-1, 
# decision_function_shape='ovr', break_ties=False, random_state=None)

from sklearn import svm
import pandas as pd
import numpy as np

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# data
fruits = pd.read_csv('NHANES_data_stroke_train.csv')

#under sample the non-stroke
MI_positive = fruits[fruits['stroke'] == 1]
MI_negitive = fruits[fruits['stroke'] == 2].sample(frac=.03411675511751327)
fruits = pd.concat([MI_positive, MI_negitive])

#
X = fruits[["Income","Sex","Age","Race","Edu","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes","CurrentSmoker","isActive","isInsured"]]
y = fruits['stroke']

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale the features
])

# Fit and transform the preprocessing pipeline
X = preprocessing_pipeline.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.50, random_state=42)

# kernals could be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’.              
clf = svm.SVC(kernel="linear", C=1000, probability=True)

clf.fit(X_train, y_train)
print("accuracy train : ", clf.score(X_train, y_train))
print("accuracy test : ", clf.score(X_test, y_test))

print("predicted probabilities:\n", clf.predict_proba(X_test))


accuracy train :  0.762962962962963
accuracy test :  0.7185185185185186
predicted probabilities:
 [[0.610 0.390]
 [0.318 0.682]
 [0.317 0.683]
 [0.461 0.539]
 [0.216 0.784]
 [0.347 0.653]
 [0.211 0.789]
 [0.552 0.448]
 [0.517 0.483]
 [0.641 0.359]
 [0.444 0.556]
 [0.669 0.331]
 [0.756 0.244]
 [0.685 0.315]
 [0.371 0.629]
 [0.416 0.584]
 [0.361 0.639]
 [0.360 0.640]
 [0.414 0.586]
 [0.584 0.416]
 [0.357 0.643]
 [0.480 0.520]
 [0.672 0.328]
 [0.640 0.360]
 [0.476 0.524]
 [0.699 0.301]
 [0.640 0.360]
 [0.369 0.631]
 [0.579 0.421]
 [0.468 0.532]
 [0.567 0.433]
 [0.444 0.556]
 [0.444 0.556]
 [0.434 0.566]
 [0.370 0.630]
 [0.438 0.562]
 [0.690 0.310]
 [0.607 0.393]
 [0.489 0.511]
 [0.272 0.728]
 [0.711 0.289]
 [0.549 0.451]
 [0.324 0.676]
 [0.441 0.559]
 [0.553 0.447]
 [0.293 0.707]
 [0.516 0.484]
 [0.360 0.640]
 [0.703 0.297]
 [0.419 0.581]
 [0.204 0.796]
 [0.356 0.644]
 [0.352 0.648]
 [0.292 0.708]
 [0.568 0.432]
 [0.466 0.534]
 [0.214 0.786]
 [0.269 0.731]
 [0.362 0.638]
 [0.427 0.573]
 [

In [4]:
#### See which data points are critical #####
# get the support vectors
print("clf support vectors: {}".format(clf.support_vectors_))
# get indices of support vectors
print("clf support vector indices: {}".format(clf.support_))
# get number of support vectors for each class
print("clf # of support vectors in each class: {}".format(clf.n_support_))

clf support vectors: [[-1.125 -1.015 1.225 ... 0.557 0.540 -0.348]
 [0.000 0.985 -0.495 ... -1.794 0.540 -0.348]
 [-0.943 0.985 -1.068 ... 0.557 0.540 -0.348]
 ...
 [0.074 0.985 0.733 ... 0.557 0.540 -0.348]
 [-0.155 0.985 -0.086 ... -1.794 0.540 -0.348]
 [-0.903 -1.015 0.570 ... 0.557 0.540 2.882]]
clf support vector indices: [  0  13  14  15  19  22  24  32  34  37  39  40  43  46  52  53  57  61
  62  63  64  67  74  81  82  85  88  89  93  97  99 100 110 112 113 115
 123 125 132 133 134   2   3   5   6  12  16  17  31  38  41  42  45  48
  51  58  59  60  66  73  75  76  77  87  92  94 103 106 108 109 118 119
 121 122 124 127 129]
clf # of support vectors in each class: [41 36]
