In [1]:
# SVM
# Support vector machines (SVMs) are a set of supervised learning methods used for 
# classification, regression and outliers detection.

# The advantages of support vector machines are:
    # Effective in high dimensional spaces.
    # Still effective in cases where number of dimensions is greater than the number of samples.
    # Uses a subset of training points in the decision function (called support vectors), 
    # so it is also memory efficient.
    # Versatile: different Kernel functions can be specified for the decision function. 
    # Common kernels are provided, but it is also possible to specify custom kernels.

# The disadvantages of support vector machines include:
    # If the number of features is much greater than the number of samples, avoid over-fitting 
    # in choosing Kernel functions and regularization term is crucial.
    # SVMs do not directly provide probability estimates, these are calculated using an 
    # expensive five-fold cross-validation (see Scores and probabilities, below).

# class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', 
# coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, 
# class_weight=None, verbose=False, max_iter=-1, 
# decision_function_shape='ovr', break_ties=False, random_state=None)

from sklearn import svm
import pandas as pd
import numpy as np

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# data
fruits = pd.read_csv('NHANES_data_stroke_train.csv')

#under sample the non-stroke
MI_positive = fruits[fruits['stroke'] == 1]
MI_negitive = fruits[fruits['stroke'] == 2].sample(frac=.03411675511751327)
fruits = pd.concat([MI_positive, MI_negitive])

#
X = fruits[["Income","Sex","Age","Race","Edu","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes","CurrentSmoker","isActive","isInsured"]]
y = fruits['stroke']

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale the features
])

# Fit and transform the preprocessing pipeline
X = preprocessing_pipeline.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.50, random_state=42)

# kernals could be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’.              
clf = svm.SVC(kernel="linear", C=1000, probability=True)

clf.fit(X_train, y_train)
print("accuracy train : ", clf.score(X_train, y_train))
print("accuracy test : ", clf.score(X_test, y_test))

print("predicted probabilities:\n", clf.predict_proba(X_test))


accuracy train :  0.8
accuracy test :  0.6444444444444445
predicted probabilities:
 [[0.567 0.433]
 [0.668 0.332]
 [0.286 0.714]
 [0.422 0.578]
 [0.445 0.555]
 [0.240 0.760]
 [0.294 0.706]
 [0.529 0.471]
 [0.318 0.682]
 [0.453 0.547]
 [0.542 0.458]
 [0.155 0.845]
 [0.325 0.675]
 [0.637 0.363]
 [0.562 0.438]
 [0.458 0.542]
 [0.178 0.822]
 [0.628 0.372]
 [0.513 0.487]
 [0.525 0.475]
 [0.262 0.738]
 [0.203 0.797]
 [0.785 0.215]
 [0.706 0.294]
 [0.571 0.429]
 [0.711 0.289]
 [0.707 0.293]
 [0.500 0.500]
 [0.724 0.276]
 [0.591 0.409]
 [0.569 0.431]
 [0.494 0.506]
 [0.500 0.500]
 [0.464 0.536]
 [0.291 0.709]
 [0.470 0.530]
 [0.646 0.354]
 [0.538 0.462]
 [0.500 0.500]
 [0.201 0.799]
 [0.754 0.246]
 [0.682 0.318]
 [0.330 0.670]
 [0.321 0.679]
 [0.665 0.335]
 [0.372 0.628]
 [0.413 0.587]
 [0.703 0.297]
 [0.637 0.363]
 [0.552 0.448]
 [0.247 0.753]
 [0.233 0.767]
 [0.771 0.229]
 [0.346 0.654]
 [0.615 0.385]
 [0.302 0.698]
 [0.500 0.500]
 [0.210 0.790]
 [0.283 0.717]
 [0.479 0.521]
 [0.491 0.509]
 

In [3]:
#### See which data points are critical #####
# get the support vectors
print("clf support vectors: {}".format(clf.support_vectors_))
# get indices of support vectors
print("clf support vector indices: {}".format(clf.support_))
# get number of support vectors for each class
print("clf # of support vectors in each class: {}".format(clf.n_support_))

clf support vectors: [[-1.086 -0.978 1.140 ... 0.540 0.525 -0.381]
 [-0.916 1.022 -1.222 ... 0.540 0.525 -0.381]
 [1.074 -0.978 0.887 ... 0.540 -1.911 -0.381]
 ...
 [1.080 1.022 1.140 ... 0.540 -1.911 -0.381]
 [0.000 -0.978 1.225 ... 0.540 0.525 -0.381]
 [-0.508 1.022 -0.041 ... 0.540 -1.911 -0.381]]
clf support vector indices: [  0  14  15  19  22  24  34  37  39  43  46  50  53  61  62  63  64  67
  78  81  82  83  85  88  89  93  97  99 100 110 112 113 115 123 125 126
 132   1   7  11  16  18  21  23  27  29  35  38  41  47  48  54  56  58
  60  66  71  73  75  76  77  87  90  94 102 106 107 109 111 114 117 119
 120 124 131]
clf # of support vectors in each class: [37 38]
