In [2]:
# SVM
# Support vector machines (SVMs) are a set of supervised learning methods used for 
# classification, regression and outliers detection.

# The advantages of support vector machines are:
    # Effective in high dimensional spaces.
    # Still effective in cases where number of dimensions is greater than the number of samples.
    # Uses a subset of training points in the decision function (called support vectors), 
    # so it is also memory efficient.
    # Versatile: different Kernel functions can be specified for the decision function. 
    # Common kernels are provided, but it is also possible to specify custom kernels.

# The disadvantages of support vector machines include:
    # If the number of features is much greater than the number of samples, avoid over-fitting 
    # in choosing Kernel functions and regularization term is crucial.
    # SVMs do not directly provide probability estimates, these are calculated using an 
    # expensive five-fold cross-validation (see Scores and probabilities, below).

# class sklearn.svm.SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', 
# coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, 
# class_weight=None, verbose=False, max_iter=-1, 
# decision_function_shape='ovr', break_ties=False, random_state=None)

from sklearn import svm
import pandas as pd
import numpy as np

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())  # Scale the features
])

# define the SVM model
# kernals could be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’.              
clf = svm.SVC(kernel="rbf", C=.5, probability=True)

##############################  TRAINING  ##############################################
# data
input_df = pd.read_csv('NHANES_data_stroke_train.csv')

# Under sample the non-stroke
# Due to the large number of MI_negative, drop any with missing values, MI_positive will be imputed later
MI_positive = input_df[input_df['stroke'] == 1]
MI_negative = input_df[input_df['stroke'] == 2]
MI_negative = MI_negative.dropna()
MI_negative = MI_negative.sample(n=len(MI_positive), replace=False)
input_df = pd.concat([MI_positive, MI_negative])

# attributes
featurenames = ["Income","Age","Race","Diastolic","Systolic","Pulse","BMI","HDL","Trig","LDL","TCHOL","kidneys_eGFR","Diabetes"]
X = input_df[featurenames]
y = input_df['stroke']

# impute and scale the data
X = preprocessing_pipeline.fit_transform(X)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.50, random_state=42)

# train the model
clf.fit(X_train, y_train)

# print accuracy info
print("accuracy train : ", clf.score(X_train, y_train))
print("accuracy test : ", clf.score(X_test, y_test))
print("predicted probabilities:\n", clf.predict_proba(X_test))

##############################  PREDICTION  ##############################################
# load data set
new_data = pd.read_csv("NHANES_data_stroke_test4Students.csv")

# No stroke column so get rid of it
new_data = new_data.drop(columns=['stroke'])

# get attributes
X_new = new_data[featurenames]

# impute and scale the data
X_new = preprocessing_pipeline.fit_transform(X_new)

# Make predictions on the new data, run model
new_probabilities = clf.predict_proba(X_new)[:, 0]  # for output
new_predictions = clf.predict(X_new) # unsed, just for testing ratio of MI/noMI

# Get each sample's ID and write probabilities to the output CSV
new_participant_ids = new_data['ParticipantID']
new_output_df = pd.DataFrame({'ParticipantID': new_participant_ids, 'Pred_Probability': new_probabilities})
new_output_df.to_csv('SVMpred.csv', index=False)


accuracy train :  0.8740740740740741
accuracy test :  0.7555555555555555
predicted probabilities:
 [[0.846 0.154]
 [0.507 0.493]
 [0.056 0.944]
 [0.313 0.687]
 [0.091 0.909]
 [0.573 0.427]
 [0.381 0.619]
 [0.650 0.350]
 [0.646 0.354]
 [0.220 0.780]
 [0.255 0.745]
 [0.213 0.787]
 [0.028 0.972]
 [0.936 0.064]
 [0.741 0.259]
 [0.850 0.150]
 [0.373 0.627]
 [0.900 0.100]
 [0.254 0.746]
 [0.842 0.158]
 [0.045 0.955]
 [0.205 0.795]
 [0.908 0.092]
 [0.939 0.061]
 [0.253 0.747]
 [0.878 0.122]
 [0.839 0.161]
 [0.311 0.689]
 [0.906 0.094]
 [0.912 0.088]
 [0.770 0.230]
 [0.762 0.238]
 [0.260 0.740]
 [0.849 0.151]
 [0.388 0.612]
 [0.234 0.766]
 [0.894 0.106]
 [0.736 0.264]
 [0.640 0.360]
 [0.677 0.323]
 [0.964 0.036]
 [0.833 0.167]
 [0.462 0.538]
 [0.541 0.459]
 [0.876 0.124]
 [0.029 0.971]
 [0.235 0.765]
 [0.173 0.827]
 [0.916 0.084]
 [0.296 0.704]
 [0.082 0.918]
 [0.666 0.334]
 [0.024 0.976]
 [0.070 0.930]
 [0.916 0.084]
 [0.322 0.678]
 [0.062 0.938]
 [0.222 0.778]
 [0.113 0.887]
 [0.820 0.180]
 

In [4]:
#### See which data points are critical #####
# get the support vectors
print("clf support vectors: {}".format(clf.support_vectors_))
# get indices of support vectors
print("clf support vector indices: {}".format(clf.support_))
# get number of support vectors for each class
print("clf # of support vectors in each class: {}".format(clf.n_support_))

clf support vectors: [[-1.125 -1.015 1.225 ... 0.557 0.540 -0.348]
 [0.000 0.985 -0.495 ... -1.794 0.540 -0.348]
 [-0.943 0.985 -1.068 ... 0.557 0.540 -0.348]
 ...
 [0.074 0.985 0.733 ... 0.557 0.540 -0.348]
 [-0.155 0.985 -0.086 ... -1.794 0.540 -0.348]
 [-0.903 -1.015 0.570 ... 0.557 0.540 2.882]]
clf support vector indices: [  0  13  14  15  19  22  24  32  34  37  39  40  43  46  52  53  57  61
  62  63  64  67  74  81  82  85  88  89  93  97  99 100 110 112 113 115
 123 125 132 133 134   2   3   5   6  12  16  17  31  38  41  42  45  48
  51  58  59  60  66  73  75  76  77  87  92  94 103 106 108 109 118 119
 121 122 124 127 129]
clf # of support vectors in each class: [41 36]
