<a href="https://colab.research.google.com/github/emilyolafson/3d-cnn/blob/main/SVM_Regression_FS86_atlas_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Support Vector Regression

Emily Olafson

Support vector regression (SVR) fits the function $f(x) = w^Tx + b$ by minimizing
\begin{equation}
   \frac{1}{2}||w||^2 + \frac{C}{m}\sum_{i=1}^m|y_i-f(x_i)| \\
\end{equation}
Subject to
\begin{equation}
    y_i - w^Tx_i - b \leq \epsilon, \textbf{and } \\
    w^Tx_i + b - y_i \leq \epsilon
\end{equation}
Where $x_i \in R$ are the input variables, $y_i$ are the outputs, $C$ is the trade-off between flatness of $f(x)$ and the amount up to which deviations larger than $\epsilon$ are tolerated.
A Gaussian radial basis function (RBF) kernel was used.

\begin{align}
    k(x_i,x_j)=\exp(-\gamma||x_i-x_j||^2)\\
\end{align}

Where $\gamma$ is a kernel parameter and $x_i$ and $x_j$ are observations. A grid search was used to optimize $\gamma$ and $C$.


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn import preprocessing
import sklearn
import scipy.io as sio
import csv
from google.colab import drive
from sklearn.model_selection import GridSearchCV,cross_validate,train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import metrics
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read
# from a path under this mount point later in the notebook.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Processing**:
- Deal with any missing data
- One-hot encoding of categorical variables (ethnicity and sex)

In [None]:
# Read the compiled two-dataset CSV from the mounted Drive into a DataFrame.
dataset = pd.read_csv(
    '/content/drive/My Drive/ML project/Data/compiledData_2datasets_QSM_MSConnect.csv'
)

In [None]:
# Tally NaNs column by column, then collapse the tallies into one grand total.
# The grand total is the value this cell displays.
nan_counts_by_column = dataset.isna().sum()
nan_counts_by_column.sum()

# A displayed total of 0 means there is no missing data.

0

In [None]:
# One-hot encoding: each categorical column is removed from the table and
# replaced by one 0.0/1.0 indicator column per level.
dataset = dataset.copy()

# Indicator column name -> raw 'Ethnicity' value it flags.
# Dict order fixes the order in which the new columns are appended.
ethnicity_levels = {
    'white': 'white',
    'black': 'black or african american',
    'declined': 'declined',
    'other': 'other combinations not described',
    'asian': 'asian',
    'hispanic': 'hispanic',
}
ethnicity = dataset.pop('Ethnicity')
for indicator_name, raw_value in ethnicity_levels.items():
    dataset[indicator_name] = (ethnicity == raw_value) * 1.0

# Indicator column name -> raw 'Sex' value it flags ('Male' ends up last).
sex_levels = {'Female': 'F', 'Male': 'M'}
sex = dataset.pop('Sex')
for indicator_name, raw_value in sex_levels.items():
    dataset[indicator_name] = (sex == raw_value) * 1.0

# Determine indices for test and training data (outer loop partition)



In [None]:
import random 
# Build the outer-loop partitions: N_SPLITS independent random 90/10
# train/test splits of the row positions 0..N_SUBJECTS-1.
N_SUBJECTS = 177  # NOTE(review): presumably the number of rows in `dataset` -- confirm
N_SPLITS = 100
SPLIT_SEED = 3

values = np.arange(N_SUBJECTS)
test_indices = []
training_indices = []

for i in range(N_SPLITS):
    # train_test_split does its own shuffling, so it must be seeded directly
    # for the partitions to be reproducible. Offsetting the seed by the repeat
    # number keeps every repeat a different split while making a rerun of the
    # notebook produce exactly the same 100 partitions.
    training_id, test_id = sklearn.model_selection.train_test_split(
        values,
        train_size=0.9,
        test_size=0.1,
        random_state=SPLIT_SEED + i,
    )
    test_indices.append(test_id)
    training_indices.append(training_id)

# Train and test model.



In [None]:
# Nested cross-validation.
# Outer loop: one iteration per precomputed train/test partition.
# Inner loop: 10-fold grid search over the RBF-kernel SVR hyperparameters.

# grid search parameters
C = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
Gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

tuned_parameters = [{'kernel': ['rbf'], 'gamma': Gamma,
                     'C': C}]

best_parameters = []  # best {'C', 'gamma', 'kernel'} per outer partition
rsq = []              # held-out R^2 per outer partition
msq = []              # held-out mean squared error per outer partition

for train_rows, test_rows in zip(training_indices, test_indices):
    # Keep every column except the first two and the last one.
    # NOTE(review): presumably the two leading columns are identifiers and the
    # final one-hot column is dropped as redundant -- confirm against the CSV.
    train_values = dataset.iloc[train_rows].to_numpy()[:, 2:-1]
    test_values = dataset.iloc[test_rows].to_numpy()[:, 2:-1]

    # Standardize with statistics estimated on the training rows only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_values)
    test_scaled = scaler.transform(test_values)

    # Column 0 of the sliced matrix is the regression target; the remaining
    # columns are the features. The target column must be excluded from X:
    # the previous `np.delete(arr, 0)` calls discarded their result, so the
    # target stayed in the feature matrix and leaked into the model.
    y_train = train_scaled[:, 0]
    y_test = test_scaled[:, 0]
    X_train = train_scaled[:, 1:]
    X_test = test_scaled[:, 1:]

    # Grid search with SVR estimator.
    # Inner loop.
    clf = GridSearchCV(SVR(), tuned_parameters, cv=10)  # R^2 is default scoring function.

    # Train model.
    clf.fit(X_train, y_train)

    # Save parameters from best-fitting model.
    best_parameters.append(clf.best_params_)

    # Predict EDSS on trained model using test data.
    # The target was standardized together with the features, so predictions
    # and the MSE below are in standardized units, not raw score units.
    y_true = y_test
    y_pred = clf.predict(X_test)

    # Calculate and save metrics
    rsquare_svr = round(r2_score(y_true, y_pred), 4)
    rsq.append(rsquare_svr)

    msq_error = mean_squared_error(y_true, y_pred)
    msq.append(msq_error)

In [None]:
# Display mean R-squared across all outer-loop partitions.
# np.mean is used because the `statistics` module is not imported in this
# notebook, so `statistics.mean` fails with NameError on a fresh kernel.
mean_rsq = float(np.mean(rsq))
print(mean_rsq)

In [None]:
# Display mean mean-squared error across all outer-loop partitions.
# np.mean is used because the `statistics` module is not imported in this
# notebook, so `statistics.mean` fails with NameError on a fresh kernel.
mean_msq = float(np.mean(msq))
print(mean_msq)

In [None]:
# View all best parameters (100 total): one dict of grid-search winners
# (clf.best_params_) per outer-loop partition, in partition order.
best_parameters