# Support Vector Machine (SVM) - radial basis function (RBF) kernel
# for carbon, silicon, and nitrogen isotopes

For the SVM model, the Type "U/C" was dropped because it contained only 1 record. The stratified train/test split used below requires at least 2 records in every class.

In [16]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 

## Read the CSV and perform basic data cleaning

In [17]:
# Load the presolar-grain isotope table and preview its first five rows
C_Si_N = pd.read_csv(filepath_or_buffer="presolargrains_C_Si_N.csv")
C_Si_N.head(5)

Unnamed: 0,Type,carbon_12_13,nitrogen_14_15,silicon_29_28,silicon_30_28
0,X,74.3,207.4,-685.0,-520.0
1,X,1581.0,116.0,-684.0,-490.0
2,X,234.75,187.2,-683.0,-501.0
3,X,455.0,140.0,-662.0,-770.0
4,X,823.0,44.0,-658.0,-234.0


## Create a Train Test Split

In [18]:
# Set y to the "Type" label column and build the feature matrix X from the
# remaining isotope-ratio columns.
# "Unnamed: 0" appears to be the row index that was saved into the CSV (its
# values match the row numbers in the preview), not a measurement, so it is
# dropped together with "Type" to keep row order out of the features.
# errors="ignore" keeps this cell working if the index column is absent;
# a missing "Type" column still fails loudly on the line that builds y.
y = C_Si_N["Type"]
X = C_Si_N.drop(columns=["Type", "Unnamed: 0"], errors="ignore")
C_Si_N.head()

Unnamed: 0,Type,carbon_12_13,nitrogen_14_15,silicon_29_28,silicon_30_28
0,X,74.3,207.4,-685.0,-520.0
1,X,1581.0,116.0,-684.0,-490.0
2,X,234.75,187.2,-683.0,-501.0
3,X,455.0,140.0,-662.0,-770.0
4,X,823.0,44.0,-658.0,-234.0


In [19]:
# Stratified train/test partition (default split sizes) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)


## Pre-processing

In [20]:
# Fit a MinMaxScaler on the training features only, then apply it to both splits
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

## Train the Model

In [21]:
# Support vector machine classifier with a radial basis function (RBF) kernel.
# The kernel is named explicitly so the code matches the notebook title;
# "rbf" is also what SVC uses when no kernel is given, so the fitted model
# is the same as with a bare SVC().
model = SVC(kernel="rbf")
model.fit(X_train_scaled, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
# Report the baseline model's accuracy on the training and test splits
print(
    f"Training Data Score: {model.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {model.score(X_test_scaled, y_test)}",
    sep="\n",
)

Training Data Score: 0.72
Testing Data Score: 0.7239263803680982


## Hyperparameter Tuning

In [23]:
# Use `GridSearchCV` to tune the model's C and gamma parameters.
# cv is pinned to 3 folds: the recorded run relied on the library default
# (shown as cv='warn', "Fitting 3 folds"), and newer scikit-learn releases
# default to 5 folds, which would silently change the search on re-run.
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)

In [24]:
# Run the cross-validated search over every parameter combination
grid.fit(X=X_train_scaled, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5657492354740061, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5657492354740061, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5763239875389408, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5657492354740061, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5657492354740061, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5763239875389408, total=   0.0s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [25]:
# Show the winning parameter combination and its cross-validated score
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 10, 'gamma': 0.01}
0.5774358974358974


In [26]:
# Model accuracy on the held-out test split.
# `model` is still the default-parameter SVC fitted before the search:
# GridSearchCV fits clones of the estimator, so the tuned model can only be
# scored through `grid` (which refits the best candidate on the training data).
# Both figures are printed so the baseline and tuned results can be compared.
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))
print('Tuned Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.724


In [28]:
# Not saving this model.
# To persist the fitted grid search with joblib, uncomment the two lines below.
#filename = 'SVM_rbf_Si.sav'
#joblib.dump(grid, filename)

# Summary:

### SVM (rbf kernel, default parameters) based on carbon, silicon and nitrogen: 72.4% test accuracy