# SVM Support Vector Machine - radial basis function (rbf) 
# for silicon isotopes


For the SVM model, the Type "U/C" was dropped because it contained only 1 record. SVM requires at least 2 records in a group.

In [2]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 

## Read the CSV and perform basic data cleaning

In [3]:
# Read the CSV
silicon = pd.read_csv("presolargrains_Si_for_SVM.csv")
silicon.head()

Unnamed: 0,Type,silicon_29_28,silicon_30_28
0,X,-662.0,-770.0
1,X,-451.0,-719.0
2,X,-594.0,-717.0
3,X,-499.0,-709.0
4,X,-366.0,-705.0


## Create a Train Test Split

In [5]:
# Set y = to the column "Type", then drop that column from the dataframe
y = silicon["Type"]
X = silicon.drop(columns=["Type"])
silicon.head()

Unnamed: 0,Type,silicon_29_28,silicon_30_28
0,X,-662.0,-770.0
1,X,-451.0,-719.0
2,X,-594.0,-717.0
3,X,-499.0,-709.0
4,X,-366.0,-705.0


In [6]:
# Set up x and y train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


## Pre-processing

In [7]:
# Scale the data using the MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train the Model

In [8]:
# Support vector machine linear classifier
model = SVC()
model.fit(X_train_scaled, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
# Print train and test scores
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.868925424652557
Testing Data Score: 0.867574931880109


## Hyperparameter Tuning

In [10]:
# Use `GridSearchCV` to tune model's parameters
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [11]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8347399945548598, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8353774870536931, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8358331060812654, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8347399945548598, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8353774870536931, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8358331060812654, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8347399945548598, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8353774870536931, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8358331060812654, total=   0.4s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   19.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [12]:
# Print the best parameters and the best score
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.01}
0.8477609228812789


In [13]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.868


In [14]:
# Calculate classification report
from sklearn.metrics import classification_report
target_names = ['AB', 'C', 'M', 'N', 'U', 'X', 'Y', 'Z']
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

          AB       0.00      0.00      0.00       188
           C       0.67      0.50      0.57         4
           M       0.86      1.00      0.93      3066
           N       0.00      0.00      0.00         3
           U       0.00      0.00      0.00        11
           X       0.99      0.82      0.90       142
           Y       0.00      0.00      0.00       126
           Z       0.00      0.00      0.00       130

   micro avg       0.87      0.87      0.87      3670
   macro avg       0.32      0.29      0.30      3670
weighted avg       0.76      0.87      0.81      3670



  'precision', 'predicted', average, warn_for)


In [15]:
# Save the fitted model to file
filename = 'SVM_rbf_Si.sav'
joblib.dump(grid, filename)

['SVM_rbf_Si.sav']

# Summary:

### SVM rbf based on silicon only: 86.8%