# SVM Support Vector Machine - radial basis function (rbf)
based on Carbon only

For the SVM model, the Type "U/C" was dropped because it contained only 1 record. SVM requires at least 2 records in a group.

In [12]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 

## Read the CSV and perform basic data cleaning

In [13]:
# Read the CSV
presolar = pd.read_csv("presolargrains_C_only_for_SVM.csv")
presolar.head()

Unnamed: 0,Type,12C/13C
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


## Create a Train Test Split

In [14]:
# Set y = to the column "Type", then drop that column from the dataframe
y = presolar["Type"]
X = presolar.drop(columns=["Type"])
presolar.head()

Unnamed: 0,Type,12C/13C
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


In [15]:
# Set up x and y train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


## Pre-processing

In [16]:
# Scale the data using the MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Train the Model

In [17]:
# Support vector machine linear classifier
model = SVC()
model.fit(X_train_scaled, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [18]:
# Print train and test scores
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8519846350832266
Testing Data Score: 0.8517665130568356


## Hyperparameter Tuning

In [19]:
# Use `GridSearchCV` to tune model's parameters
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [20]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8490276356192425, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8494623655913979, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] ....... C=1, gamma=0.0001, score=0.850294796206101, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8490276356192425, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8494623655913979, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.850294796206101, total=   0.5s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8490276356192425, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8494623655913979, total=   0.4s
[CV] C=1, gamma=0.01 .................................................
[CV] ......... C=1, gamma=0.01, score=0.850294796206101, total=   0.4s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   19.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [21]:
# Print the best parameters and the best score
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.8495945369184805


In [24]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.852


In [25]:
# Calculate classification report
from sklearn.metrics import classification_report
target_names = ['AB', 'C', 'M', 'N', 'U', 'X', 'Y', 'Z']
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

          AB       0.00      0.00      0.00       199
           C       0.00      0.00      0.00         4
           M       0.85      1.00      0.92      3318
           N       0.00      0.00      0.00         3
           U       0.00      0.00      0.00        13
           X       0.90      0.09      0.16       105
           Y       0.00      0.00      0.00       135
           Z       0.00      0.00      0.00       129

   micro avg       0.85      0.85      0.85      3906
   macro avg       0.22      0.14      0.13      3906
weighted avg       0.75      0.85      0.79      3906



  'precision', 'predicted', average, warn_for)


In [26]:
# Save the fitted model to file
filename = 'SVM_rbf.sav'
joblib.dump(grid, filename)

['SVM_rbf.sav']

# Summary:

### SVM rbf based on carbon only: 85.2%