# SVM Support Vector Machine - linear classification
based on Carbon only

For the SVM model, the Type "U/C" was dropped because it contained only 1 record. SVM requires at least 2 records in a group.

In [10]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 

# Read the CSV and perform basic data cleaning

In [2]:
# Read the CSV
presolar = pd.read_csv("presolargrains_C_only_for_SVM.csv")
presolar.head()

Unnamed: 0,Type,12C/13C
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


### Due to ValueError when trying to train the model, change Types to numbers with this key:

### key 
AB = 1,
C = 2
M = 3,
N = 4,
U = 5,
X = 6,
Y = 7,
Z = 8


# Create a Train Test Split

In [3]:
#presolar['12C/13C'].astype('float32').dtypes

In [4]:
# Set y = to the column "Type", then drop that column from the dataframe
y = presolar["Type"]
X = presolar.drop(columns=["Type"])
presolar.head()

Unnamed: 0,Type,12C/13C
0,X,9455.1
1,X,6793.0
2,X,6227.0
3,X,4250.0
4,X,3993.46


In [5]:
# Set up x and y train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


# Pre-processing

In [6]:
# Scale the data using the MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model

In [8]:
# Support vector machine linear classifier
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8511310285958174
Testing Data Score: 0.8509984639016898


# Hyperparameter Tuning

In [13]:
# Use `GridSearchCV` to tune model's parameters
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [14]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8503070624360286, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8504864311315924, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ....... C=1, gamma=0.0001, score=0.851063829787234, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8503070624360286, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8504864311315924, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.851063829787234, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8503070624360286, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ........ C=1, gamma=0.01, score=0.8504864311315924, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ......... C=1, gamma=0.01, score=0.851063829787234, total=   0.2s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   12.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [15]:
# Print the best parameters and the best score
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.8534357661118225


In [17]:
# Save the fitted model to file
filename = 'SVM_linear.sav'
joblib.dump(grid, filename)

['SVM_linear.sav']

# Summary:

### SVM linear based on carbon only: 85.3%