In [1]:
import torch
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score
import pickle
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the serialized embeddings file produced earlier in the pipeline.
load_data = torch.load('embeddings/data.pt')

# The file holds two parallel sequences: position 0 is the embeddings,
# position 1 is the corresponding class names.
embedding_list, name_list = load_data[0], load_data[1]

In [3]:
# Encode the class names as integer labels; the fitted encoder keeps the
# name <-> integer mapping in encoder.classes_.
encoder = LabelEncoder()
Y = encoder.fit_transform(name_list)

In [4]:
# The embeddings were originally saved as tensors: convert each one to a
# NumPy array, then let np.squeeze stack them into a single ndarray and drop
# any singleton dimensions.
X = np.squeeze([np.asarray(embedding) for embedding in embedding_list])



In [5]:
# Split features and labels into training and test sets. No test_size is
# given, so scikit-learn's default proportions apply; the fixed random_state
# makes the shuffled split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    shuffle=True,
    random_state=17,
)

In [6]:
# Hyper-parameter tuning with an exhaustive grid search over SVC settings.
# probability=True is pinned (single-value list) so every candidate, and the
# refitted best estimator, supports probability estimates.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    probability=[True],
)

# refit=True retrains the best candidate on the full training set once the
# search finishes; verbose=3 prints one line per cross-validation fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

grid.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END C=0.1, gamma=1, kernel=linear, probability=True;, score=0.992 total time=   0.1s
[CV 2/5] END C=0.1, gamma=1, kernel=linear, probability=True;, score=1.000 total time=   0.1s
[CV 3/5] END C=0.1, gamma=1, kernel=linear, probability=True;, score=1.000 total time=   0.1s
[CV 4/5] END C=0.1, gamma=1, kernel=linear, probability=True;, score=0.996 total time=   0.1s
[CV 5/5] END C=0.1, gamma=1, kernel=linear, probability=True;, score=0.992 total time=   0.1s
[CV 1/5] END C=0.1, gamma=1, kernel=poly, probability=True;, score=0.992 total time=   0.1s
[CV 2/5] END C=0.1, gamma=1, kernel=poly, probability=True;, score=0.992 total time=   0.1s
[CV 3/5] END C=0.1, gamma=1, kernel=poly, probability=True;, score=0.985 total time=   0.2s
[CV 4/5] END C=0.1, gamma=1, kernel=poly, probability=True;, score=0.988 total time=   0.1s
[CV 5/5] END C=0.1, gamma=1, kernel=poly, probability=True;, score=0.984 total time=   0.1s
[CV 1/5

In [7]:
# Dictionary with the parameter combination that scored best in the grid search
grid.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'poly', 'probability': True}

In [8]:
# Show the best-performing estimator (exposed by the search because refit=True)
print(grid.best_estimator_)

SVC(C=1, gamma=1, kernel='poly', probability=True)


In [9]:
# Train the final model with the hyper-parameters chosen by the grid search.
# The parameters are read from grid.best_params_ instead of being copied by
# hand from the search output, so this cell cannot go stale when the search
# is rerun on new data. best_params_ contains every key of the search grid,
# including probability=True.
model = SVC(**grid.best_params_)
model.fit(X_train, Y_train)

In [10]:
# Predicted labels for both splits (training split first, then test split).
ypreds_train, ypreds_test = model.predict(X_train), model.predict(X_test)

In [11]:
# Check how well the model performs on the held-out test set.
# precision_score defaults to average='binary', which raises a ValueError for
# multiclass targets. Keep the binary behaviour when there are at most two
# classes and fall back to the unweighted per-class mean ('macro') otherwise,
# so the cell keeps working when more identities are added.
precision_average = 'binary' if len(encoder.classes_) <= 2 else 'macro'
print(accuracy_score(Y_test, ypreds_test))
print(precision_score(Y_test, ypreds_test, average=precision_average))

1.0
1.0


In [12]:
# Save the trained model to disk so the app can load it later.
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check that could race with another process.
os.makedirs('classifier', exist_ok=True)
with open('classifier/svm_classifier.pkl', 'wb') as f:
    # NOTE: pickle files execute code on load; only load this file from a
    # trusted location.
    pickle.dump(model, f)

In [13]:
# Persist the label encoder's class names next to the model so integer
# predictions can be mapped back to names.
np.save(file='classifier/classes.npy', arr=encoder.classes_)