In [1]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math

In [4]:
Corpus = pd.read_csv("../../datasets/transformed_dataset.csv")
Corpus

Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
1,Salt is used on roads to help melt ice and sno...,1
2,There are a few reasons why we still have SD (...,1
3,It is generally not acceptable or ethical to a...,1
4,After the Wright Brothers made the first power...,1
...,...,...
66727,Net neutrality says it all in the name . The i...,0
66728,President 's generally get a Congressional aut...,0
66729,There are a few things that have hurt Leno's r...,0
66730,Atheism is a description of what someone belie...,0


In [5]:
Encoder = LabelEncoder()
Y = Encoder.fit_transform(Corpus['label'])
Y

array([1, 1, 1, ..., 0, 0, 0])

In [6]:
Tfidf_vect = TfidfVectorizer(max_features=100000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 12120035 stored elements in Compressed Sparse Row format>

In [7]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [8]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear', probability=True)
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.31168831168831
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10025
           1       0.98      0.98      0.98      9995

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 56min 54s, sys: 975 ms, total: 1h 56min 55s
Wall time: 1h 56min 55s


In [None]:
# save the model to disk
import pickle
with open('svm_linear_model_100000_features.pkl','wb') as f:
    pickle.dump(SVM,f)

In [None]:
# save the model to disk
import pickle
with open('tfidf_vectorizer_100000_features.pkl','wb') as f:
    pickle.dump(Tfidf_vect,f)