In [1]:
import math

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn import model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled responses. The CSV's first column is a saved row index with
# no header (pandas would otherwise surface it as a data column named
# "Unnamed: 0"), so read it as the index instead of carrying it along as a
# meaningless column. Later cells only use the `response` and `label` columns.
Corpus = pd.read_csv("transformed_dataset.csv", index_col=0)
Corpus

Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
1,Salt is used on roads to help melt ice and sno...,1
2,There are a few reasons why we still have SD (...,1
3,It is generally not acceptable or ethical to a...,1
4,After the Wright Brothers made the first power...,1
...,...,...
66727,Net neutrality says it all in the name . The i...,0
66728,President 's generally get a Congressional aut...,0
66729,There are a few things that have hurt Leno's r...,0
66730,Atheism is a description of what someone belie...,0


In [3]:
# Turn the `label` column into the integer target vector used for training.
# The fitted encoder is kept around so predictions can be mapped back to the
# original label values if needed.
Encoder = LabelEncoder()
label_column = Corpus['label']
Y = Encoder.fit_transform(label_column)
Y

array([1, 1, 1, ..., 0, 0, 0])

In [4]:
# Build TF-IDF features from 1- to 3-grams of the responses, with the
# vocabulary capped at 90,000 entries.
# fit_transform learns the vocabulary/IDF weights and vectorizes in one pass;
# calling fit() and then transform() on the same text analyzes the whole corpus
# twice to produce the same matrix.
# NOTE(review): the vectorizer is fitted on every response, including rows that
# the next cell holds out as the test set, so the vocabulary and IDF weights are
# informed by test documents. Consider splitting the raw text first and fitting
# on the training rows only.
Tfidf_vect = TfidfVectorizer(max_features=90000, ngram_range=(1,3))
X_Tfidf = Tfidf_vect.fit_transform(Corpus['response'])
X_Tfidf

<66732x90000 sparse matrix of type '<class 'numpy.float64'>'
	with 11925346 stored elements in Compressed Sparse Row format>

In [5]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split -- and every accuracy computed from it -- is identical on each run
# instead of changing whenever the notebook is re-executed.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    X_Tfidf, Y, test_size=0.3, random_state=42
)

In [6]:
# Test-set accuracy (in percent) recorded for each candidate value of C.
accuracies = list()
# Candidate values of the SVM regularization parameter C, one decade apart.
C = [0.1, 1, 10, 100, 1000]

In [7]:
%%time
# Train one linear-kernel SVM per candidate C and record its test accuracy.
# Start from an empty list so that re-running this cell does not append a second
# set of scores and leave `accuracies` longer than `C`.
accuracies = []
for c in C:
    SVM = svm.SVC(C=c, kernel='linear')
    SVM.fit(Train_X, Train_Y)
    # Predict the labels on the test dataset.
    predictions_SVM = SVM.predict(Test_X)
    # Score once and reuse the value for both the record and the printout.
    # Arguments are (y_true, y_pred), the same order classification_report uses.
    accuracy_pct = accuracy_score(Test_Y, predictions_SVM) * 100
    accuracies.append(accuracy_pct)
    print("C = ", c)
    print("Linear Kernel SVM Accuracy Score -> ", accuracy_pct)
    print(classification_report(Test_Y, predictions_SVM))
    print()

C =  0.1
Linear Kernel SVM Accuracy Score ->  97.002997002997
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     10009
           1       0.98      0.96      0.97     10011

    accuracy                           0.97     20020
   macro avg       0.97      0.97      0.97     20020
weighted avg       0.97      0.97      0.97     20020


C =  1
Linear Kernel SVM Accuracy Score ->  98.51148851148851
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10009
           1       0.99      0.99      0.99     10011

    accuracy                           0.99     20020
   macro avg       0.99      0.99      0.99     20020
weighted avg       0.99      0.99      0.99     20020


C =  10
Linear Kernel SVM Accuracy Score ->  98.4965034965035
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     10009
           1       0.98      0.99      0.98     1

In [9]:
# Summarize the sweep: the C values tried and the accuracy recorded for each.
for caption, values in (("C = ", C), ("Accuracies = ", accuracies)):
    print(caption, values)

C =  [0.1, 1, 10, 100, 1000]
Accuracies =  [97.002997002997, 98.51148851148851, 98.4965034965035, 98.4965034965035, 98.4965034965035]


In [14]:

# Plot test accuracy against log10(C); the C values are a decade apart, so the
# log scale spaces the points evenly along the x-axis.
# Each accuracy must pair with exactly one C value. Fail loudly if the lists have
# drifted apart (e.g. the sweep cell was run more than once) rather than drawing
# a misaligned curve.
assert len(accuracies) == len(C), "expected exactly one accuracy per value of C"

fig = go.Figure()
fig.add_trace(go.Scatter(x=np.log10(C), y=accuracies))

# update_layout returns the figure, so leaving it as the cell's last expression
# also renders the plot.
fig.update_layout(title='Effect of regularization parameter C on accuracy',
                   xaxis_title='log_10(C)',
                   yaxis_title='Accuracy (%)')