In [3]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math

In [4]:
Corpus = pd.read_csv("transformed_dataset.csv")
Corpus

Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
1,Salt is used on roads to help melt ice and sno...,1
2,There are a few reasons why we still have SD (...,1
3,It is generally not acceptable or ethical to a...,1
4,After the Wright Brothers made the first power...,1
...,...,...
66727,Net neutrality says it all in the name . The i...,0
66728,President 's generally get a Congressional aut...,0
66729,There are a few things that have hurt Leno's r...,0
66730,Atheism is a description of what someone belie...,0


In [5]:
Encoder = LabelEncoder()
Y = Encoder.fit_transform(Corpus['label'])
Y

array([1, 1, 1, ..., 0, 0, 0])

# Increasing max features to 40000

## Using unigrams only

In [4]:
Tfidf_vect = TfidfVectorizer(max_features=40000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 5349798 stored elements in Compressed Sparse Row format>

In [5]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [6]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  96.22877122877122
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      9907
           1       0.96      0.97      0.96     10113

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: total: 10min 45s
Wall time: 11min 49s


In [7]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.61838161838162
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      9907
           1       0.97      0.97      0.97     10113

    accuracy                           0.97     20020
   macro avg       0.97      0.97      0.97     20020
weighted avg       0.97      0.97      0.97     20020

CPU times: total: 26min 30s
Wall time: 27min 59s


## Using unigrams and bigrams

In [8]:
Tfidf_vect = TfidfVectorizer(max_features=40000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 9568457 stored elements in Compressed Sparse Row format>

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [10]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.25174825174825
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9921
           1       0.98      0.98      0.98     10099

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 17min 37s
Wall time: 18min 39s


In [11]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.18681318681318
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9921
           1       0.98      0.98      0.98     10099

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 42min 39s
Wall time: 44min 29s


## Using unigrams, bigrams and trigrams

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=40000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 10442987 stored elements in Compressed Sparse Row format>

In [13]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [14]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.36663336663337
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10036
           1       0.98      0.98      0.98      9984

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 17min 45s
Wall time: 18min 31s


In [15]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.27172827172828
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10036
           1       0.98      0.98      0.98      9984

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 44min 22s
Wall time: 46min 25s


# Increasing max features to 45000

## Using unigrams only

In [16]:
Tfidf_vect = TfidfVectorizer(max_features=45000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x45000 sparse matrix of type '<class 'numpy.float64'>'
	with 5358268 stored elements in Compressed Sparse Row format>

In [17]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [18]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  96.21878121878123
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10011
           1       0.96      0.96      0.96     10009

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: total: 11min 12s
Wall time: 11min 48s


In [19]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.58841158841159
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     10011
           1       0.97      0.96      0.97     10009

    accuracy                           0.97     20020
   macro avg       0.97      0.97      0.97     20020
weighted avg       0.97      0.97      0.97     20020

CPU times: total: 26min 40s
Wall time: 28min 9s


## Using unigrams and bigrams

In [20]:
Tfidf_vect = TfidfVectorizer(max_features=45000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x45000 sparse matrix of type '<class 'numpy.float64'>'
	with 9724870 stored elements in Compressed Sparse Row format>

In [21]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [22]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.03696303696303
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10016
           1       0.98      0.98      0.98     10004

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 18min 12s
Wall time: 19min 21s


In [23]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  97.92207792207792
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10016
           1       0.98      0.98      0.98     10004

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 43min 34s
Wall time: 45min 56s


## Using unigrams, bigrams and trigrams

In [24]:
Tfidf_vect = TfidfVectorizer(max_features=45000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x45000 sparse matrix of type '<class 'numpy.float64'>'
	with 10655617 stored elements in Compressed Sparse Row format>

In [25]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [26]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.32667332667333
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10002
           1       0.98      0.98      0.98     10018

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 18min 53s
Wall time: 19min 46s


In [27]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.27172827172828
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10002
           1       0.98      0.98      0.98     10018

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 45min 39s
Wall time: 47min 42s


# Increasing max features to 50000

## Using unigrams only

In [28]:
Tfidf_vect = TfidfVectorizer(max_features=50000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 5364428 stored elements in Compressed Sparse Row format>

In [29]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [30]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  96.39360639360639
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10083
           1       0.96      0.96      0.96      9937

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: total: 11min 12s
Wall time: 11min 47s


In [31]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.85314685314685
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     10083
           1       0.97      0.97      0.97      9937

    accuracy                           0.97     20020
   macro avg       0.97      0.97      0.97     20020
weighted avg       0.97      0.97      0.97     20020

CPU times: total: 26min 50s
Wall time: 27min 52s


## Using unigrams and bigrams

In [32]:
Tfidf_vect = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 9863314 stored elements in Compressed Sparse Row format>

In [33]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [34]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.13686313686314
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10051
           1       0.98      0.98      0.98      9969

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 19min 8s
Wall time: 19min 54s


In [35]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.1018981018981
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10051
           1       0.98      0.98      0.98      9969

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 45min 10s
Wall time: 47min 6s


## Using unigrams, bigrams and trigrams

In [36]:
Tfidf_vect = TfidfVectorizer(max_features=50000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 10847283 stored elements in Compressed Sparse Row format>

In [37]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [38]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.41658341658342
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10032
           1       0.98      0.98      0.98      9988

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 19min 26s
Wall time: 20min 13s


In [39]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.25674325674326
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10032
           1       0.98      0.98      0.98      9988

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 46min 52s
Wall time: 48min 33s


# Increase max features to 60000

## Using unigrams, bigrams and trigrams

In [40]:
Tfidf_vect = TfidfVectorizer(max_features=60000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x60000 sparse matrix of type '<class 'numpy.float64'>'
	with 11179962 stored elements in Compressed Sparse Row format>

In [41]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [42]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.22677322677322
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9925
           1       0.98      0.98      0.98     10095

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 20min 12s
Wall time: 21min 47s


In [43]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.12187812187813
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9925
           1       0.98      0.98      0.98     10095

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 49min
Wall time: 51min 29s


# Increase max features to 70000

## Using unigrams, bigrams and trigrams

In [44]:
Tfidf_vect = TfidfVectorizer(max_features=70000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x70000 sparse matrix of type '<class 'numpy.float64'>'
	with 11462363 stored elements in Compressed Sparse Row format>

In [45]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [46]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.22177822177822
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10183
           1       0.98      0.98      0.98      9837

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 21min 56s
Wall time: 22min 47s


In [47]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.01198801198801
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10183
           1       0.98      0.98      0.98      9837

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 51min 3s
Wall time: 53min 7s


# Increase max features to 80000

## Using unigrams, bigrams and trigrams

In [48]:
Tfidf_vect = TfidfVectorizer(max_features=80000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x80000 sparse matrix of type '<class 'numpy.float64'>'
	with 11707645 stored elements in Compressed Sparse Row format>

In [49]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [50]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.41658341658342
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     10034
           1       0.99      0.98      0.98      9986

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 23min 21s
Wall time: 24min 16s


In [51]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.27172827172828
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     10034
           1       0.99      0.98      0.98      9986

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 51min 1s
Wall time: 53min 35s


# Increase max features to 90000

## Using unigrams, bigrams and trigrams

In [6]:
Tfidf_vect = TfidfVectorizer(max_features=90000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x90000 sparse matrix of type '<class 'numpy.float64'>'
	with 11925346 stored elements in Compressed Sparse Row format>

In [7]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [8]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.35164835164835
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10113
           1       0.98      0.98      0.98      9907

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 24min 37s, sys: 430 ms, total: 24min 37s
Wall time: 24min 37s


In [9]:
# save the model to disk
import pickle
with open('svm_linear_model_90000_features.pkl','wb') as f:
    pickle.dump(SVM,f)

In [10]:
# save the model to disk
import pickle
with open('tfidf_vectorizer_90000_features.pkl','wb') as f:
    pickle.dump(Tfidf_vect,f)

In [55]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.24175824175823
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10058
           1       0.98      0.98      0.98      9962

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 52min 14s
Wall time: 54min 41s


# Increase max features to 100000

## Using unigrams, bigrams and trigrams

In [5]:
Tfidf_vect = TfidfVectorizer(max_features=100000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 12120035 stored elements in Compressed Sparse Row format>

In [6]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [6]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.22677322677322
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10010
           1       0.98      0.98      0.98     10010

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 25min 5s
Wall time: 26min 17s


In [7]:
# save the model to disk
import pickle
with open('svm_linear_model_100000_features.pkl','wb') as f:
    pickle.dump(SVM,f)

In [8]:
# save the model to disk
import pickle
with open('tfidf_vectorizer_100000_features.pkl','wb') as f:
    pickle.dump(Tfidf_vect,f)

In [7]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.05194805194806
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10077
           1       0.98      0.98      0.98      9943

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 54min 55s
Wall time: 56min 3s


# Removing the limit on number of features

## Using unigrams, bigrams and trigrams

In [8]:
Tfidf_vect = TfidfVectorizer(ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x6949859 sparse matrix of type '<class 'numpy.float64'>'
	with 23084138 stored elements in Compressed Sparse Row format>

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [10]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.4015984015984
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     10100
           1       0.99      0.98      0.98      9920

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: total: 1h 16min 16s
Wall time: 1h 16min 30s


In [11]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  97.47252747252747
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     10100
           1       0.98      0.97      0.97      9920

    accuracy                           0.97     20020
   macro avg       0.97      0.97      0.97     20020
weighted avg       0.97      0.97      0.97     20020

CPU times: total: 1h 36min 56s
Wall time: 1h 37min 49s
