In [4]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math

In [5]:
Corpus = pd.read_csv("/kaggle/input/transformed-training-dataset/transformed_dataset.csv")
Corpus

Unnamed: 0,response,label
0,There are many different best seller lists tha...,1
1,Salt is used on roads to help melt ice and sno...,1
2,There are a few reasons why we still have SD (...,1
3,It is generally not acceptable or ethical to a...,1
4,After the Wright Brothers made the first power...,1
...,...,...
66727,Net neutrality says it all in the name . The i...,0
66728,President 's generally get a Congressional aut...,0
66729,There are a few things that have hurt Leno's r...,0
66730,Atheism is a description of what someone belie...,0


In [6]:
Encoder = LabelEncoder()
Y = Encoder.fit_transform(Corpus['label'])
Y

array([1, 1, 1, ..., 0, 0, 0])

# Increasing max features to 25000

## Using unigrams only

In [4]:
Tfidf_vect = TfidfVectorizer(max_features=25000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x25000 sparse matrix of type '<class 'numpy.float64'>'
	with 5302891 stored elements in Compressed Sparse Row format>

In [5]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [6]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  95.91908091908093
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10048
           1       0.96      0.96      0.96      9972

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 16min 12s, sys: 432 ms, total: 16min 12s
Wall time: 16min 12s


In [7]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.3886113886114
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10048
           1       0.96      0.96      0.96      9972

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 40min 2s, sys: 267 ms, total: 40min 2s
Wall time: 40min 3s


## Using unigrams and bigrams

In [8]:
Tfidf_vect = TfidfVectorizer(max_features=25000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x25000 sparse matrix of type '<class 'numpy.float64'>'
	with 8932788 stored elements in Compressed Sparse Row format>

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [10]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.06693306693307
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9856
           1       0.98      0.98      0.98     10164

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 22min 47s, sys: 587 ms, total: 22min 48s
Wall time: 22min 48s


In [11]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.14185814185814
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9856
           1       0.98      0.98      0.98     10164

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 59min 54s, sys: 738 ms, total: 59min 55s
Wall time: 59min 56s


## Using unigrams, bigrams and trigrams

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=25000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x25000 sparse matrix of type '<class 'numpy.float64'>'
	with 9602637 stored elements in Compressed Sparse Row format>

In [13]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [14]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.27672327672327
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9938
           1       0.98      0.98      0.98     10082

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 22min 45s, sys: 483 ms, total: 22min 45s
Wall time: 22min 46s


In [15]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.20679320679321
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9938
           1       0.98      0.98      0.98     10082

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 1min 40s, sys: 498 ms, total: 1h 1min 41s
Wall time: 1h 1min 42s


# Increasing max features to 30000

## Using unigrams only

In [16]:
Tfidf_vect = TfidfVectorizer(max_features=30000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5324258 stored elements in Compressed Sparse Row format>

In [17]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [18]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  95.83416583416583
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10001
           1       0.96      0.96      0.96     10019

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 16min 39s, sys: 179 ms, total: 16min 39s
Wall time: 16min 39s


In [19]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.21878121878123
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10001
           1       0.96      0.96      0.96     10019

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 40min 57s, sys: 265 ms, total: 40min 57s
Wall time: 40min 58s


## Using unigrams and bigrams

In [20]:
Tfidf_vect = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 9181629 stored elements in Compressed Sparse Row format>

In [21]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [22]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.1018981018981
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10003
           1       0.98      0.98      0.98     10017

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 29min 26s, sys: 429 ms, total: 29min 27s
Wall time: 29min 27s


In [23]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.07692307692307
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10003
           1       0.98      0.98      0.98     10017

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 2min 30s, sys: 453 ms, total: 1h 2min 31s
Wall time: 1h 2min 32s


## Using unigrams, bigrams and trigrams

In [24]:
Tfidf_vect = TfidfVectorizer(max_features=30000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 9927669 stored elements in Compressed Sparse Row format>

In [25]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [26]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.26173826173826
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9951
           1       0.98      0.98      0.98     10069

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 24min 21s, sys: 348 ms, total: 24min 22s
Wall time: 24min 23s


In [27]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.20179820179821
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9951
           1       0.98      0.98      0.98     10069

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 3min 52s, sys: 446 ms, total: 1h 3min 52s
Wall time: 1h 3min 53s


# Increasing max features to 35000

## Using unigrams only

In [28]:
Tfidf_vect = TfidfVectorizer(max_features=35000, ngram_range=(1,1))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x35000 sparse matrix of type '<class 'numpy.float64'>'
	with 5338968 stored elements in Compressed Sparse Row format>

In [29]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [30]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  95.91408591408592
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     10067
           1       0.96      0.96      0.96      9953

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 16min 49s, sys: 149 ms, total: 16min 49s
Wall time: 16min 50s


In [31]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  96.47852147852149
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     10067
           1       0.97      0.96      0.96      9953

    accuracy                           0.96     20020
   macro avg       0.96      0.96      0.96     20020
weighted avg       0.96      0.96      0.96     20020

CPU times: user 41min 9s, sys: 282 ms, total: 41min 9s
Wall time: 41min 10s


## Using unigrams and bigrams

In [32]:
Tfidf_vect = TfidfVectorizer(max_features=35000, ngram_range=(1,2))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x35000 sparse matrix of type '<class 'numpy.float64'>'
	with 9389710 stored elements in Compressed Sparse Row format>

In [33]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [34]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.08191808191809
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9970
           1       0.98      0.98      0.98     10050

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 26min 8s, sys: 325 ms, total: 26min 8s
Wall time: 26min 9s


In [35]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  97.96703296703296
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      9970
           1       0.98      0.98      0.98     10050

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 1h 5min 24s, sys: 534 ms, total: 1h 5min 24s
Wall time: 1h 5min 27s


## Using unigrams, bigrams and trigrams

In [8]:
Tfidf_vect = TfidfVectorizer(max_features=35000, ngram_range=(1,3))
Tfidf_vect.fit(Corpus['response'])
X_Tfidf = Tfidf_vect.transform(Corpus['response'])
X_Tfidf

<66732x35000 sparse matrix of type '<class 'numpy.float64'>'
	with 10203439 stored elements in Compressed Sparse Row format>

In [9]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X_Tfidf,Y,test_size=0.3)

In [38]:
%%time
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_SVM = SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("Linear Kernel SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_SVM))

Linear Kernel SVM Accuracy Score ->  98.29670329670328
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10074
           1       0.98      0.98      0.98      9946

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 26min 5s, sys: 389 ms, total: 26min 6s
Wall time: 26min 7s


In [10]:
%%time
RBF_SVM = svm.SVC(C=1.0, kernel='rbf')
RBF_SVM.fit(Train_X,Train_Y)
# predict the labels on test dataset
predictions_RBF_SVM = RBF_SVM.predict(Test_X)
# Use accuracy_score function to get the accuracy
print("RBF Kernel SVM Accuracy Score -> ",accuracy_score(predictions_RBF_SVM, Test_Y)*100)
print(classification_report(Test_Y, predictions_RBF_SVM))

RBF Kernel SVM Accuracy Score ->  98.34665334665334
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10028
           1       0.98      0.98      0.98      9992

    accuracy                           0.98     20020
   macro avg       0.98      0.98      0.98     20020
weighted avg       0.98      0.98      0.98     20020

CPU times: user 49min 29s, sys: 540 ms, total: 49min 30s
Wall time: 49min 31s


In [11]:
# save the model to disk
import pickle
with open('svm_rbf_model_35000_features.pkl','wb') as f:
    pickle.dump(RBF_SVM,f)

In [12]:
# save the model to disk
import pickle
with open('tfidf_vectorizer_35000_features.pkl','wb') as f:
    pickle.dump(Tfidf_vect,f)