In [5]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

In [2]:
# Read the dataset and keep its 'target' column as the labels for the models below.
df = pd.read_csv(filepath_or_buffer='../raw_data/clean_data.csv')
labels = df['target']

In [9]:
# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so the split (and every score computed from it)
# is identical after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], labels, test_size=0.2, random_state=42
)

In [10]:
# Baseline: TF-IDF features fed into a PassiveAggressiveClassifier.
# (Naive Bayes was tried as the classifier first, but its accuracy was only ~82%.)
# random_state fixes the classifier's data shuffling so the CV scores are repeatable.
pipe = make_pipeline(TfidfVectorizer(), PassiveAggressiveClassifier(random_state=42))
cv_results = cross_validate(pipe, x_train, y_train, cv=5)
cv_results


{'fit_time': array([3.00376081, 2.70196581, 2.72558498, 2.6647532 , 2.68786097]),
 'score_time': array([0.65076613, 0.86643505, 0.6467731 , 0.65880203, 0.64156508]),
 'test_score': array([0.93293886, 0.93688363, 0.9260355 , 0.92102665, 0.92201382])}

In [13]:
# Tune the vectorizer's max_df (terms with a document frequency above this fraction are ignored)
# using 5-fold cross-validation on the training split.
params = {'tfidfvectorizer__max_df': (.4, .5, .6, .7, .8)}
grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# In the recorded run, max_df = 0.8 gives the highest mean cross-validated accuracy (~92.74%).
# NOTE(review): 0.8 is the upper edge of the grid, so larger values are untested.
grid_search.best_params_

{'tfidfvectorizer__max_df': 0.8}

In [14]:
# Mean cross-validated score of the best max_df setting found by the grid search above.
grid_search.best_score_

0.9273863833283682

In [15]:
# Build the final vectorizer with the max_df selected by the grid search rather than a
# hard-coded copy of it, so this cell cannot drift from the tuning result.
# NOTE(review): stop_words='english' was not part of the tuned pipeline — confirm it is intended.
best_max_df = grid_search.best_params_['tfidfvectorizer__max_df']
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=best_max_df)

# Fit on the training text only, then apply the fitted vectorizer to the test text
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [16]:
# Train a PassiveAggressiveClassifier on the TF-IDF training matrix.
# random_state makes the reported accuracy repeatable across runs.
# NOTE(review): max_iter=50 differs from the default used in the cross-validated pipeline — confirm intended.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)

# Predict on the held-out test set and report accuracy as a percentage
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.29%


In [17]:
# Confusion matrix for the test-set predictions, with classes ordered 0 then 1
# (rows: true label, columns: predicted label).
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

array([[616,  44],
       [ 41, 566]])