In [28]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix, classification_report
import pickle

In [29]:

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Load the data
df = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [31]:
df

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [32]:
stopset = set(stopwords.words('english'))

In [33]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

In [34]:
X = vectorizer.fit_transform(df.Comments)
y = df.Reviews
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
nb_clf = naive_bayes.MultinomialNB()
nb_clf.fit(X_train,y_train)


evaluate the model

In [37]:
# Make predictions on the test set
y_pred = nb_clf.predict(X_test)


In [38]:
# Evaluate the model
# Calculate accuracy and ROC-AUC score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
nb_cm = confusion_matrix(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)


In [39]:
# Calculate Naive Bayes precision and recall
nb_precision = nb_cm[1,1] / (nb_cm[0,1] + nb_cm[1,1])
nb_recall = nb_cm[1,1] / (nb_cm[1,0] + nb_cm[1,1])

In [40]:
print(f"Accuracy: {accuracy}")
print(f"ROC-AUC score: {roc_auc}")
print("Naive Bayes precision:", nb_precision)
print("Naive Bayes recall:", nb_recall)
print("Naive Bayes confusion matrix:")
print(nb_cm)
print("Naive Bayes classification report:")
print(nb_report)

Accuracy: 0.9747109826589595
ROC-AUC score: 0.9722293703894321
Naive Bayes precision: 0.9694749694749695
Naive Bayes recall: 0.9875621890547264
Naive Bayes confusion matrix:
[[555  25]
 [ 10 794]]
Naive Bayes classification report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       580
           1       0.97      0.99      0.98       804

    accuracy                           0.97      1384
   macro avg       0.98      0.97      0.97      1384
weighted avg       0.97      0.97      0.97      1384



In [41]:
nb_clf = naive_bayes.MultinomialNB()
nb_clf.fit(X,y)

In [42]:
accuracy_score(y_test,nb_clf.predict(X_test))*100

98.77167630057804

In [43]:
filename = 'similarity.pkl'
pickle.dump(nb_clf, open(filename, 'wb'))

# Cross-validation

In [44]:
from sklearn.model_selection import cross_val_score

# Create a new model
nb_clf = naive_bayes.MultinomialNB()

# Evaluate the model using 10-fold cross-validation
scores = cross_val_score(nb_clf, X, y, cv=10, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))


Cross-validation scores: [0.96242775 1.         0.98121387 0.84248555 1.         0.96242775
 0.99277457 0.97254335 1.         1.        ]
Mean accuracy: 0.9713872832369942


using Svm model

In [45]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
from sklearn import svm


In [47]:
# Create a new SVM model
svm_clf = svm.SVC(kernel='linear', probability=True)

In [48]:
# Train the model
svm_clf.fit(X_train, y_train)

In [49]:
# Make predictions on the test set
y_pred = svm_clf.predict(X_test)

In [50]:
# Evaluate the model
# Calculate accuracy and ROC-AUC score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
svm_cm = confusion_matrix(y_test, y_pred)
svm_report = classification_report(y_test, y_pred)

In [51]:
# Calculate SVM precision and recall
svm_precision = svm_cm[1,1] / (svm_cm[0,1] + svm_cm[1,1])
svm_recall = svm_cm[1,1] / (svm_cm[1,0] + svm_cm[1,1])

In [52]:
print(f"Accuracy: {accuracy}")
print(f"ROC-AUC score: {roc_auc}")
print("SVM precision:", svm_precision)
print("SVM recall:", svm_recall)
print("SVM confusion matrix:")
print(svm_cm)
print("SVM classification report:")
print(svm_report)

Accuracy: 0.990606936416185
ROC-AUC score: 0.9897538171212902
SVM precision: 0.9888751545117429
SVM recall: 0.9950248756218906
SVM confusion matrix:
[[571   9]
 [  4 800]]
SVM classification report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       580
           1       0.99      1.00      0.99       804

    accuracy                           0.99      1384
   macro avg       0.99      0.99      0.99      1384
weighted avg       0.99      0.99      0.99      1384

