In [67]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, svm
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [68]:
# Download the stopwords from the nltk package
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the data

In [69]:
# Load the data
df = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

# Explore the data

In [70]:
df

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


# Preprocess the data

In [71]:
# Define the set of stopwords
stopset = set(stopwords.words('english'))

# Create a TfidfVectorizer object to convert the text data into a numerical matrix of features

In [72]:
# Create a TfidfVectorizer object to convert the text data into a numerical matrix of features
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

# Convert the comments to a matrix of TF-IDF features

In [73]:
# Convert the comments to a matrix of TF-IDF features
X = vectorizer.fit_transform(df.Comments)
y = df.Reviews

# Save the vectorizer object for later use
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

# Split the data into training and testing sets

In [74]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Multinomial Naive Bayes classifier object and train it on the training set

In [75]:
# Create a Multinomial Naive Bayes classifier object and train it on the training set
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)


evaluate the model

# Evaluate the model

In [76]:
# Make predictions on the test set
y_pred = clf.predict(X_test)


# Evaluate the model

In [77]:
# Evaluate the model
# Calculate accuracy and ROC-AUC score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results

In [78]:
print(f"Accuracy: {accuracy}")
print(f"ROC-AUC score: {roc_auc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"Classification report: {report}")
print(f"Confusion matrix: {cm}")


Accuracy: 0.9747109826589595
ROC-AUC score: 0.9722293703894321
Precision: 0.9694749694749695
Recall: 0.9875621890547264
F1 score: 0.9784349969192854
Classification report:               precision    recall  f1-score   support

           0       0.98      0.96      0.97       580
           1       0.97      0.99      0.98       804

    accuracy                           0.97      1384
   macro avg       0.98      0.97      0.97      1384
weighted avg       0.97      0.97      0.97      1384

Confusion matrix: [[555  25]
 [ 10 794]]


In [79]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

# Save the model

In [80]:
accuracy_score(y_test,clf.predict(X_test))*100

98.77167630057804

# Save the model

In [81]:
filename = 'model.pkl'
pickle.dump(clf, open(filename, 'wb'))

# Evaluate the model

In [82]:
from sklearn.model_selection import cross_val_score

# Create a new model
clf = naive_bayes.MultinomialNB()

# Evaluate the model using 10-fold cross-validation
svm_clf = svm.SVC()
# Evaluate the model using 10-fold cross-validation
svm_scores = cross_val_score(svm_clf, X, y, cv=10, scoring='accuracy')

scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')

# Print the mean accuracy score
print("Mean accuracy for svm:", np.mean(svm_scores))
print("Cross-validation scores for svm:", svm_scores)
print("------------------------------------------")

print("Mean accuracy for naive bayes:", np.mean(scores))
print("Cross-validation scores:", scores)


Mean accuracy for svm: 0.9585260115606935
Cross-validation scores for svm: [0.98265896 1.         0.91618497 0.8265896  1.         0.87138728
 0.99566474 0.99277457 1.         1.        ]
------------------------------------------
Mean accuracy for naive bayes: 0.9713872832369942
Cross-validation scores: [0.96242775 1.         0.98121387 0.84248555 1.         0.96242775
 0.99277457 0.97254335 1.         1.        ]


# Create the SVM classifier

In [83]:
# Create the SVM classifier
svm_clf = svm.SVC()

# Train the SVM classifier

In [84]:
# Train the SVM classifier
svm_clf.fit(X_train, y_train)

# Evaluate the SVM classifier

In [85]:
# Make predictions on the testing data using SVM
svm_y_pred = svm_clf.predict(X_test)

# Evaluate the SVM classifier

In [86]:
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
# Evaluate the SVM classifier
# Calculate accuracy and ROC-AUC score
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_roc_auc = roc_auc_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
svm_cm = confusion_matrix(y_test, svm_y_pred)
f1

#Calculate SVM precision and recall
svm_precision = svm_cm[1,1] / (svm_cm[0,1] + svm_cm[1,1])
svm_recall = svm_cm[1,1] / (svm_cm[1,0] + svm_cm[1,1])

# Print the results

In [87]:
print(f"Accuracy: {svm_accuracy}")
print(f"ROC-AUC score: {svm_roc_auc}")
print(f"Precision: {svm_precision}")
print(f"Recall: {svm_recall}")
print(f"Classification report: {svm_report}")


Accuracy: 0.9898843930635838
ROC-AUC score: 0.9888917481557727
Precision: 0.9876543209876543
Recall: 0.9950248756218906
Classification report:               precision    recall  f1-score   support

           0       0.99      0.98      0.99       580
           1       0.99      1.00      0.99       804

    accuracy                           0.99      1384
   macro avg       0.99      0.99      0.99      1384
weighted avg       0.99      0.99      0.99      1384



# Save the SVM classifier

In [88]:
filename = 'model2.pkl'
pickle.dump(svm_clf, open(filename, 'wb'))