In [184]:
import re

import seaborn as sns

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

from sklearn import svm

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [185]:
data = pd.read_csv('all_reviews_refined_copy.csv')
data.head()

Unnamed: 0,review_text,review_rating,emotion
0,The service I had from the staff was very good...,4,1
1,The staff were lovely and I couldn't fault it....,4,1
2,I attended Russell's hall a&e department with ...,5,1
3,"My mom was admitted here, She cried that one ...",1,-1
4,The 2☆ go to the staff on the MECU who made su...,2,-1


In [186]:
reviews = data["review_text"]  
emotions = data["emotion"]  

In [187]:
def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)  
    text = re.sub(r"<[^>]*>", "", text)  
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text

reviews = reviews.apply(clean_text)

In [188]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  
features = vectorizer.fit_transform(reviews)

In [189]:
X_train, X_test, y_train, y_test = train_test_split(features, emotions, test_size=0.2, random_state=42)

In [190]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [191]:
lg_model = LogisticRegression(solver="lbfgs" , max_iter=1000)
lg_model.fit(X_train, y_train)

In [192]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [193]:
rf_model = RandomForestClassifier(n_estimators=10)
rf_model.fit(X_train, y_train)

In [194]:
# Comment
y_pred = svm_model.predict(X_test)

In [195]:
y_pred_2 = lg_model.predict(X_test)

In [196]:
y_pred_3 = nb_model.predict(X_test)

In [197]:
y_pred_4 = rf_model.predict(X_test)

In [198]:
# Comment
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred,average='micro' )

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.8818263205013429
Precision: 0.8818263205013429
Recall: 0.8818263205013429
F1-score: 0.8818263205013429


In [199]:
# Comment
accuracy_2 = accuracy_score(y_test, y_pred_2)
precision_2 = precision_score(y_test,y_pred_2, average='micro')
recall_2 = recall_score(y_test, y_pred_2, average='micro')
f1_2 = f1_score(y_test,y_pred_2,average='micro' )

print("Accuracy:", accuracy_2)
print("Precision:", precision_2)
print("Recall:", recall_2)
print("F1-score:", f1_2)

Accuracy: 0.8898836168307968
Precision: 0.8898836168307968
Recall: 0.8898836168307968
F1-score: 0.8898836168307966


In [200]:
accuracy_3 = accuracy_score(y_test, y_pred_3)
precision_3 = precision_score(y_test, y_pred_3, average='micro')
recall_3 = recall_score(y_test, y_pred_3, average='micro')
f1_3 = f1_score(y_test, y_pred_3,average='micro' )

print("Accuracy:", accuracy_3)
print("Precision:", precision_3)
print("Recall:", recall_3)
print("F1-score:", f1_3)

Accuracy: 0.8728737690241719
Precision: 0.8728737690241719
Recall: 0.8728737690241719
F1-score: 0.8728737690241719


In [201]:
# Comment
accuracy_4 = accuracy_score(y_test, y_pred_4)
precision_4 = precision_score(y_test, y_pred_4, average='micro')
recall_4 = recall_score(y_test, y_pred_4, average='micro')
f1_4 = f1_score(y_test, y_pred_4,average='micro' )

print("Accuracy:", accuracy_4)
print("Precision:", precision_4)
print("Recall:", recall_4)
print("F1-score:", f1_4)

Accuracy: 0.8173679498657117
Precision: 0.8173679498657117
Recall: 0.8173679498657117
F1-score: 0.8173679498657117


In [202]:
X_train.shape, y_train.shape
X_test.shape, y_test.shape

((1117, 10000), (1117,))

In [203]:
'''clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, features, emotions, cv=10)

print(f"Mean Accuracy: {scores.mean():.2f}")
print(f"Standard Deviation: {scores.std():.2f}")
scores'''

'clf = svm.SVC(kernel=\'linear\', C=1, random_state=42)\nscores = cross_val_score(clf, features, emotions, cv=10)\n\nprint(f"Mean Accuracy: {scores.mean():.2f}")\nprint(f"Standard Deviation: {scores.std():.2f}")\nscores'

In [None]:
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
scores = cross_val_score(svm_model, features, emotions, cv=kf, scoring='accuracy')

print(f"Mean Accuracy: {scores2.mean():.2f}")
print(f"Standard Deviation: {scores2.std():.2f}")
scores

In [None]:
num_folds = 10
kf1 = KFold(n_splits=num_folds, shuffle=True, random_state=42)
scores2 = cross_val_score(lg_model, features, emotions, cv=kf, scoring='accuracy')

print(f"Mean Accuracy: {scores2.mean():.2f}")
print(f"Standard Deviation: {scores2.std():.2f}")
scores2

In [None]:
n_folds = 10
kf2 = KFold(n_splits=n_folds, shuffle=True, random_state=42)
scores3 = cross_val_score(nb_model, features, emotions, cv=kf, scoring='accuracy')

print(f"Mean Accuracy: {scores3.mean():.2f}")
print(f"Standard Deviation: {scores3.std():.2f}")
scores3

In [None]:
n_folds = 10
kf3 = KFold(n_splits=n_folds, shuffle=True, random_state=42)
scores4 = cross_val_score(rf_model, features, emotions, cv=kf, scoring='accuracy')

print(f"Mean Accuracy: {scores4.mean():.2f}")
print(f"Standard Deviation: {scores4.std():.2f}")

scores4

In [None]:

counts, bins, patches = plt.hist(emotions, bins=3, range=(-1.5, 1.5), color='blue') 
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.title("Distribution of Ratings")


plt.xticks([-1, 0, 1])


for idx, count in enumerate(counts):
    plt.text(bins[idx] + (bins[idx + 1] - bins[idx]) / 2, count + 0.1, f"{int(count)}", ha="center")

plt.show()

In [None]:
google_review = np.array(["yes"]) 
review_vector = vectorizer.transform(google_review)
predicted_sentiment = svm_model.predict(review_vector)
print(predicted_sentiment)