In [1]:
import re

import seaborn as sns

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
data = pd.read_csv('all_reviews_refined_copy.csv')
data.head()

Unnamed: 0,review_text,review_rating,emotion
0,The service I had from the staff was very good...,4,1
1,The staff were lovely and I couldn't fault it....,4,1
2,I attended Russell's hall a&e department with ...,5,1
3,"My mom was admitted here, She cried that one ...",1,-1
4,The 2☆ go to the staff on the MECU who made su...,2,-1


In [3]:
reviews = data["review_text"]  
emotions = data["emotion"]  

In [4]:
def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)  
    text = re.sub(r"<[^>]*>", "", text)  
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text

reviews = reviews.apply(clean_text)

In [5]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  
features = vectorizer.fit_transform(reviews)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(features, emotions, test_size=0.2, random_state=42)

In [103]:
svm_model = RandomForestClassifier(kernel='linear')
svm_model.fit(X_train, y_train)

In [104]:
# Comment
y_pred = svm_model.predict(X_test)

In [105]:
# Comment
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred,average='micro' )

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.8818263205013429
Precision: 0.8818263205013429
Recall: 0.8818263205013429
F1-score: 0.8818263205013429


In [106]:
 X_train.shape, y_train.shape
X_test.shape, y_test.shape


((1117, 10000), (1117,))

In [108]:
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, features, emotions, cv=10)

print(f"Mean Accuracy: {scores.mean():.2f}")
print(f"Standard Deviation of Accuracy: {scores.std():.2f}")
scores

Mean Accuracy: 0.89
Standard Deviation of Accuracy: 0.01


array([0.91413238, 0.89624329, 0.86762075, 0.89087657, 0.89445438,
       0.89605735, 0.87992832, 0.89784946, 0.87992832, 0.86738351])

In [None]:
# Comment
counts, bins, patches = plt.hist(emotions, bins=3, range=(-1.5, 1.5), color='blue')  # Customize bins if needed
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.title("Distribution of Ratings")

# Comment
plt.xticks([-1, 0, 1])

# Comment
for idx, count in enumerate(counts):
    plt.text(bins[idx] + (bins[idx + 1] - bins[idx]) / 2, count + 0.1, f"{int(count)}", ha="center")

plt.show()

In [109]:
google_review = np.array(["yes, no"])  # Note the square brackets to make it an array of strings
review_vector = vectorizer.transform(google_review)
predicted_sentiment = svm_model.predict(review_vector)
print(predicted_sentiment)

[-1]
