<a href="https://colab.research.google.com/github/dhyeyvshah/hotel-review-sentiment-analysis/blob/main/HotelReviewSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the "ggplot" matplotlib style to the figures drawn later in the notebook.
plt.style.use("ggplot")

import nltk
from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored there can be read.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# The dataset is read from the Google Drive mounted at /content/gdrive. The
# previous bare "/tripadvisor_hotel_reviews.csv" pointed at the filesystem
# root and raised FileNotFoundError.
# NOTE(review): assumes the CSV sits at the top level of "My Drive" — adjust
# DATA_PATH if it is stored in a subfolder.
DATA_PATH = "/content/gdrive/MyDrive/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/tripadvisor_hotel_reviews.csv'

In [None]:
# Preview the first rows of the loaded review table.
df.head()

In [None]:
# (rows, columns) of the loaded review table.
df.shape

In [None]:
# Bar chart of how many reviews carry each rating. sort_index() orders the
# bars by rating value instead of by frequency.
rating_counts = df["Rating"].value_counts().sort_index()
ax = rating_counts.plot(kind="bar",
                        title="Count of Reviews by Stars",
                        figsize=(10, 5))
ax.set_xlabel("Review Rating")
ax.set_ylabel("Number of Reviews")  # the y-axis was previously unlabeled
plt.show()

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# SentimentIntensityAnalyzer loads the "vader_lexicon" resource on construction
# and raises LookupError when it is missing, which is the case on a fresh
# runtime. The download is a no-op once the resource is present.
nltk.download("vader_lexicon", quiet=True)

# Shared VADER analyzer used by the scoring cells below.
sia = SentimentIntensityAnalyzer()

In [None]:
# nltk.word_tokenize needs the Punkt tokenizer data, which is not present on a
# fresh runtime. Recent NLTK releases look for "punkt_tab", older ones for
# "punkt"; requesting both keeps the cell working across versions.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

results = {}

# Keep only reviews with at most 512 NLTK word tokens.
# NOTE(review): 512 presumably mirrors the max_length used for the RoBERTa
# tokenizer later on; NLTK word counts and sub-word token counts are not the
# same thing — confirm the intent.
df = df[df["Review"].apply(lambda x: len(nltk.word_tokenize(x)) <= 512)]

# Score every remaining review with VADER, keyed by the DataFrame index so the
# scores can be aligned back onto df. Iterating the single column avoids
# building a full row Series per review as iterrows() does.
for i, review in tqdm(df["Review"].items(), total=len(df)):
    results[i] = sia.polarity_scores(review)

In [None]:
# `results` maps row index -> score dict, so the transposed frame has one row
# per review and one column per VADER score.
vader_table = pd.DataFrame(results).transpose()

# Attach the original review columns side by side (aligned on the index).
vaders = pd.concat((vader_table, df), axis="columns")
vaders.head()

In [None]:
# One panel per VADER component, each showing the score against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (("pos", "Positive"), ("neu", "Neutral"), ("neg", "Negative"))
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x="Rating", y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
import torch

# Hugging Face checkpoint used for the transformer-based sentiment scores.
# (Plain string: the previous f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Use the first GPU when one is available and fall back to CPU otherwise; the
# previous hard-coded "cuda:0" made model.to(device) fail on CPU-only runtimes.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL, clean_up_tokenization_spaces=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to(device)

In [None]:
import torch


def polarity_scores_roberta(example):
    """Score one text with the module-level RoBERTa sentiment model.

    The text is tokenized (truncated to 512 tokens), moved to ``device``, run
    through ``model``, and the first output row is turned into probabilities
    with softmax.

    Parameters
    ----------
    example : str
        The text to score.

    Returns
    -------
    dict
        ``{"roberta_neg": ..., "roberta_neu": ..., "roberta_pos": ...}`` taken
        from output positions 0, 1 and 2 respectively.
        NOTE(review): assumes the checkpoint's label order is
        negative / neutral / positive — confirm against the model config.
    """
    encoded_text = tokenizer(example, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Inference only: disabling autograd avoids building a gradient graph for
    # every review, which wasted memory and time in the scoring loop.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = output[0][0].cpu().detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        "roberta_neg" : scores[0],
        "roberta_neu" : scores[1],
        "roberta_pos" : scores[2]
    }
    return scores_dict

In [None]:
# Score every review with both models; `res` maps the DataFrame index to a
# single dict holding the prefixed VADER scores plus the RoBERTa scores.
res = {}

for idx, text in tqdm(df["Review"].items(), total=len(df)):
    # Prefix the VADER keys so they cannot collide with the RoBERTa keys.
    vader_prefixed = {
        f"vader_{name}": score
        for name, score in sia.polarity_scores(text).items()
    }
    res[idx] = {**vader_prefixed, **polarity_scores_roberta(text)}

In [None]:
# `res` maps row index -> score dict; transposing yields one row per review.
score_table = pd.DataFrame(res).transpose()

# Put the model scores next to the original review columns (index-aligned).
results_df = pd.concat((score_table, df), axis="columns")
results_df.columns

In [None]:
# Pairwise comparison of all VADER and RoBERTa scores, colored by star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(results_df, vars=score_columns, hue='Rating', palette='tab10')
plt.show()

In [None]:
# Rating-1 reviews, ordered from the highest RoBERTa positive score downwards.
one_star_reviews = results_df.query('Rating == 1')
one_star_reviews.sort_values(by='roberta_pos', ascending=False)

In [None]:
# Text of the rating-1 review with the highest VADER positive score.
one_star_by_vader = results_df.query('Rating == 1').sort_values(by='vader_pos', ascending=False)
one_star_by_vader['Review'].values[0]

In [None]:
from transformers import pipeline

# Pin the checkpoint explicitly. With no `model` argument the pipeline falls
# back to whatever default the installed transformers version ships (and warns
# about it), so results could change between environments.
# NOTE(review): this id is believed to be the library's current default for
# "sentiment-analysis" — confirm it matches the model the analysis intended.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Quick check of the pipeline on a single short string.
sent_pipeline('booo')