In [4]:
# Importing necessary modules and functions
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import io

# Function to load TripAdvisor data
def load_tripadvisor_data(data_file, num_rows=2000, num_clssifier_rows=50):
    """
      Load TripAdvisor data from a CSV file.

      Args:
      - data_file (str): Path to the CSV file containing the data.
      - num_rows (int): Maximum number of rows to consider from the data file.
      - num_clssifier_rows (int): Number of rows to use for classification purposes.

      Returns:
      - texts_equal (list): List of equalized texts.
      - labels_equal (list): List of equalized labels corresponding to the texts_equal.
      - classifying_texts (list): List of texts for classification purposes.
      - classifying_labels (list): List of labels corresponding to the classifying_texts.
    """

    df = pd.read_csv(data_file, encoding='utf-8')
    texts = df['Review'].tolist()
    labels = df['Rating'].tolist()

    df_equal = df.groupby('Rating', group_keys=False).apply(lambda x: x.sample(min(len(x), 400)))

    if len(df_equal) > num_rows:
        df_equal = df_equal.groupby('Rating', group_keys=False).apply(lambda x: x.sample(min(len(x), num_rows // 5)))

    texts_equal = df_equal['Review'].tolist()
    labels_equal = [0 if rating == 1 else 1 if rating == 2 else 2 if rating == 3 else 3 if rating == 4 else 4 for rating in df_equal['Rating'].tolist()]

    classifying_texts = []
    classifying_labels = []
    for index, row in df.iterrows():
        if row['Review'] not in texts_equal:
            classifying_texts.append(row['Review'])
            classifying_labels.append(0 if row['Rating'] == 1 else 1 if row['Rating'] == 2 else 2 if row['Rating'] == 3 else 3 if row['Rating'] == 4 else 4)

        if len(classifying_texts) == num_clssifier_rows:
            break

    return texts_equal, labels_equal, classifying_texts, classifying_labels

In [5]:
# Load google drive files (next step is to add csv file with dataset into the selected drive)
from google.colab import drive

# Mount Google Drive to '/content/drive'
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Defining tripadvisor location on Google Drive
# Link to original dataset: https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews?resource=download
data_file = '/content/drive/My Drive/datasets/tripadvisor_hotel_reviews.csv'

# Calling the load_tripadvisor_data function with the specified data_file
texts, labels, classifying_texts, classifying_labels = load_tripadvisor_data(data_file)

In [None]:
# Dataset class for classification
class TextClassificationDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length):
          """
            Initialize the TextClassificationDataset.

            Args:
            - texts (list): List of texts.
            - labels (list): List of corresponding labels.
            - tokenizer (object): Tokenizer object for tokenizing the texts.
            - max_length (int): Maximum length for tokenization.

            Returns:
              None
          """

          self.texts = texts
          self.labels = labels
          self.tokenizer = tokenizer
          self.max_length = max_length
  def __len__(self):
          """
            Get the length of the dataset.

            Returns:
            int: Length of the dataset.
          """

          return len(self.texts)
  def __getitem__(self, idx):
          """
            Get a sample from the dataset at the specified index.

            Args:
            - idx (int): Index of the sample to retrieve.

            Returns:
            dict: A dictionary containing the tokenized input, attention mask, and label.
          """

          text = self.texts[idx]
          label = self.labels[idx]

          # Tokenize the text using the provided tokenizer
          encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

          # Return a dictionary containing the tokenized input, attention mask, and label
          return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [7]:
import torch.nn as nn

# BERT-based classifier model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        """
          Initialize the BERT-based classifier model.

          Args:
          - bert_model_name (str): Name of the BERT model to use.
          - num_classes (int): Number of classes for classification.

          Returns:
          None
        """

        super(BERTClassifier, self).__init__()

        # Load the BERT model
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.1)

        # Fully connected layer for classification
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """
          Forward pass of the classifier model.

          Args:
          - input_ids (tensor): Tensor containing the input token IDs.
          - attention_mask (tensor): Tensor containing the attention mask.

          Returns:
          tensor: Logits produced by the classifier.
        """

        # Pass input through the BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the pooled output (CLS token representation)
        pooled_output = outputs.pooler_output

        # Apply dropout for regularization
        x = self.dropout(pooled_output)

        # Apply fully connected layer for classification
        logits = self.fc(x)
        return logits

In [None]:
# Model training function
def train(model, data_loader, optimizer, scheduler, device):
    """
      Train the provided model using the given data loader.

      Args:
      - model (nn.Module): The model to train.
      - data_loader (DataLoader): DataLoader containing the training data.
      - optimizer (optim.Optimizer): Optimizer for updating model parameters.
      - scheduler (optim.lr_scheduler._LRScheduler): Scheduler for adjusting learning rate.
      - device (str): Device (cpu or cuda) on which to perform training.

      Returns:
        None
    """
    # Set the model to training mode
    model.train()

    # Iterate over batches in the data loader
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [None]:
# Model evaluation function
def evaluate(model, data_loader, device):
    """
      Evaluate the provided model using the given data loader.

      Args:
      - model (nn.Module): The model to evaluate.
      - data_loader (DataLoader): DataLoader containing the evaluation data.
      - device (str): Device (cpu or cuda) on which to perform evaluation.

      Returns:
      - accuracy (float): Accuracy of the model on the evaluation data.
      - classification_report (str): Classification report containing precision, recall, and F1-score.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [None]:
# Prediction function
def predict_rating(text, model, tokenizer, device, max_length=128):
    """
      Predict the rating category for the given text using the provided model and tokenizer.

      Args:
      - text (str): The input text to predict the rating category for.
      - model (nn.Module): The trained model for prediction.
      - tokenizer (Tokenizer): Tokenizer object for tokenizing the text.
      - device (str): Device (cpu or cuda) on which to perform prediction.
      - max_length (int): Maximum length for tokenization.

      Returns:
      - rating_category (str): Predicted rating category for the text.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
    return "negative" if preds.item() == 0 else "below average" if preds.item() == 1 else "average" if preds.item() == 2 else "above average" if preds.item() == 3 else "positive" if preds.item() == 4 else "none"

In [None]:
# Model parameters (learning rate, batch size, ...)
bert_model_name = 'bert-base-uncased'
num_classes = 5
max_length = 128
batch_size = 16
num_epochs = 2
learning_rate = 2e-5

In [None]:
# Split data into training and validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.20, random_state=42)

In [None]:
# Tokenizer, datasets and loaders
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Setup device for training and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Setup optimizer for optimized training
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# Training
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.4375
              precision    recall  f1-score   support

           0       0.45      0.95      0.61        83
           1       0.27      0.07      0.12        82
           2       0.00      0.00      0.00        68
           3       0.24      0.13      0.17        77
           4       0.50      0.89      0.64        90

    accuracy                           0.44       400
   macro avg       0.29      0.41      0.31       400
weighted avg       0.31      0.44      0.33       400

Epoch 2/2
Validation Accuracy: 0.5400
              precision    recall  f1-score   support

           0       0.66      0.77      0.71        83
           1       0.48      0.34      0.40        82
           2       0.45      0.43      0.44        68
           3       0.37      0.21      0.27        77
           4       0.58      0.88      0.70        90

    accuracy                           0.54       400
   macro avg       0.51      0.52      0.50       400
weighted av

In [None]:
# Saving final model so we can test it
torch.save(model.state_dict(), "bert_classifier.pth")

In [None]:
# Test rating prediction with our dummy comments
test_text = "The hotel was great and I really enjoyed my time there."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

The hotel was great and I really enjoyed my time there.
Predicted rating: positive


In [None]:
# Test rating prediction with our dummy comments
test_text = "The hotel was so bad and I would not recommend it to anyone."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

The hotel was so bad and I would not recommend it to anyone.
Predicted rating: negative


In [None]:
# Test rating prediction with our dummy comments
test_text = "Worst hotel ever."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

Worst hotel ever.
Predicted rating: negative


In [None]:
# Test rating prediction with our dummy comments
test_text = "Woman at reception was so angry but beautiful"
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

Woman at reception was so angry but beautiful
Predicted rating: negative


In [None]:
# Test rating prediction with our dummy comments
test_text = "My recent stay at the Riverside Hotel was decent, though it left something to be desired. The location was convenient, situated close to downtown and with easy access to public transportation. The lobby and common areas were clean and well-maintained, creating a pleasant first impression.However, once I got to my room, I noticed a few issues. While it was clean, the furnishings seemed a bit outdated, and the overall decor could use a refresh. The bed was comfortable enough, but the linens were a bit worn, and there were some stains on the carpet. The bathroom was functional but lacked some of the amenities I have come to expect from similar hotels."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

My recent stay at the Riverside Hotel was decent, though it left something to be desired. The location was convenient, situated close to downtown and with easy access to public transportation. The lobby and common areas were clean and well-maintained, creating a pleasant first impression.However, once I got to my room, I noticed a few issues. While it was clean, the furnishings seemed a bit outdated, and the overall decor could use a refresh. The bed was comfortable enough, but the linens were a bit worn, and there were some stains on the carpet. The bathroom was functional but lacked some of the amenities I have come to expect from similar hotels.
Predicted rating: above average


In [None]:
# Test rating prediction with our dummy comments
test_text = "My recent stay at the Lakeside Inn left me with mixed feelings. The location was picturesque, with stunning views of the lake and surrounding mountains. The tranquility of the setting provided a peaceful escape from the hustle and bustle of daily life. Additionally, the check-in process was smooth, and the staff at the front desk were polite and welcoming. However, despite these highlights, there were several aspects of my stay that fell short of expectations. The room I was assigned to was disappointing, to say the least. While it was clean, it lacked the comfort and modern amenities I had hoped for. The furniture appeared worn-out, and the bed was uncomfortably firm, making it difficult to get a good night's sleep. Furthermore, the bathroom was in need of renovation, with outdated fixtures and limited toiletries provided. The dining experience at the hotel's restaurant was another letdown. Although the food was decent, the service was slow, and it took ages for our orders to arrive. The menu options were limited, and the prices seemed a bit steep for the quality of the meals offered."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

My recent stay at the Lakeside Inn left me with mixed feelings. The location was picturesque, with stunning views of the lake and surrounding mountains. The tranquility of the setting provided a peaceful escape from the hustle and bustle of daily life. Additionally, the check-in process was smooth, and the staff at the front desk were polite and welcoming. However, despite these highlights, there were several aspects of my stay that fell short of expectations. The room I was assigned to was disappointing, to say the least. While it was clean, it lacked the comfort and modern amenities I had hoped for. The furniture appeared worn-out, and the bed was uncomfortably firm, making it difficult to get a good night's sleep. Furthermore, the bathroom was in need of renovation, with outdated fixtures and limited toiletries provided. The dining experience at the hotel's restaurant was another letdown. Although the food was decent, the service was slow, and it took ages for our orders to arrive. 

In [None]:
# Test rating prediction with our dummy comments
test_text = "My recent experience at the Sunset Hotel was incredibly disappointing, to say the least. Where do I even begin? Firstly, let's talk about the room. It was nothing short of a disaster. The cleanliness was severely lacking, with visible stains on the carpet and bedding that made me question when they were last washed. The furniture looked like it had been salvaged from a thrift store, and the overall ambiance was far from inviting. ut wait, it gets worse. The bathroom was a nightmare. Not only was it outdated, but it also had mold growing in the corners and a foul odor that permeated the entire room. I felt like I needed a hazmat suit just to step inside."
rating = predict_rating(test_text, model, tokenizer, device)
print(test_text)
print(f"Predicted rating: {rating}")

My recent experience at the Sunset Hotel was incredibly disappointing, to say the least. Where do I even begin? Firstly, let's talk about the room. It was nothing short of a disaster. The cleanliness was severely lacking, with visible stains on the carpet and bedding that made me question when they were last washed. The furniture looked like it had been salvaged from a thrift store, and the overall ambiance was far from inviting. ut wait, it gets worse. The bathroom was a nightmare. Not only was it outdated, but it also had mold growing in the corners and a foul odor that permeated the entire room. I felt like I needed a hazmat suit just to step inside.
Predicted rating: negative


In [None]:
def predict_rating_percentage(texts, labels, model, tokenizer, device):
    """
      Predict the accuracy percentage of the model on the provided texts and labels.

      Args:
      - texts (list): List of input texts.
      - labels (list): List of corresponding labels.
      - model (nn.Module): The trained model for prediction.
      - tokenizer (Tokenizer): Tokenizer object for tokenizing the texts.
      - device (str): Device (cpu or cuda) on which to perform prediction.

      Returns:
      - average_accuracy (float): Average accuracy percentage of the model on the provided texts.
    """
    predictions = []
    total_accuracy = 0
    total_texts = len(texts)
    for text, label in zip(texts, labels):
        rating = predict_rating(text, model, tokenizer, device)
        predicted_rating = 0
        if rating == "negative":
            predicted_rating = 0
        elif rating == "below average":
            predicted_rating = 1
        elif rating == "average":
            predicted_rating = 2
        elif rating == "above average":
            predicted_rating = 3
        elif rating == "positive":
            predicted_rating = 4
        else:
            predicted_rating = -1  # Unknown label

        # Calculate accuracy
        if predicted_rating == label:
            total_accuracy += 1
        else:
            # Calculate the difference between predicted and actual rating
            difference = abs(predicted_rating - label)
            # Calculate accuracy based on the difference
            accuracy = 1 - (difference / 4)  # Since ratings range from 0 to 4
            total_accuracy += accuracy

    average_accuracy = (total_accuracy / total_texts) * 100
    return average_accuracy

In [None]:
# 89.5% accuracy is obtained with the provided data for classifying purpose
test_accuracy = predict_rating_percentage(classifying_texts, classifying_labels, model, tokenizer, device)
print("Accuracy:", test_accuracy, "%")

Accuracy: 89.5 %
