In [None]:
# Installing several necessary Python packages
! pip install datasets transformers sentencepiece
! pip install accelerate -U
! pip install transformers[torch]
! pip install pytorch-transformers
! pip install accelerate==0.27.2

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [None]:
# Function for loading TripAdvisor data with reviews
import pandas as pd
def _sample_per_rating(frame, cap, rng=None):
    """
      Draw at most `cap` random rows from every 'Rating' group of `frame`.

      Groups are visited in sorted rating order and the sampled pieces are
      concatenated in that order; every sampled row keeps its original index.
    """
    pieces = [
        group.sample(min(len(group), cap), random_state=rng)
        for _, group in frame.groupby('Rating')
    ]
    # pd.concat raises on an empty list, so a frame without rows yields an empty frame
    return pd.concat(pieces) if pieces else frame.iloc[0:0]


def load_tripadvisor_data(data_file, num_rows=2000, num_clssifier_rows=50, random_state=None):
    """
      Load TripAdvisor data from a CSV file, including reviews for classification.

      The CSV must provide a 'Review' column (text) and a 'Rating' column
      (expected to hold the star ratings 1-5).

      Args:
      - data_file (str): Path to the CSV file containing the data.
      - num_rows (int): Maximum number of rows to consider from the data file.
      - num_clssifier_rows (int): Number of rows to use for classification purposes.
      - random_state (int | numpy RandomState | None): Seed (or generator) for the
        random sampling. None keeps the previous behaviour of drawing from NumPy's
        global random state.

      Returns:
      - df_equal (DataFrame): Class-balanced subset of the data: at most 400 rows per
        rating, reduced to at most num_rows // 5 rows per rating when the subset would
        otherwise exceed num_rows.
      - classifying_texts (list): Up to num_clssifier_rows reviews, taken in file order,
        whose text does not occur in df_equal.
      - classifying_labels (list): Zero-based labels (rating - 1, as plain ints)
        corresponding to classifying_texts.

      Raises:
      - ValueError: If a held-out row has a rating that cannot be converted to int.
    """

    df = pd.read_csv(data_file, encoding='utf-8')

    rng = random_state
    if isinstance(random_state, int):
        # One shared generator so every group draws a different, yet reproducible, sample
        import numpy as np
        rng = np.random.RandomState(random_state)

    df_equal = _sample_per_rating(df, 400, rng)

    if len(df_equal) > num_rows:
        df_equal = _sample_per_rating(df_equal, num_rows // 5, rng)

    # Set membership keeps the held-out scan linear in the number of reviews
    sampled_reviews = set(df_equal['Review'])

    classifying_texts = []
    classifying_labels = []
    for review, rating in zip(df['Review'], df['Rating']):
        # Checked before appending so that a request for 0 rows really returns 0 rows
        if len(classifying_texts) >= num_clssifier_rows:
            break
        if review in sampled_reviews:
            continue
        classifying_texts.append(review)
        # Same 1-5 -> 0-4 shift that map_labels applies to the training data
        classifying_labels.append(int(rating) - 1)

    return df_equal, classifying_texts, classifying_labels

In [None]:
# Load google drive files (next step is to add csv file with dataset into the selected drive)
# NOTE(review): google.colab is imported here, so this cell presumably only runs inside Colab.
from google.colab import drive

# Mount Google Drive to '/content/drive'
# The dataset CSV is read later from a path below this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import datasets
import numpy as np
from datasets import Dataset, DatasetDict

# One fixed seed so the balanced sample and the train/test split are identical on every run
SEED = 42

# Defining tripadvisor location on Google Drive
# Link to original dataset: https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews?resource=download
data_file = '/content/drive/My Drive/datasets/tripadvisor_hotel_reviews.csv'

# pandas' DataFrame.sample draws from NumPy's global random state when it is given no
# random_state, so seeding it here makes the sampling done while loading reproducible
np.random.seed(SEED)

# Load TripAdvisor data and split into train-test sets
# review_df is the balanced subset; classifying_texts / classifying_labels are held-out
# reviews that are only used for the final evaluation at the end of the notebook
review_df, classifying_texts, classifying_labels = load_tripadvisor_data(data_file)
# 80% of the balanced subset goes to "train", 20% to "test"
review_dataset = Dataset.from_pandas(review_df).train_test_split(test_size=0.2, seed=SEED)

In [None]:
# Class names ordered so that a name's list position is its label id (0-4)
label_names = ["negative", "below average", "average", "above average", "positive"]
# Lookup table: label id -> class name
id2label = dict(enumerate(label_names))

In [None]:
# Mapping ratings from TripAdvisor dataset to appropriate ids for training (0-4 instead of 1-5)
def map_labels(example):
    """
      Turn one example's star rating into the zero-based class id used for training.

      Args:
      - example (dict): A single dataset row; only its "Rating" field is read.

      Returns:
      - dict: {"labels": rating - 1, "label_name": the id2label entry for that id}.
    """
    # Ratings start at 1 while class ids start at 0
    zero_based_rating = example["Rating"] - 1
    class_name = id2label[zero_based_rating]
    return {"labels": zero_based_rating, "label_name": class_name}

In [None]:
# Applying ratings mapping function to original dataset
# map() runs on both splits and adds the two keys returned by map_labels
# ("labels" and "label_name") as new columns.
review_dataset = review_dataset.map(map_labels)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
# Reverse lookup of id2label: class name -> label id
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Defining initial model name and tokenizer
from transformers import AutoTokenizer

# The same checkpoint name is reused further down to load the classification model
model_checkpoint = "xlm-roberta-base"
# Tokenizer matching the checkpoint; tokenize_reviews reads this module-level name
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
# Tokenizing reviews data function
def tokenize_reviews(examples):
    """
      Run the notebook-level tokenizer over the "Review" field of a batch.

      Args:
      - examples (dict): Batch of example data; only its "Review" entry is read.

      Returns:
      - The tokenizer's output for those reviews, truncated to at most 180 tokens.
    """
    review_texts = examples["Review"]
    # Reviews longer than 180 tokens are cut off rather than rejected
    encoded = tokenizer(review_texts, truncation=True, max_length=180)
    return encoded

In [None]:
# Applying tokenizing to our dataset
# batched=True hands tokenize_reviews a batch of examples per call instead of one example
tokenized_dataset = review_dataset.map(tokenize_reviews, batched=True)
# Bare expression: displays the resulting splits with their columns and row counts
tokenized_dataset

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Review', 'Rating', '__index_level_0__', 'labels', 'label_name', 'input_ids', 'attention_mask'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['Review', 'Rating', '__index_level_0__', 'labels', 'label_name', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [None]:
# Test print of tokenized dataset
# Shows the first training example: the original columns, the mapped label and the
# tokenizer output ('input_ids', 'attention_mask')
tokenized_dataset["train"][0]

{'Review': 'surprise, hotel year, changed decoration rooms, design furnished, realy surprised,  ',
 'Rating': 5,
 '__index_level_0__': 9955,
 'labels': 4,
 'label_name': 'positive',
 'input_ids': [0,
  55640,
  4,
  3018,
  6602,
  4,
  98816,
  65177,
  1363,
  112875,
  4,
  4331,
  16387,
  93,
  67175,
  4,
  2773,
  53,
  144285,
  4,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
# Defining labels number and model
from transformers import AutoModelForSequenceClassification

# Derived from label_names so the class count cannot drift from the label maps
num_labels = len(label_names)

# Instantiate the sequence classification model
# label2id / id2label are stored with the model so predictions carry readable label names
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Defining configuration for training and other training arguments, including the
# directory of my repository on huggingface "darkoo59/xlm-roberta-base-finetuned-darko-tripadvisor"
from transformers import TrainingArguments

# Extracting model name from the model checkpoint path
model_name = model_checkpoint.split("/")[-1]

# Batch size for training
batch_size = 16

# Number of training epochs
num_train_epochs = 2

# Calculating logging steps for logging frequency during training
# (1600 training rows -> 1600 // (16 * 2) = 50 steps between log entries).
# max(1, ...) keeps the value positive when the training split is smaller than
# batch_size * num_train_epochs; a step interval of 0 is not a usable logging frequency.
logging_steps = max(1, len(tokenized_dataset["train"]) // (batch_size * num_train_epochs))

# Training arguments configuration
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
args = TrainingArguments(
    output_dir="darkoo59/xlm-roberta-base-finetuned-darko-tripadvisor",
    evaluation_strategy = "epoch",   # evaluate once per epoch
    save_strategy = "epoch",         # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    logging_steps=logging_steps,
    push_to_hub=True,                # requires the Hugging Face login done in a later cell
)

In [None]:
# Including mean absolute error (MAE) as a metric from the sklearn module
import numpy as np
from sklearn.metrics import mean_absolute_error

def compute_metrics(eval_pred):
    """
      Report the mean absolute error between predicted and true class ids.

      Args:
      - eval_pred (tuple): Pair of (per-class scores, true label ids).

      Returns:
      - dict: {"MAE": mean absolute error over the evaluated examples}.
    """
    class_scores, true_ids = eval_pred

    # The highest-scoring column of every row is the predicted class id
    predicted_ids = np.argmax(class_scores, axis=1)

    mae = mean_absolute_error(true_ids, predicted_ids)
    return {"MAE": mae}

In [None]:
# Logging to huggingface so we can train our model and save it to repository after that
# (the training arguments set push_to_hub=True and the trained model is pushed afterwards)
from huggingface_hub import notebook_login
# Displays an interactive login widget inside the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Instantiation of a Trainer
from transformers import Trainer

# Wires together the model, the training arguments, both tokenized splits and the MAE metric
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],   # the 20% split doubles as the validation set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Training the model
# The saved run below took about 7,600 seconds for 2 epochs (200 optimisation steps)
trainer.train()

Epoch,Training Loss,Validation Loss,Mae
1,1.5492,1.311968,0.74
2,1.1744,1.089721,0.575


TrainOutput(global_step=200, training_loss=1.4053133392333985, metrics={'train_runtime': 7634.7045, 'train_samples_per_second': 0.419, 'train_steps_per_second': 0.026, 'total_flos': 296007910272000.0, 'train_loss': 1.4053133392333985, 'epoch': 2.0})

In [None]:
# Pushing trained model to darkoo59 repository with commit message
# Returns a CommitInfo describing the commit that was created on the Hub
trainer.push_to_hub(commit_message="Training complete!")

CommitInfo(commit_url='https://huggingface.co/darkoo59/xlm-roberta-base-finetuned-darko-tripadvisor/commit/3b2dfc27897ab75a8e2ff1570cee13b417d236b5', commit_message='Training complete!', commit_description='', oid='3b2dfc27897ab75a8e2ff1570cee13b417d236b5', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Loading our fine-tuned model for testing purpose
from transformers import pipeline

# Hub repository the trained model was pushed to
finetuned_checkpoint = "darkoo59/xlm-roberta-base-finetuned-darko-tripadvisor"
# `classifier` is the module-level name the evaluation helper further down calls
classifier = pipeline("text-classification", model=finetuned_checkpoint, revision="main")


**Testing our model with our dummy review comments**

In [None]:
# Short, clearly negative review (saved run: 'negative', score about 0.23)
classifier("Worst hotel ever.")

[{'label': 'negative', 'score': 0.22566157579421997}]

In [None]:
# Short, clearly positive review (saved run: 'positive', score about 0.53)
classifier("I loved this hotel.")

[{'label': 'positive', 'score': 0.5283648371696472}]

In [None]:
# Long negative review (saved run: 'negative', score about 0.48).
# NOTE(review): "ut wait" in the text looks like a typo for "But wait"; it is left
# untouched because the string is the model input that produced the saved output.
classifier("My recent experience at the Sunset Hotel was incredibly disappointing, to say the least. Where do I even begin? Firstly, let's talk about the room. It was nothing short of a disaster. The cleanliness was severely lacking, with visible stains on the carpet and bedding that made me question when they were last washed. The furniture looked like it had been salvaged from a thrift store, and the overall ambiance was far from inviting. ut wait, it gets worse. The bathroom was a nightmare. Not only was it outdated, but it also had mold growing in the corners and a foul odor that permeated the entire room. I felt like I needed a hazmat suit just to step inside.")


[{'label': 'negative', 'score': 0.48469945788383484}]

In [None]:
# Long review mixing praise and complaints (saved run: 'average', score about 0.45)
classifier("My recent stay at the Lakeside Inn left me with mixed feelings. The location was picturesque, with stunning views of the lake and surrounding mountains. The tranquility of the setting provided a peaceful escape from the hustle and bustle of daily life. Additionally, the check-in process was smooth, and the staff at the front desk were polite and welcoming. However, despite these highlights, there were several aspects of my stay that fell short of expectations. The room I was assigned to was disappointing, to say the least. While it was clean, it lacked the comfort and modern amenities I had hoped for. The furniture appeared worn-out, and the bed was uncomfortably firm, making it difficult to get a good night's sleep. Furthermore, the bathroom was in need of renovation, with outdated fixtures and limited toiletries provided. The dining experience at the hotel's restaurant was another letdown. Although the food was decent, the service was slow, and it took ages for our orders to arrive. The menu options were limited, and the prices seemed a bit steep for the quality of the meals offered.")


[{'label': 'average', 'score': 0.453290730714798}]

In [None]:
def predict_rating_percentage_with_classifier(texts, labels, classifier_fn=None):
    """
      Score a classifier on labelled texts as an average "closeness" percentage.

      This is not exact-match accuracy. Every prediction earns
      1 - |predicted - actual| / 4 credit: an exact hit counts 1, a miss by one class
      counts 0.75, and so on down to 0 for the largest possible miss. A predicted label
      that is not one of the five known class names earns no credit.

      Args:
      - texts (list): List of input texts.
      - labels (list): List of corresponding labels (class ids 0-4).
      - classifier_fn (callable, optional): Called as classifier_fn(text) and expected
        to return a list whose first item is a dict with a 'label' key. Defaults to the
        notebook-level `classifier` pipeline.

      Returns:
      - average_accuracy (float): Average credit over all texts, as a percentage (0-100).

      Raises:
      - ValueError: If texts is empty or texts and labels differ in length.
    """
    if classifier_fn is None:
        # Looked up at call time, so the pipeline only has to exist once this runs
        classifier_fn = classifier

    labels = list(labels)
    total_texts = len(texts)
    if total_texts != len(labels):
        raise ValueError(
            "texts and labels must have the same length, got %d and %d"
            % (total_texts, len(labels))
        )
    if total_texts == 0:
        raise ValueError("at least one text is required to compute an average")

    rating_by_label = {
        "negative": 0,
        "below average": 1,
        "average": 2,
        "above average": 3,
        "positive": 4,
    }
    max_distance = 4  # Since ratings range from 0 to 4

    total_credit = 0.0
    for text, label in zip(texts, labels):
        predicted_label = classifier_fn(text)[0]['label']
        predicted_rating = rating_by_label.get(predicted_label)
        if predicted_rating is None:
            # Unknown label: no credit, instead of being treated as a rating of -1
            continue
        total_credit += 1 - abs(predicted_rating - label) / max_distance

    average_accuracy = (total_credit / total_texts) * 100
    return average_accuracy

In [None]:
# The saved run scored 88.5% on the held-out reviews provided for classifying purpose.
# This is not exact-match accuracy: predict_rating_percentage_with_classifier gives
# partial credit (1 - |predicted - actual| / 4) to predictions that miss the true rating.
test_accuracy = predict_rating_percentage_with_classifier(classifying_texts, classifying_labels)
print("Average Accuracy:", test_accuracy, "%")

Average Accuracy: 88.5 %
