In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the preprocessed dataset
df_reviews = pd.read_csv("../Data/processed/df_reviews.csv")

In [3]:
df_reviews.describe()

Unnamed: 0,sentiment,text,tokens
count,50000,50000,50000
unique,3,49987,49987
top,positive,made reservation last week party accommodated ...,"['made', 'reservation', 'last', 'week', 'party..."
freq,34941,2,2


In [4]:
# Data Preparation
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df_reviews['text'], df_reviews['sentiment'], test_size=0.2, random_state=42)

In [5]:
# Tokenization for RoBERTa
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings_roberta = tokenizer_roberta(X_train.tolist(), truncation=True, padding=True)
test_encodings_roberta = tokenizer_roberta(X_test.tolist(), truncation=True, padding=True)

In [6]:
# Tokenization for XLNet
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased')
train_encodings_xlnet = tokenizer_xlnet(X_train.tolist(), truncation=True, padding=True)
test_encodings_xlnet = tokenizer_xlnet(X_test.tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# Model Training

# Define the RoBERTa model
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Define the XLNet model
model_xlnet = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [9]:
# Trainer for RoBERTa
trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args,
    train_dataset=train_encodings_roberta,
    eval_dataset=test_encodings_roberta,
)

In [10]:
# Trainer for XLNet
trainer_xlnet = Trainer(
    model=model_xlnet,
    args=training_args,
    train_dataset=train_encodings_xlnet,
    eval_dataset=test_encodings_xlnet,
)

In [11]:
# Train the models
trainer_roberta.train()
trainer_xlnet.train()

  0%|          | 0/3 [00:00<?, ?it/s]

KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'