<a href="https://colab.research.google.com/github/go-hyun77/ABSA/blob/test-higher-epochs-and-results/ABSA_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Aspect-Based Sentiment Analysis (ABSA) with T5
# --------------------------------------------------
# This notebook shows how to fine-tune a T5 model for ABSA using HuggingFace.
# SemEval2014 dataset (aspect + sentiment annotations).

!pip install transformers datasets sentencepiece -q
!pip install datasets==3.6.0

import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [2]:
# Load Dataset

dataset = load_dataset("alexcadillon/SemEval2014Task4", "restaurants")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SemEval2014Task4.py: 0.00B [00:00, ?B/s]

restaurants/trial/0000.parquet:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

restaurants/train/0000.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

restaurants/test/0000.parquet:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Generating trial split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/3041 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/800 [00:00<?, ? examples/s]

In [3]:
# examine dataset
train_data = dataset["train"]

# print first 10 entries of train split
for i in range(10):
    print(f"{i+1}: {train_data[i]}")


1: {'sentenceId': '3121', 'text': 'But the staff was so horrible to us.', 'aspectTerms': [{'term': 'staff', 'polarity': 'negative', 'from': '8', 'to': '13'}], 'aspectCategories': [{'category': 'service', 'polarity': 'negative'}]}
2: {'sentenceId': '2777', 'text': "To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.", 'aspectTerms': [{'term': 'food', 'polarity': 'positive', 'from': '57', 'to': '61'}], 'aspectCategories': [{'category': 'food', 'polarity': 'positive'}, {'category': 'anecdotes/miscellaneous', 'polarity': 'negative'}]}
3: {'sentenceId': '1634', 'text': "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.", 'aspectTerms': [{'term': 'food', 'polarity': 'positive', 'from': '4', 'to': '8'}, {'term': 'kitchen', 'polarity': 'positive', 'from': '55', 'to': '62'}, {'term': 'menu', 'p

In [4]:
# flatten dataset
indexes = [train_data[i] for i in range(20)]  # first 20 entries


rows = []
for i in indexes:
    sentence_id = i["sentenceId"]
    text = i["text"]

    # If aspect terms exist, iterate through them
    if i["aspectTerms"]:
        for asp in i["aspectTerms"]:
            rows.append({
                "sentenceId": sentence_id,
                "text": text,
                "aspect_term": asp["term"],
                "term_polarity": asp["polarity"],
                "category": None,  # Add these to maintain consistent columns
                "category_polarity": None # Add these to maintain consistent columns
            })
    # If no explicit aspect terms, still record categories
    if i["aspectCategories"]:
        for cat in i["aspectCategories"]:
            rows.append({
                "sentenceId": sentence_id,
                "text": text,
                "aspect_term": None, # Add these to maintain consistent columns
                "term_polarity": None, # Add these to maintain consistent columns
                "category": cat["category"],
                "category_polarity": cat["polarity"]
            })


# Convert to DataFrame
df = pd.DataFrame(rows)
print(df.head(10))

  sentenceId                                               text aspect_term  \
0       3121               But the staff was so horrible to us.       staff   
1       3121               But the staff was so horrible to us.        None   
2       2777  To be completely fair, the only redeeming fact...        food   
3       2777  To be completely fair, the only redeeming fact...        None   
4       2777  To be completely fair, the only redeeming fact...        None   
5       1634  The food is uniformly exceptional, with a very...        food   
6       1634  The food is uniformly exceptional, with a very...     kitchen   
7       1634  The food is uniformly exceptional, with a very...        menu   
8       1634  The food is uniformly exceptional, with a very...        None   
9       2534  Where Gabriela personaly greets you and recomm...        None   

  term_polarity                 category category_polarity  
0      negative                     None              None  
1       

In [5]:
#define model

model_name = "t5-small" #try "google/flan-t5-base" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
#create aspect-sentiment pairs from dataset

def format_target(aspects):

  pairs = []

  for asp in aspects:
    pairs.append({"aspect": asp["term"], "sentiment": asp["polarity"]})
  return str(pairs)

In [7]:
#

def preprocess(ex):

  input_text = f"ABSA: {ex['text']}"
  target_text = format_target(ex["aspectTerms"])

  return {
    "input_ids": tokenizer(input_text, truncation=True, padding="max_length", max_length=128).input_ids,
    "labels": tokenizer(target_text, truncation=True, padding="max_length", max_length=128).input_ids
  }

In [8]:

train_dataset = dataset["train"].map(preprocess)
valid_dataset = dataset["test"].map(preprocess)


Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [9]:
#load model
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [10]:
#training setup

args = TrainingArguments(
  output_dir="./absa_t5",
  eval_strategy="epoch", # Corrected parameter name
  learning_rate=5e-5,
  per_device_train_batch_size=8,
  num_train_epochs=10,
  weight_decay=0.01,
  save_total_limit=2,
  logging_steps=50,
  push_to_hub=False,
)


trainer = Trainer(
  model=model,
  args=args,
  train_dataset=train_dataset,
  eval_dataset=valid_dataset,
)

In [11]:
#train model
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgohyun[0m ([33mgohyun-california-state-university-fullerton[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,0.1002,0.148791
2,0.0695,0.126138
3,0.0634,0.103427
4,0.0625,0.105405
5,0.0473,0.105
6,0.0481,0.08605
7,0.0433,0.095148
8,0.0441,0.082581
9,0.0431,0.083181
10,0.0408,0.084664




TrainOutput(global_step=3810, training_loss=0.08855206751291521, metrics={'train_runtime': 36464.7207, 'train_samples_per_second': 0.834, 'train_steps_per_second': 0.104, 'total_flos': 1028936045690880.0, 'train_loss': 0.08855206751291521, 'epoch': 10.0})

In [None]:
MEMD-ABSA and OATS (opinion-aspect-target-sentiment)