In [31]:
import pandas as pd
from datasets import Dataset

def load_data(file):
    polarity=[]
    Aspect_Category=[]
    Target_term=[]
    Character_offset=[]
    Sentence=[]
    polarity_to_label={
        "positive":0,
        "negative":1,
        "neutral":2,
    }
    labels=[]
    with open(file) as f:
        for line in f:
            line=line.strip()

            # split by space and remove the \t 
            tokens=line.split("\t") 
            polarity.append(tokens[0])
            Aspect_Category.append(tokens[1])
            Target_term.append(tokens[2])
            Character_offset.append(tokens[3])
            assert len(tokens[4:])==1,"sentence should be one token,got "+str(len(tokens[4:]))
            Sentence.append(str(tokens[4:][0]))
            labels.append(polarity_to_label[tokens[0]])
    ds_train=pd.DataFrame({"polarity":polarity,
                        "Aspect_Category":Aspect_Category,
                        "Target_term":Target_term,
                        "Character_offset":Character_offset,

                        "labels":labels,
                        "Sentence":Sentence})

    ds_train = Dataset.from_pandas(ds_train)
    return ds_train 

In [2]:
ds_train = load_data("data/traindata.csv")

In [32]:
ds_val = load_data("data/devdata.csv")

In [3]:
ds_train

Dataset({
    features: ['polarity', 'Aspect_Category', 'Target_term', 'Character_offset', 'labels', 'Sentence'],
    num_rows: 1503
})

In [4]:
ds_train[0]

{'polarity': 'positive',
 'Aspect_Category': 'AMBIENCE#GENERAL',
 'Target_term': 'seating',
 'Character_offset': '18:25',
 'labels': 0,
 'Sentence': "short and sweet – seating is great:it's romantic,cozy and private."}

In [5]:
model_name = "FacebookAI/roberta-large"
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch

tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_model = AutoModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
pretrained_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  

In [7]:
X_train_encoded = tokenizer(ds_train["Sentence"],
                            truncation=True,
                            padding=False,
                            add_special_tokens=True,
                            return_tensors=None,
                            return_offsets_mapping=False,
                        )

In [8]:
X_train_encoded.keys()

dict_keys(['input_ids', 'attention_mask'])

In [9]:
tokenizer.decode(X_train_encoded['input_ids'][0])

"<s>short and sweet – seating is great:it's romantic,cozy and private.</s>"

In [10]:
ds_train[0]

{'polarity': 'positive',
 'Aspect_Category': 'AMBIENCE#GENERAL',
 'Target_term': 'seating',
 'Character_offset': '18:25',
 'labels': 0,
 'Sentence': "short and sweet – seating is great:it's romantic,cozy and private."}

In [11]:
sentence = "short and sweet – seating is great:it's romantic,cozy and private."
target_term = "seating"
target_span = (18, 25)

encoding = tokenizer(
    sentence,
    return_offsets_mapping=True,
    return_tensors='pt',
    truncation=True
)

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
offsets = encoding['offset_mapping'][0]
print(offsets)

tensor([[ 0,  0],
        [ 0,  5],
        [ 6,  9],
        [10, 15],
        [16, 17],
        [18, 25],
        [26, 28],
        [29, 34],
        [34, 35],
        [35, 37],
        [37, 39],
        [40, 48],
        [48, 49],
        [49, 51],
        [51, 53],
        [54, 57],
        [58, 65],
        [65, 66],
        [ 0,  0]])


In [12]:
target_token_indices = [
    i for i, (start, end) in enumerate(offsets)
    if start >= target_span[0] and end <= target_span[1]
]

In [13]:
from transformers import RobertaModel
model = RobertaModel.from_pretrained("roberta-large")
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state  # shape: (1, seq_len, hidden_size)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
last_hidden_state.shape

torch.Size([1, 19, 1024])

In [15]:
target_hidden_states = last_hidden_state[0, target_token_indices, :]  # shape: (num_tokens, hidden_size)
pooled = target_hidden_states.mean(dim=0, keepdim=True)  # shape: (hidden_size,)

In [16]:
target_hidden_states.shape

torch.Size([1, 1024])

In [17]:
pooled.shape

torch.Size([1, 1024])

In [26]:
import torch.nn as nn
class AspectSentimentClassifier(nn.Module):
    def __init__(self, hidden_size=1024, num_labels=1):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-large")
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, num_labels),
        )
        for param in self.roberta.parameters():
            param.requires_grad = False

        # Then unfreeze the pooler
        for param in self.roberta.pooler.parameters():
            param.requires_grad = True

        
    def forward(self, input_ids, attention_mask, target_token_indices_batch):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # (batch, seq_len, hidden)
        
        pooled = []
        for i, indices in enumerate(target_token_indices_batch):
            token_embeddings = last_hidden_state[i, indices, :]  # (num_tokens, hidden)
            pooled.append(token_embeddings.mean(dim=0))
        
        pooled = torch.stack(pooled)  # (batch_size, hidden_size)
        logits = self.classifier(pooled)
        return logits

In [19]:
def preprocess(example):
    sentence = example['Sentence']
    target = example['Target_term']
    char_offset = tuple(map(int, example['Character_offset'].split(':')))
    
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        truncation=True,
        padding=False
    )

    offsets = encoding['offset_mapping']
    target_indices = [i for i, (start, end) in enumerate(offsets) if start >= char_offset[0] and end <= char_offset[1]]

    # Add tokenized input + token indices
    example['input_ids'] = encoding['input_ids']
    example['attention_mask'] = encoding['attention_mask']
    example['target_token_indices'] = target_indices
    return example

ds_train = ds_train.map(preprocess)

Map:   0%|          | 0/1503 [00:00<?, ? examples/s]

Map: 100%|██████████| 1503/1503 [00:00<00:00, 5455.50 examples/s]


In [20]:
ds_train[0]

{'polarity': 'positive',
 'Aspect_Category': 'AMBIENCE#GENERAL',
 'Target_term': 'seating',
 'Character_offset': '18:25',
 'labels': 0,
 'Sentence': "short and sweet – seating is great:it's romantic,cozy and private.",
 'input_ids': [0,
  20263,
  8,
  4045,
  126,
  14591,
  16,
  372,
  35,
  405,
  18,
  8728,
  6,
  876,
  5144,
  8,
  940,
  4,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'target_token_indices': [5]}

In [21]:
from transformers import DataCollatorWithPadding

class CustomCollator:
    def __init__(self, tokenizer):
        self.token_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

    def __call__(self, features):
        # Remove 'target_token_indices' temporarily
        token_features = [{k: v for k, v in f.items() if k != 'target_token_indices'} for f in features]
        batch = self.token_collator(token_features)

        # Re-pad the target_token_indices manually
        max_len = max(len(f["target_token_indices"]) for f in features)
        padded_indices = [
            f["target_token_indices"].tolist() + [0] * (max_len - len(f["target_token_indices"]))
            for f in features
        ]
        batch["target_token_indices"] = torch.tensor(padded_indices)
        return batch

collator = CustomCollator(tokenizer)

In [22]:
from torch.utils.data import Dataset

class AspectDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"]),
            "target_token_indices": torch.tensor(item["target_token_indices"]),
            "labels": torch.tensor(item["labels"]),
        }

In [34]:
ds_train

Dataset({
    features: ['polarity', 'Aspect_Category', 'Target_term', 'Character_offset', 'labels', 'Sentence', 'input_ids', 'attention_mask', 'target_token_indices'],
    num_rows: 1503
})

In [36]:
ds_val = ds_val.map(preprocess)
ds_val  =AspectDataset(ds_val)

Map:   0%|          | 0/376 [00:00<?, ? examples/s]

Map: 100%|██████████| 376/376 [00:00<00:00, 5204.02 examples/s]


In [None]:
from torch.utils.data import DataLoader
from torch import nn, optim
from tqdm import tqdm
import torch

model = AspectSentimentClassifier().cuda()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

train_loader = DataLoader(AspectDataset(ds_train), batch_size=8, shuffle=True, collate_fn=collator)
val_loader = DataLoader(AspectDataset(ds_val), batch_size=8, shuffle=False, collate_fn=collator)

for epoch in range(20):
    # === TRAINING ===
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        target_token_indices = batch['target_token_indices'].cuda()
        labels = batch['labels'].cuda().unsqueeze(1).float()

        logits = model(input_ids, attention_mask, target_token_indices)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_train_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}")

    # === VALIDATION ===
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Validation]"):
            input_ids = batch['input_ids'].cuda()
            attention_mask = batch['attention_mask'].cuda()
            target_token_indices = batch['target_token_indices'].cuda()
            labels = batch['labels'].cuda().unsqueeze(1).float()

            logits = model(input_ids, attention_mask, target_token_indices)
            loss = criterion(logits, labels)
            val_loss += loss.item()

            preds = torch.sigmoid(logits) > 0.5
            correct += (preds == labels.bool()).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total
    print(f"Epoch {epoch+1} - Val Loss: {avg_val_loss:.4f} - Val Acc: {val_acc:.4f}")



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.57it/s]


Epoch 1 - Train Loss: 0.6239


  "input_ids": torch.tensor(item["input_ids"]),
  "attention_mask": torch.tensor(item["attention_mask"]),
  "target_token_indices": torch.tensor(item["target_token_indices"]),
  "labels": torch.tensor(item["labels"]),
Epoch 1 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.96it/s]


Epoch 1 - Val Loss: 0.5972 - Val Acc: 0.7021


Epoch 2 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.43it/s]


Epoch 2 - Train Loss: 0.5492


Epoch 2 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.89it/s]


Epoch 2 - Val Loss: 0.5443 - Val Acc: 0.7633


Epoch 3 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.56it/s]


Epoch 3 - Train Loss: 0.4971


Epoch 3 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.96it/s]


Epoch 3 - Val Loss: 0.4710 - Val Acc: 0.7979


Epoch 4 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.63it/s]


Epoch 4 - Train Loss: 0.4676


Epoch 4 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.95it/s]


Epoch 4 - Val Loss: 0.5108 - Val Acc: 0.7580


Epoch 5 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.64it/s]


Epoch 5 - Train Loss: 0.4422


Epoch 5 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.96it/s]


Epoch 5 - Val Loss: 0.4829 - Val Acc: 0.7713


Epoch 6 [Train]: 100%|██████████| 188/188 [00:19<00:00,  9.52it/s]


Epoch 6 - Train Loss: 0.3887


Epoch 6 [Validation]: 100%|██████████| 47/47 [00:04<00:00,  9.96it/s]


Epoch 6 - Val Loss: 0.4661 - Val Acc: 0.7527


Epoch 7 [Train]:  16%|█▌        | 30/188 [00:02<00:15, 10.51it/s]