In [43]:
import polars as pl
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
import torch.nn as nn

# Tokenizer for the same "bert-base-uncased" checkpoint that AspectDetectionModel
# loads as its encoder, so the ids produced here match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [44]:
# BIO tagging scheme for aspect terms; ids follow the order of the tuple.
label2id = {tag: idx for idx, tag in enumerate(("O", "B-ASP", "I-ASP"))}
id2label = dict(zip(label2id.values(), label2id.keys()))

# Sentiment classes attached to each aspect span, plus the reverse lookup.
sentiment2id = dict(negative=0, positive=1, neutral=2)
id2sentiment = dict(zip(sentiment2id.values(), sentiment2id.keys()))

In [47]:
# Preprocessed dataset; its "type" column is used further down to split the
# rows into train / val / test datasets.
df=pl.read_parquet("../data/processed/df_aspect_pos.parquet")

In [48]:
# Show which columns the preprocessed frame provides.
df.columns

['input_ids',
 'attention_mask',
 'labels',
 'aspects_index',
 'aspects_sentiment',
 'type']

In [49]:
def custom_collate_fn(batch):
    """Collate dataset samples into one batch dictionary.

    The per-token fields ("input_ids", "attention_mask", "labels") are stacked
    into tensors with a leading batch dimension. The per-aspect fields
    ("aspects_index", "aspects_sentiment") differ in length from sample to
    sample, so they are passed through as plain Python lists, one entry per
    sample.

    Args:
        batch: list of sample dicts as produced by the dataset's __getitem__.

    Returns:
        dict with the same five keys as the samples.
    """

    def _column(field):
        # All values of one field, in batch order.
        return [sample[field] for sample in batch]

    collated = {
        field: torch.stack(_column(field))
        for field in ("input_ids", "attention_mask", "labels")
    }
    # Ragged fields: List[List[[start, end]]] and List[List[int]].
    for field in ("aspects_index", "aspects_sentiment"):
        collated[field] = _column(field)
    return collated


In [50]:
def extract_aspect_spans(pred_labels):
    """Turn a sequence of BIO label ids into inclusive [start, end] spans.

    A span opens at every 1 (B-ASP) and is extended by the 2s (I-ASP) that
    follow it directly. A 2 that does not continue an open span is ignored,
    and any other value closes the current span.

    Args:
        pred_labels: sequence of label ids for one sentence.

    Returns:
        List of [start, end] index pairs (end inclusive), in order of appearance.
    """
    spans = []
    span_start = None  # index of the B-ASP that opened the current span, if any
    for position, tag in enumerate(pred_labels):
        if tag == 2:
            # I-ASP: extends an open span, otherwise it is an orphan and skipped.
            continue
        if span_start is not None:
            # Anything other than I-ASP ends the span on the previous token.
            spans.append([span_start, position - 1])
            span_start = None
        if tag == 1:
            span_start = position
    if span_start is not None:
        # The span ran up to the end of the sequence.
        spans.append([span_start, len(pred_labels) - 1])
    return spans

In [51]:
class CustomDataset(Dataset):
    """Dataset view over the preprocessed frame.

    The per-token columns are kept as numpy arrays and converted to long
    tensors on access; the per-aspect columns stay as Python lists because
    their length differs from row to row.
    """

    def __init__(self, df):
        # `df` must support df[column].to_numpy() / .to_list() for the five columns below.
        self.input_ids, self.attention_mask, self.labels = (
            df[column].to_numpy()
            for column in ("input_ids", "attention_mask", "labels")
        )
        self.aspects_index, self.aspects_sentiment = (
            df[column].to_list()
            for column in ("aspects_index", "aspects_sentiment")
        )

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        sample = {
            "input_ids": as_long(self.input_ids[idx]),
            "attention_mask": as_long(self.attention_mask[idx]),
            "labels": as_long(self.labels[idx]),
        }
        sample["aspects_index"] = self.aspects_index[idx]          # list of [start, end]
        sample["aspects_sentiment"] = self.aspects_sentiment[idx]  # list of sentiment values
        return sample


# One dataset / dataloader pair per split of the "type" column.
# Only the training loader shuffles; all of them share the custom collate function.
batch_size = 32

train_dataset = CustomDataset(df.filter(pl.col("type") == "train"))
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
)

val_dataset = CustomDataset(df.filter(pl.col("type") == "val"))
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
)

test_dataset = CustomDataset(df.filter(pl.col("type") == "test"))
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
)


In [52]:
# Sanity check on the first training batch: tensor shapes first, then the ragged aspect fields.
for batch in train_dataloader:
    for field in ("input_ids", "attention_mask", "labels"):
        print(batch[field].shape)
    for field in ("aspects_sentiment", "aspects_index"):
        print(batch[field])
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
[[1, 1], [2], [2], [0], [1, 1], [1], [0], [1], [1], [1], [0, 0], [1], [1], [0, 2], [1, 1], [0, 0, 0, 0, 0], [0], [0], [1, 1, 1], [1], [1], [0], [1, 0], [1, 1], [1], [1], [1], [0, 1, 1], [1], [1], [0], [1]]
[[[9, 9], [11, 11]], [[6, 6]], [[4, 4]], [[2, 2]], [[2, 2], [5, 5]], [[2, 3]], [[2, 3]], [[6, 6]], [[8, 12]], [[2, 2]], [[7, 7], [13, 13]], [[5, 9]], [[2, 3]], [[15, 15], [23, 23]], [[2, 2], [8, 8]], [[6, 6], [9, 9], [13, 13], [19, 20], [30, 30]], [[11, 11]], [[7, 7]], [[3, 3], [6, 6], [12, 12]], [[6, 7]], [[2, 2]], [[13, 20]], [[1, 1], [10, 10]], [[1, 1], [6, 6]], [[11, 12]], [[2, 6]], [[8, 8]], [[32, 32], [8, 10], [17, 17]], [[2, 7]], [[2, 2]], [[2, 2]], [[8, 8]]]


In [53]:
class SentimentClassifier(nn.Module):
    """Aspect-conditioned sentiment head.

    The aspect tokens are mean-pooled into one query vector, which attends
    over the whole token sequence; the attended vector is passed through
    dropout, LayerNorm and a linear layer to produce sentiment logits.

    Attribute names and construction order are unchanged from the original
    head, so existing state_dicts keep loading.

    Args:
        hidden_size: size H of the token embeddings.
        num_classes: number of sentiment classes (default 3: negative, positive, neutral).
        num_heads: attention heads; must divide ``hidden_size`` (default 1).
        dropout: dropout probability applied to the attended vector (default 0.3).
    """

    def __init__(self, hidden_size, num_classes=3, num_heads=1, dropout=0.3):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, token_embeddings, aspect_mask, attention_mask=None):
        """Classify the sentiment of one aspect per batch row.

        Args:
            token_embeddings: [B, L, H] token representations.
            aspect_mask: [B, L] mask, non-zero on the aspect's tokens. Any
                numeric or bool dtype is accepted.
            attention_mask: optional [B, L] mask, non-zero on real tokens and
                zero on padding. When given, padded positions are excluded from
                the attention; when omitted, the query attends over all L
                positions, which is the original behaviour. A row that is
                entirely zero leaves nothing to attend to and will presumably
                yield NaNs, so callers should not pass fully padded rows.

        Returns:
            (logits [B, num_classes], attn_weights [B, 1, L]).
        """
        # Cast once so float, long and bool masks all behave identically.
        span_weights = aspect_mask.unsqueeze(-1).to(token_embeddings.dtype)  # [B, L, 1]

        # Mean over aspect tokens; the epsilon keeps an empty mask from dividing by zero.
        aspect_pooled = (token_embeddings * span_weights).sum(dim=1) / (span_weights.sum(dim=1) + 1e-8)  # [B, H]

        key_padding_mask = None
        if attention_mask is not None:
            # nn.MultiheadAttention expects True at the positions to ignore.
            key_padding_mask = attention_mask == 0

        query = aspect_pooled.unsqueeze(1)  # [B, 1, H]
        attended_output, attn_weights = self.attention(
            query, token_embeddings, token_embeddings, key_padding_mask=key_padding_mask
        )  # [B, 1, H], [B, 1, L]
        attended_output = self.norm(self.dropout(attended_output))

        logits = self.classifier(attended_output.squeeze(1))  # [B, num_classes]
        return logits, attn_weights


class AspectDetectionModel(nn.Module):
    """BERT encoder with a token-level BIO tagging head and a sentiment head.

    ``forward`` only runs the encoder and the tagging head. The sentiment head
    (``self.sentiment_classifier``) is invoked separately by the caller, once
    per aspect span, on the ``sequence_output`` returned here.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.token_classifier = nn.Linear(hidden_size, len(label2id))
        self.sentiment_classifier = SentimentClassifier(hidden_size=hidden_size)

    def forward(self, input_ids, attention_mask):
        """Return (token_logits [B, L, num_labels], sequence_output [B, L, H])."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout is applied once; both heads consume the same dropped-out states.
        sequence_output = self.dropout(encoded.last_hidden_state)
        return self.token_classifier(sequence_output), sequence_output

# Model on the Apple-silicon "mps" backend, plus the shared loss, optimizer and LR schedule.
model = AspectDetectionModel()
model = model.to("mps")

# The same criterion serves both the tagging loss and the sentiment loss;
# targets equal to -100 are skipped.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
# Multiply the learning rate by 0.8 every 3 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.8)

In [54]:
def _sentiment_loss(model, criterion, sequence_output, aspects_index, aspects_sentiment, predicted_spans=None):
    """Mean sentiment loss over the gold aspects of one batch.

    All selected aspects are scored in a single call to
    ``model.sentiment_classifier`` instead of one call per aspect.

    Args:
        model: model exposing ``sentiment_classifier(token_embeddings, aspect_mask)``.
        criterion: loss taking (logits [N, C], targets [N]).
        sequence_output: [B, L, H] token representations for the batch.
        aspects_index: per-row list of inclusive [start, end] gold spans.
        aspects_sentiment: per-row list of sentiment ids, aligned with ``aspects_index``.
        predicted_spans: optional per-row list of predicted [start, end] spans.
            When given, only gold spans that were predicted exactly are scored.

    Returns:
        Scalar loss tensor; a constant 0.0 when no aspect qualifies.
    """
    device = sequence_output.device
    seq_len = sequence_output.size(1)

    rows, spans, targets = [], [], []
    for row, (row_spans, row_sentiments) in enumerate(zip(aspects_index, aspects_sentiment)):
        for span, sentiment in zip(row_spans, row_sentiments):
            if span[1] >= seq_len:
                continue  # span lies outside the encoded sequence
            if predicted_spans is not None and list(span) not in predicted_spans[row]:
                continue  # the tagger did not recover this aspect
            rows.append(row)
            spans.append(span)
            targets.append(sentiment)

    if not rows:
        return torch.tensor(0.0, device=device)

    # One mask row per aspect, filled on the CPU and moved to the device once.
    aspect_mask = torch.zeros(len(rows), seq_len, dtype=torch.float)
    for k, (start, end) in enumerate(spans):
        aspect_mask[k, start:end + 1] = 1
    aspect_mask = aspect_mask.to(device)

    row_index = torch.tensor(rows, dtype=torch.long, device=device)
    sentiment_logits, _ = model.sentiment_classifier(sequence_output[row_index], aspect_mask)
    sentiment_targets = torch.tensor(targets, dtype=torch.long, device=device)
    return criterion(sentiment_logits.view(-1, sentiment_logits.size(-1)), sentiment_targets)


num_epochs = 10

# Follow whatever device the model was placed on instead of repeating a device literal.
train_device = next(model.parameters()).device

for epoch in range(num_epochs):
    total_aspect_train_loss = 0
    total_sentiment_train_loss = 0
    total_aspect_val_loss = 0
    total_sentiment_val_loss = 0

    model.train()

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(train_device)
        attention_mask = batch["attention_mask"].to(train_device)
        labels = batch["labels"].to(train_device)

        # Clear gradients up front so leftovers from an interrupted run cannot leak into this step.
        optimizer.zero_grad()

        token_logits, sequence_output = model(input_ids, attention_mask)
        aspect_loss = criterion(token_logits.view(-1, len(label2id)), labels.view(-1))
        total_aspect_train_loss += aspect_loss.item()

        # Training scores every gold aspect, whether or not the tagger found it.
        sentiment_loss = _sentiment_loss(
            model, criterion, sequence_output,
            batch["aspects_index"], batch["aspects_sentiment"],
        )
        total_sentiment_train_loss += sentiment_loss.item()

        total_loss = aspect_loss + sentiment_loss
        total_loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(train_device)
            attention_mask = batch["attention_mask"].to(train_device)
            labels = batch["labels"].to(train_device)

            token_logits, sequence_output = model(input_ids, attention_mask)
            aspect_loss = criterion(token_logits.view(-1, len(label2id)), labels.view(-1))
            total_aspect_val_loss += aspect_loss.item()

            # Validation scores only the gold aspects whose span the tagger predicted exactly.
            preds = torch.argmax(token_logits, dim=2)
            predicted_spans = [extract_aspect_spans(row) for row in preds.cpu().tolist()]
            sentiment_loss = _sentiment_loss(
                model, criterion, sequence_output,
                batch["aspects_index"], batch["aspects_sentiment"],
                predicted_spans=predicted_spans,
            )
            total_sentiment_val_loss += sentiment_loss.item()

    scheduler.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], "
        f"Train Aspect Loss: {total_aspect_train_loss/len(train_dataloader):.4f}, "
        f"Train Sentiment Loss: {total_sentiment_train_loss/len(train_dataloader):.4f}, "
        f"Val Aspect Loss: {total_aspect_val_loss/len(val_dataloader):.4f}, "
        f"Val Sentiment Loss: {total_sentiment_val_loss/len(val_dataloader):.4f}")



Epoch [1/10], Train Aspect Loss: 0.3801, Train Sentiment Loss: 0.4980, Val Aspect Loss: 0.2209, Val Sentiment Loss: 0.4874
Epoch [2/10], Train Aspect Loss: 0.1951, Train Sentiment Loss: 0.3486, Val Aspect Loss: 0.1637, Val Sentiment Loss: 0.3560
Epoch [3/10], Train Aspect Loss: 0.1467, Train Sentiment Loss: 0.2787, Val Aspect Loss: 0.1377, Val Sentiment Loss: 0.4225
Epoch [4/10], Train Aspect Loss: 0.1237, Train Sentiment Loss: 0.2259, Val Aspect Loss: 0.1298, Val Sentiment Loss: 0.3918
Epoch [5/10], Train Aspect Loss: 0.1106, Train Sentiment Loss: 0.1848, Val Aspect Loss: 0.1241, Val Sentiment Loss: 0.4247
Epoch [6/10], Train Aspect Loss: 0.0997, Train Sentiment Loss: 0.1653, Val Aspect Loss: 0.1194, Val Sentiment Loss: 0.4578
Epoch [7/10], Train Aspect Loss: 0.0931, Train Sentiment Loss: 0.1333, Val Aspect Loss: 0.1189, Val Sentiment Loss: 0.4685
Epoch [8/10], Train Aspect Loss: 0.0875, Train Sentiment Loss: 0.1155, Val Aspect Loss: 0.1146, Val Sentiment Loss: 0.5045
Epoch [9/10], Tr

In [79]:
import json
import os

# Persist the trained weights and a small JSON description next to each other.
name = "AspectDetectionModel_Sentiment_Analysis_Attention"
model_dir = "../models/AspectDetectionModel"

os.makedirs(model_dir, exist_ok=True)

torch.save(model.state_dict(), f"{model_dir}/{name}.pth")

# Note: json stores the integer keys of id2label as strings.
model_config = dict(
    hidden_size=model.bert.config.hidden_size,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    name=name,
)

with open(f"{model_dir}/{name}_config.json", "w") as config_file:
    json.dump(model_config, config_file)

print("Model saved successfully to ../models/AspectDetectionModel/")

Model saved successfully to ../models/AspectDetectionModel/


In [12]:
# Rebuild the architecture, then restore the trained weights from disk.
name="AspectDetectionModel_Sentiment_Analysis_Attention"
model = AspectDetectionModel()
state_dict = torch.load(
    "../models/AspectDetectionModel/" + name + ".pth",
    # Load onto the CPU first so the checkpoint does not depend on the device
    # it was saved from; the model is moved to the target device afterwards.
    map_location="cpu",
    # The file holds only a state_dict, so refuse to unpickle arbitrary objects.
    weights_only=True,
)
model.load_state_dict(state_dict)
model = model.to("mps")
model.eval()

AspectDetectionModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [55]:
# Tokenize a whitespace-split sample review, padded/truncated to 128 tokens.
review_text = "Car quality is very nice but the controller sucks . The controller of this car do not works properly and the final in the controller do not rotate fully it only rotate like button"
temp = tokenizer(
    review_text.split(),
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=128,
)

print(temp)

{'input_ids': [101, 2482, 3737, 2003, 2200, 3835, 2021, 1996, 11486, 19237, 1012, 1996, 11486, 1997, 2023, 2482, 2079, 2025, 2573, 7919, 1998, 1996, 2345, 1999, 1996, 11486, 2079, 2025, 24357, 3929, 2009, 2069, 24357, 2066, 6462, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [63]:
# Tokenize a second, mostly positive sample review with the same settings.
review_text = "This remote control car is fun, fast, and easy to handle—perfect for kids! The build quality is sturdy and it runs smoothly on different surfaces. Battery life is decent and the controls are very responsive. A great gift for kids!"
temp = tokenizer(
    review_text.split(),
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=128,
)

print(temp)

{'input_ids': [101, 2023, 6556, 2491, 2482, 2003, 4569, 1010, 3435, 1010, 1998, 3733, 2000, 5047, 1517, 3819, 2005, 4268, 999, 1996, 3857, 3737, 2003, 23073, 1998, 2009, 3216, 15299, 2006, 2367, 9972, 1012, 6046, 2166, 2003, 11519, 1998, 1996, 7711, 2024, 2200, 26651, 1012, 1037, 2307, 5592, 2005, 4268, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [71]:
# Tokenize the restaurant sample sentence; this `temp` is what the inference cell below consumes.
review_text = "the high prices you ' re going to pay is for the view not for the food ."
temp = tokenizer(
    review_text.split(),
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=128,
)

print(temp)

{'input_ids': [101, 1996, 2152, 7597, 2017, 1005, 2128, 2183, 2000, 3477, 2003, 2005, 1996, 3193, 2025, 2005, 1996, 2833, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [72]:
# Run the tagger on the tokenized sample, then classify the sentiment of every predicted aspect span.
device="mps"
model.eval()
with torch.no_grad():
    # Add a batch dimension of 1 to the single encoded sentence.
    input_ids = torch.tensor(temp["input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = torch.tensor(temp["attention_mask"], dtype=torch.long).unsqueeze(0).to(device)

    logits, sequence_output = model(input_ids, attention_mask)
    # Most likely BIO label id for each token of the only sentence in the batch.
    preds = torch.argmax(logits, dim=2)[0]
    aspects = extract_aspect_spans(preds.cpu().tolist())
    sentiments=[]
    for aspect in aspects:
        # Float mask, as in training; zeros_like already places it on the device of input_ids,
        # so no hard-coded device name is needed here.
        aspect_mask = torch.zeros_like(input_ids, dtype=torch.float)
        aspect_mask[0, aspect[0]:aspect[1]+1] = 1
        sentiment_logits, _ = model.sentiment_classifier(
            sequence_output, aspect_mask
        )
        sentiments.append({"pos":aspect,"senti":torch.argmax(sentiment_logits, dim=1).item()})

preds

tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='mps:0')

In [73]:
# Raw per-token scores from the tagging head for the sample sentence (before argmax).
logits

tensor([[[ 4.4958, -1.5557, -3.3050],
         [ 5.4345, -3.3632, -3.1920],
         [ 5.1599, -2.9615, -3.2711],
         [ 0.8643,  2.0685, -3.0147],
         [ 5.2092, -3.1613, -2.9502],
         [ 5.2654, -3.5524, -2.3321],
         [ 5.3565, -3.4839, -2.6964],
         [ 5.1150, -2.9785, -2.4061],
         [ 5.0020, -3.0589, -2.7768],
         [ 4.6132, -2.4140, -3.0822],
         [ 5.2563, -3.0737, -3.2871],
         [ 4.7397, -2.7990, -3.0575],
         [ 4.3096, -2.6864, -2.9525],
         [-0.3268,  2.3367, -2.5131],
         [ 5.2072, -2.7667, -3.4396],
         [ 5.1848, -2.9837, -3.0111],
         [ 4.4994, -2.8495, -3.1791],
         [-0.4544,  3.5515, -3.1453],
         [ 5.8347, -2.7945, -3.2315],
         [ 5.8347, -2.8297, -3.2292],
         [ 2.1516, -0.9112, -1.5927],
         [ 2.6923, -1.1390, -1.8864],
         [ 1.6735, -0.6768, -1.4028],
         [ 3.0229, -1.3560, -2.0624],
         [ 2.8599, -1.2942, -1.9649],
         [ 2.1354, -1.0662, -1.8766],
         [ 2

In [74]:
# Per-token sentiment ids for the sample sentence; -1 marks tokens outside any aspect span.
arr = [-1] * len(temp["input_ids"])

for item in sentiments:
    # "pos" is an inclusive [start, end] span, so the slice must stop at end + 1.
    # (Using start + end as the stop index wrote to the wrong tokens and could
    # grow the list beyond the sequence length.)
    start, last = item["pos"]
    end = last + 1
    arr[start:end] = [item["senti"]] * (end - start)

In [75]:
# Inclusive [start, end] token spans that extract_aspect_spans found in the predictions.
aspects

[[3, 3], [13, 13], [17, 17]]

In [77]:
# Translate every predicted label id of the sample sentence into its BIO tag string.
predictions = list(map(id2label.__getitem__, preds.tolist()))
predictions

['O',
 'O',
 'O',
 'B-ASP',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ASP',
 'O',
 'O',
 'O',
 'B-ASP',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [78]:
def print_sentiment(bio_tag, sentiment):
    """Return the sentiment name to display for one token.

    Args:
        bio_tag: the token's BIO tag string.
        sentiment: the token's sentiment id, or a placeholder such as -1 when
            no sentiment was assigned.

    Returns:
        The name from ``id2sentiment`` for aspect tokens ("B-ASP"/"I-ASP"),
        otherwise "". An aspect-tagged token without a known sentiment id
        (for example an I-ASP that does not follow a B-ASP, which never gets a
        span) also yields "" instead of raising KeyError.
    """
    if bio_tag not in ("B-ASP", "I-ASP"):
        return ""
    return id2sentiment.get(sentiment, "")

# Build a per-token table (tag, token text, sentiment) for the sample sentence.
special_tokens = {'[CLS]', '[SEP]', '[PAD]'}
rows = []

for bio_tag, token_id, sentiment_id in zip(predictions, temp["input_ids"], arr):
    word = tokenizer.convert_ids_to_tokens(token_id)
    if word in special_tokens:
        continue  # structural tokens carry no content to display
    rows.append({
        "BIO Tag": bio_tag,
        "Word": word,
        "Sentiment": print_sentiment(bio_tag, sentiment_id)
    })

# Use a dedicated name: rebinding `df` here would replace the polars dataset that
# the dataset/dataloader cell filters, so that cell would fail when re-run.
sentiment_table = pd.DataFrame(rows)
sentiment_table

Unnamed: 0,BIO Tag,Word,Sentiment
0,O,the,
1,O,high,
2,B-ASP,prices,negative
3,O,you,
4,O,',
5,O,re,
6,O,going,
7,O,to,
8,O,pay,
9,O,is,


In [1]:
# df.to_html(index=False).replace("\n","")

In [None]:
def extract_aspects(input_ids, predictions, tokenizer):
    """Extract readable aspect terms from the first sequence of a batch.

    Args:
        input_ids: batch of token-id sequences; only ``input_ids[0]`` is used.
        predictions: batch of per-token labels; only ``predictions[0]`` is used.
            Labels may be integer ids (Python ints, numpy integers or tensor
            elements) or BIO tag strings. Unknown labels are treated as "O".
        tokenizer: object providing ``convert_ids_to_tokens`` and the
            ``cls_token`` / ``sep_token`` / ``pad_token`` attributes.

    Returns:
        List of ``(text, (start, end))`` tuples, where ``start`` and ``end``
        are inclusive token positions and ``text`` is the aspect with its
        "##" continuation pieces merged into the preceding piece.
    """
    known_tags = set(id2label.values())

    def to_tag(label):
        # Tensor elements hash by identity, so a direct id2label lookup would
        # silently miss them; normalise everything that is not a string to int.
        if isinstance(label, str):
            return label if label in known_tags else "O"
        try:
            return id2label.get(int(label), "O")
        except (TypeError, ValueError):
            return "O"

    def merge_wordpieces(pieces):
        # Glue "##" continuation pieces to the previous piece; separate the rest with spaces.
        text = ""
        for piece in pieces:
            if piece.startswith("##"):
                text += piece[2:]
            else:
                if text:
                    text += " "
                text += piece
        return text

    aspects = []
    current_pieces = []
    current_positions = []

    def flush():
        # Close the aspect being collected, if there is one.
        if current_pieces:
            aspects.append((merge_wordpieces(current_pieces), (current_positions[0], current_positions[-1])))
            current_pieces.clear()
            current_positions.clear()

    special_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    for idx, (token, label) in enumerate(zip(tokens, predictions[0])):
        if token in special_tokens:
            continue  # special tokens neither extend nor close an aspect

        label_tag = to_tag(label)
        if label_tag == "B-ASP":
            flush()
            current_pieces.append(token)
            current_positions.append(idx)
        elif label_tag == "I-ASP" and current_pieces:
            current_pieces.append(token)
            current_positions.append(idx)
        else:
            # "O", or an I-ASP with no aspect open.
            flush()

    flush()
    return aspects


# extract_aspects expects a batch of label sequences and reads element 0 of it.
# `predictions` is a flat list of tag strings for one sentence, so pass the
# predicted label ids wrapped in a one-element batch instead.
extracted_aspects = extract_aspects(input_ids, [preds.tolist()], tokenizer)
extracted_aspects

[('chrome', (2, 2)), ('book', (3, 3))]