In [3]:
import polars as pl
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import pandas as pd
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [4]:
label2id = {"O": 0, "B-ASP": 1, "I-ASP": 2}
id2label = {v: k for k, v in label2id.items()}
sentiment2id = {
    "negative": 0,
    "positive": 1,
    "neutral": 2
}
id2sentiment = {v: k for k, v in sentiment2id.items()}

In [5]:
df=pl.read_parquet("../data/processed/df_aspect_pos.parquet")

In [6]:
df.columns

['input_ids',
 'attention_mask',
 'labels',
 'aspects_index',
 'aspects_sentiment',
 'type']

In [7]:
def custom_collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])

    aspects_index = [item["aspects_index"] for item in batch]          # List[List[List[int]]]
    aspects_sentiment = [item["aspects_sentiment"] for item in batch]  # List[List[int]]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
        "aspects_index": aspects_index,
        "aspects_sentiment": aspects_sentiment,
    }


In [8]:
def extract_aspect_spans(pred_labels):
    spans = []
    i = 0
    while i < len(pred_labels):
        if pred_labels[i] == 1:  # B-ASP
            start = i
            i += 1
            while i < len(pred_labels) and pred_labels[i] == 2:  # I-ASP
                i += 1
            end = i - 1
            spans.append([start, end])
        else:
            i += 1
    return spans

In [9]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.input_ids = df["input_ids"].to_numpy()
        self.attention_mask = df["attention_mask"].to_numpy()
        self.labels = df["labels"].to_numpy()
        self.aspects_index = df["aspects_index"].to_list()
        self.aspects_sentiment = df["aspects_sentiment"].to_list()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
            "aspects_index": self.aspects_index[idx],           # list of [start, end]
            "aspects_sentiment": self.aspects_sentiment[idx],   # list of sentiment values
        }


batch_size = 32
train_dataset = CustomDataset(df.filter(pl.col("type") != "val"))
val_dataset = CustomDataset(df.filter(pl.col("type") == "val"))
test_dataset = CustomDataset(df.filter(pl.col("type") == "test"))
val_dataloader = DataLoader(val_dataset, batch_size=batch_size , collate_fn=custom_collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)


In [10]:
for batch in train_dataloader:
    print(batch["input_ids"].shape)
    print(batch["attention_mask"].shape)
    print(batch["labels"].shape)
    print(batch["aspects_sentiment"])
    print(batch["aspects_index"])
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
[[1], [0], [1, 1], [1], [1], [0], [1], [1], [1], [1, 1], [1, 1], [0], [1], [0], [1], [1], [1], [1, 1], [1, 1], [0], [2], [0, 2], [1], [1], [0, 0], [1], [0], [1, 1], [1], [1], [1], [1]]
[[[2, 3]], [[8, 8]], [[3, 4], [9, 9]], [[1, 1]], [[2, 3]], [[2, 3]], [[3, 6]], [[1, 2]], [[1, 1]], [[2, 3], [8, 10]], [[3, 4], [9, 10]], [[16, 16]], [[1, 3]], [[1, 1]], [[2, 2]], [[14, 15]], [[7, 7]], [[2, 2], [7, 7]], [[11, 11], [13, 13]], [[13, 13]], [[2, 2]], [[9, 9], [2, 2]], [[5, 9]], [[9, 9]], [[22, 23], [13, 13]], [[3, 3]], [[2, 2]], [[3, 6], [10, 11]], [[2, 2]], [[2, 3]], [[2, 3]], [[1, 1]]]


In [11]:
class AspectDetectionModel(nn.Module):
    def __init__(self):
        super(AspectDetectionModel, self).__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(label2id))
        self.sentiment_classifier = nn.Linear(self.bert.config.hidden_size, 3)  # positive, negative, neutral

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)  # [B, L, H]
        logits = self.classifier(sequence_output)  # [B, L, num_labels]
        return logits, sequence_output

model = AspectDetectionModel().to("mps")
criterion = nn.CrossEntropyLoss(ignore_index=-100)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer = optimizer = torch.optim.AdamW([
    {'params': model.bert.parameters(), 'lr': 2e-5},
    {'params': model.classifier.parameters(), 'lr': 5e-5},
    {'params': model.sentiment_classifier.parameters(), 'lr': 5e-5}
])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.8)

In [None]:
num_epochs = 20

for epoch in range(num_epochs):
    total_aspect_train_loss = 0
    total_sentiment_train_loss = 0
    total_aspect_val_loss = 0
    total_sentiment_val_loss = 0

    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to("mps")
        attention_mask = batch["attention_mask"].to("mps")
        labels = batch["labels"].to("mps")

        # logits contains the aspect extraction head output and sequence_output contains the contextualized embeddings
        logits, sequence_output = model(input_ids, attention_mask)
        loss = criterion(logits.view(-1, len(label2id)), labels.view(-1))
        total_aspect_train_loss += loss.item()

        sentiment_losses = []
        for i in range(len(input_ids)):
            for aspect_index, sentiment in zip(batch["aspects_index"][i], batch["aspects_sentiment"][i]):
                if aspect_index[1] >= sequence_output.size(1):
                    continue
                # Assume the aspect span is a list of words ['chrome', '##book'], BIO tags are ['B-ASP', 'I-ASP'], and indices are [15, 16]
                # Each word in the aspect span have its own embedding in the sequence output
                # We take the mean of the embeddings for the aspect span
                # aspect_index = [15, 16] means we take the mean of sequence_output[i, 15:17]
                pooled = sequence_output[i, aspect_index[0]:aspect_index[1]+1].mean(dim=0)
                sentiment_logits = model.sentiment_classifier(pooled.unsqueeze(0))
                sentiment_target = torch.tensor([sentiment], dtype=torch.long).to("mps")
                sentiment_loss = criterion(sentiment_logits.view(-1, 3), sentiment_target)
                sentiment_losses.append(sentiment_loss)


        if sentiment_losses:
            sentiment_loss = torch.stack(sentiment_losses).mean()
        else:
            sentiment_loss = torch.tensor(0.0).to("mps")

        total_sentiment_train_loss += sentiment_loss.item()
        total_loss = aspect_loss + sentiment_loss

        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    with torch.no_grad():
        model.eval()

        for batch in val_dataloader:
            input_ids = batch["input_ids"].to("mps")
            attention_mask = batch["attention_mask"].to("mps")
            labels = batch["labels"].to("mps")

            logits, sequence_output = model(input_ids, attention_mask)
            aspect_loss = criterion(logits.view(-1, len(label2id)), labels.view(-1))
            total_aspect_val_loss += aspect_loss.item()

            sentiment_losses = []
            preds = torch.argmax(logits, dim=2)
            for i in range(len(input_ids)):
                # During training, I used the existing aspect spans from the training data. But during evaluation, 
                # I will extract the aspect spans from the predicted labels
                # This is because the aspect spans are not available in the validation data
                # I will use the predicted labels to extract the aspect spans
                # The predicted labels are in the form of BIO tags 
                aspects = extract_aspect_spans(preds[i].cpu().tolist())
                for aspect_index, sentiment in zip(batch["aspects_index"][i], batch["aspects_sentiment"][i]):
                    if aspect_index in aspects and aspect_index[1] < sequence_output.size(1):
                        pooled = sequence_output[i, aspect_index[0]:aspect_index[1]+1].mean(dim=0)
                        sentiment_logits = model.sentiment_classifier(pooled.unsqueeze(0))
                        sentiment_target = torch.tensor([sentiment], dtype=torch.long).to("mps")
                        sentiment_loss = criterion(sentiment_logits.view(-1, 3), sentiment_target)
                        sentiment_losses.append(sentiment_loss)

            if sentiment_losses:
                sentiment_loss = torch.stack(sentiment_losses).mean()
            else:
                sentiment_loss = torch.tensor(0.0).to("mps")

            total_sentiment_val_loss += sentiment_loss.item()

    scheduler.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], "
        f"Train Aspect Loss: {total_aspect_train_loss/len(train_dataloader):.4f}, "
        f"Train Sentiment Loss: {total_sentiment_train_loss/len(train_dataloader):.4f}, "
        f"Val Aspect Loss: {total_aspect_val_loss/len(val_dataloader):.4f}, "
        f"Val Sentiment Loss: {total_sentiment_val_loss/len(val_dataloader):.4f}")



Epoch [1/20], Train Aspect Loss: 1.2302, Train Sentiment Loss: 0.2913, Val Aspect Loss: 1.2678, Val Sentiment Loss: 0.3287
Epoch [2/20], Train Aspect Loss: 1.2532, Train Sentiment Loss: 0.2033, Val Aspect Loss: 1.2364, Val Sentiment Loss: 0.2798
Epoch [3/20], Train Aspect Loss: 1.2745, Train Sentiment Loss: 0.1448, Val Aspect Loss: 1.2950, Val Sentiment Loss: 0.3547
Epoch [4/20], Train Aspect Loss: 1.2728, Train Sentiment Loss: 0.1073, Val Aspect Loss: 1.2714, Val Sentiment Loss: 0.3511
Epoch [5/20], Train Aspect Loss: 1.2672, Train Sentiment Loss: 0.0797, Val Aspect Loss: 1.2999, Val Sentiment Loss: 0.3814
Epoch [6/20], Train Aspect Loss: 1.2770, Train Sentiment Loss: 0.0562, Val Aspect Loss: 1.2683, Val Sentiment Loss: 0.3613
Epoch [7/20], Train Aspect Loss: 1.2575, Train Sentiment Loss: 0.0469, Val Aspect Loss: 1.2739, Val Sentiment Loss: 0.3880
Epoch [8/20], Train Aspect Loss: 1.2410, Train Sentiment Loss: 0.0360, Val Aspect Loss: 1.2574, Val Sentiment Loss: 0.4583
Epoch [9/20], Tr

In [None]:
import os

name="AspectDetectionModel_Sentiment_Analysis"

os.makedirs("../models/AspectDetectionModel", exist_ok=True)

torch.save(model.state_dict(), "../models/AspectDetectionModel/" + name + ".pth")

model_config = {
	"hidden_size": model.bert.config.hidden_size,
	"num_labels": len(label2id),
	"id2label": id2label,
	"label2id": label2id,
	"name": name
}

import json
with open(f"../models/AspectDetectionModel/{name}_config.json", "w") as f:
	json.dump(model_config, f)

print("Model saved successfully to ../models/AspectDetectionModel/")

Model saved successfully to ../models/AspectDetectionModel/


In [12]:
name="AspectDetectionModel_Sentiment_Analysis"
model = AspectDetectionModel()
model.load_state_dict(torch.load("../models/AspectDetectionModel/" + name + ".pth"))
model = model.to("mps")
model.eval()

AspectDetectionModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [19]:
temp = tokenizer(("""Car quality is very nice but the controller sucks . The controller of this car do not works properly and the final in the controller do not rotate fully it only rotate like button""").split(), is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=128)

print(temp)

{'input_ids': [101, 2482, 3737, 2003, 2200, 3835, 2021, 1996, 11486, 19237, 1012, 1996, 11486, 1997, 2023, 2482, 2079, 2025, 2573, 7919, 1998, 1996, 2345, 1999, 1996, 11486, 2079, 2025, 24357, 3929, 2009, 2069, 24357, 2066, 6462, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [20]:
device="mps"
model.eval()
with torch.no_grad():

    
    input_ids = torch.tensor(temp["input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = torch.tensor(temp["attention_mask"], dtype=torch.long).unsqueeze(0).to(device)

    logits, sequence_output = model(input_ids, attention_mask)
    # decode the logits to get the predicted labels
    preds = torch.argmax(logits, dim=2)[0]
    aspects = extract_aspect_spans(preds.cpu().tolist())
    sentiments=[]
    for aspect in aspects:
        aspect_mask = torch.zeros_like(input_ids, dtype=torch.long).to("mps")
        aspect_mask[0, aspect[0]:aspect[1]+1] = 1
        # print("am", aspect_mask[0].unsqueeze(0))
        sentiment_logits, _ = model.sentiment_classifier(
            sequence_output, aspect_mask
        )
        sentiments.append({"pos":aspect,"senti":torch.argmax(sentiment_logits, dim=1).item()})

preds

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='mps:0')

In [21]:
logits

tensor([[[ 2.1199, -0.6186, -1.3607],
         [ 1.5348, -0.3757, -1.1551],
         [ 2.3946, -0.6324, -1.4118],
         [ 2.7261, -0.6541, -1.6831],
         [ 2.8969, -0.6056, -1.7033],
         [ 2.7322, -0.6657, -1.6498],
         [ 2.6590, -1.3430, -1.3958],
         [ 2.5372, -1.1359, -1.3866],
         [ 1.7412, -0.2118, -1.1744],
         [ 2.2420, -0.9260, -1.4533],
         [ 2.7131, -1.2151, -1.2666],
         [ 2.2218, -1.1370, -1.3244],
         [ 1.8272, -0.4533, -1.2689],
         [ 2.2467, -1.0234, -1.3112],
         [ 2.5115, -1.2448, -1.6580],
         [ 1.6553, -0.4988, -1.3287],
         [ 2.5372, -1.2216, -1.4076],
         [ 2.4190, -1.2300, -1.4726],
         [ 2.1285, -0.9137, -1.4293],
         [ 2.5112, -1.1887, -1.5611],
         [ 2.8577, -1.1763, -1.5856],
         [ 2.1029, -1.1725, -1.2467],
         [ 1.4849, -0.7978, -0.8340],
         [ 2.1959, -0.7358, -0.8819],
         [ 2.1930, -1.0523, -1.2716],
         [ 2.0271, -0.3724, -1.2488],
         [ 2

In [22]:
arr=[-1] * 128
arr

for item in sentiments:
    pos=item["pos"]
    start = pos[0]
    end = pos[0]+pos[1]
    arr[start:end] = [item["senti"]] * (end-start)

In [23]:
aspects

[]

In [26]:
predictions = [id2label[label] for label in preds.tolist()]
predictions

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [28]:
def print_sentiment(bio_tag, sentiment):
    if(bio_tag in ["B-ASP", "I-ASP"]):
        return id2sentiment[sentiment]
    else:
        return ""

rows = []

for item in zip(predictions, temp["input_ids"], arr):
    word = tokenizer.convert_ids_to_tokens(item[1])
    if word in ['[CLS]', '[SEP]', '[PAD]']:
        continue
    sentiment = print_sentiment(item[0], item[2])
    rows.append({
        "BIO Tag": item[0],
        "Word": word,
        "Sentiment": sentiment
    })

df = pd.DataFrame(rows)
df

Unnamed: 0,BIO Tag,Word,Sentiment
0,O,car,
1,O,quality,
2,O,is,
3,O,very,
4,O,nice,
5,O,but,
6,O,the,
7,O,controller,
8,O,sucks,
9,O,.,


In [None]:
def extract_aspects(input_ids, predictions, tokenizer):
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    
    aspects = []
    current_aspect = []
    current_position = []
    current_idx = 0

    for idx, (token, label) in enumerate(zip(tokens, predictions[0])):
        if token in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]:
            continue
            
        label_tag = id2label.get(label, "O")
        
        if label_tag == "B-ASP":
            if current_aspect:
                aspects.append((" ".join(current_aspect), (current_position[0], current_position[-1])))
            current_aspect = [token]
            current_position = [idx]
        elif label_tag == "I-ASP" and current_aspect:
            current_aspect.append(token)
            current_position.append(idx)
        else:
            if current_aspect:
                aspects.append((" ".join(current_aspect), (current_position[0], current_position[-1])))
                current_aspect = []
                current_position = []

    if current_aspect:
        aspects.append((" ".join(current_aspect), (current_position[0], current_position[-1])))

    clean_aspects = []
    for aspect, pos in aspects:
        cleaned = ""
        for word in aspect.split():
            if word.startswith("##"):
                cleaned += word[2:]  # Remove ## prefix
            else:
                if cleaned:
                    cleaned += " "
                cleaned += word
        clean_aspects.append((cleaned, pos))

    return clean_aspects


extracted_aspects = extract_aspects(input_ids, predictions, tokenizer)
extracted_aspects

[('chrome', (2, 2)), ('book', (3, 3))]