In [None]:
import polars as pl
import json
from transformers import AutoTokenizer

# Shared tokenizer for the alignment cells below; they call it with
# return_offsets_mapping=True to map character spans onto tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [96]:
# Load the raw ACOS JSONL files and pool them per split across all four
# datasets. Each element of the resulting lists is one raw (unparsed) line.
ACOS_DIR = "../data/raw/absa_datasets/acos_datasets"

# dataset directory -> (train file, test file, dev file)
ACOS_SPLIT_FILES = {
    "501.Laptop14": (
        "laptop_quad_train.tsv.jsonl",
        "laptop_quad_test.tsv.jsonl",
        "laptop_quad_dev.tsv.jsonl",
    ),
    "502.Restaurant14": ("train.jsonl", "test.jsonl", "dev.jsonl"),
    "503.Restaurant15": ("train.jsonl", "test.jsonl", "dev.jsonl"),
    "504.Restaurant16": (
        "rest16_quad_train.tsv.jsonl",
        "rest16_quad_test.tsv.jsonl",
        "rest16_quad_dev.tsv.jsonl",
    ),
}


def _read_lines(path):
    """Return all lines of the text file at `path`, decoded as UTF-8."""
    # Explicit encoding: the default used by open() depends on the platform
    # locale, which would make the decoded review text machine-dependent.
    with open(path, "r", encoding="utf-8") as f:
        return f.readlines()


train_lines = []
test_lines = []
val_lines = []

# Dicts preserve insertion order, so the datasets are concatenated in the
# order listed above (Laptop14, Restaurant14, Restaurant15, Restaurant16).
for dataset_dir, (train_file, test_file, dev_file) in ACOS_SPLIT_FILES.items():
    train_lines += _read_lines(f"{ACOS_DIR}/{dataset_dir}/{train_file}")
    test_lines += _read_lines(f"{ACOS_DIR}/{dataset_dir}/{test_file}")
    val_lines += _read_lines(f"{ACOS_DIR}/{dataset_dir}/{dev_file}")

In [97]:
def clean_data(lines):
    """Parse JSONL records and keep only those whose labels are fully specified.

    Args:
        lines: Iterable of JSON-encoded records (str/bytes, one per element)
            or of already-parsed dicts. Accepting dicts makes the function
            idempotent, so the cell that rebinds the split variables to the
            cleaned result can be re-run safely. Each record must have a
            "labels" list whose entries carry "aspect" and "polarity" keys.

    Returns:
        list[dict]: In input order, the records that have at least one label
        and in which no label has "NULL" as its aspect or polarity.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "labels", or a label lacks "aspect" or
            "polarity".
    """
    cleaned_lines = []
    for line in lines:
        if isinstance(line, (str, bytes, bytearray)):
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            line_obj = json.loads(line)
        else:
            line_obj = line

        labels = line_obj["labels"]
        if not labels:
            # Records without any label carry no supervision signal.
            continue
        if any(
            label["aspect"] == "NULL" or label["polarity"] == "NULL"
            for label in labels
        ):
            continue
        cleaned_lines.append(line_obj)
    return cleaned_lines

In [98]:
# Parse and filter every split; the three names are rebound from raw JSONL
# lines to lists of record dicts.
train_lines, test_lines, val_lines = [
    clean_data(split) for split in (train_lines, test_lines, val_lines)
]

In [99]:
# Inspect one cleaned training record to check the parsed structure.
train_lines[5]

{'text': "now i ' m really bummed that i have a very nice looking chromebook with a beautiful screen that is totally unusable .",
 'labels': [{'aspect': 'chromebook',
   'opinion': 'nice',
   'polarity': 'positive',
   'category': 'LAPTOP#DESIGN_FEATURES'},
  {'aspect': 'chromebook',
   'opinion': 'bummed',
   'polarity': 'negative',
   'category': 'LAPTOP#OPERATION_PERFORMANCE'},
  {'aspect': 'chromebook',
   'opinion': 'unusable',
   'polarity': 'negative',
   'category': 'LAPTOP#OPERATION_PERFORMANCE'},
  {'aspect': 'screen',
   'opinion': 'beautiful',
   'polarity': 'positive',
   'category': 'DISPLAY#OPERATION_PERFORMANCE'}]}

In [100]:
# Number of records kept per split (train, test, val) after cleaning.
len(train_lines), len(test_lines), len(val_lines)

(5050, 1827, 828)

In [2]:
# BIO tag vocabulary for aspect-term tagging: outside, begin-aspect, inside-aspect.
label2id = dict(zip(("O", "B-ASP", "I-ASP"), range(3)))
# Reverse lookup from tag id back to tag name.
id2label = {tag_id: tag for tag, tag_id in label2id.items()}
# Class ids for the polarity strings found in the dataset labels.
sentiment2id = dict(negative=0, positive=1, neutral=2)

In [None]:
# Hand-written sample record (same content as train_lines[5]) used to
# prototype the token alignment step by step.
data = {
    'text': "now i ' m really bummed that i have a very nice looking chromebook with a beautiful screen that is totally unusable .",
    'labels': [
        dict(aspect=aspect, opinion=opinion, polarity=polarity, category=category)
        for aspect, opinion, polarity, category in [
            ('chromebook', 'nice', 'positive', 'LAPTOP#DESIGN_FEATURES'),
            ('chromebook', 'bummed', 'negative', 'LAPTOP#OPERATION_PERFORMANCE'),
            ('chromebook', 'unusable', 'negative', 'LAPTOP#OPERATION_PERFORMANCE'),
            ('screen', 'beautiful', 'positive', 'DISPLAY#OPERATION_PERFORMANCE'),
        ]
    ],
}

# When False, repeated labels for the same aspect term are collapsed to one.
include_opinions = False

# Fixed-length encoding; offsets are needed to map character spans to tokens.
encoding = tokenizer(
    data['text'],
    max_length=128,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
    return_offsets_mapping=True,
)

offset_mapping, input_ids, attention_mask = (
    encoding[field] for field in ('offset_mapping', 'input_ids', 'attention_mask')
)

# Every position starts as "O"; aspect positions are overwritten later.
token_labels = [label2id['O'] for _ in input_ids]

# Parallel lists: token index ranges of aspects and their polarity strings.
aspect_spans, polarities = [], []

tokens = tokenizer.convert_ids_to_tokens(input_ids)

def find_all_spans(text, word):
    """Locate every case-insensitive, non-overlapping occurrence of `word`.

    Args:
        text: String to search in.
        word: Substring to look for. Matching is plain substring search, so
            occurrences inside longer words are reported too.

    Returns:
        list[tuple[int, int]]: (start, end) character offsets, end exclusive,
        in left-to-right order. Empty when `word` is empty or never occurs.
    """
    if not word:
        # str.find("") succeeds at every position without advancing, so the
        # search loop below would never terminate for an empty needle.
        return []

    # Lower-case once instead of on every loop iteration.
    haystack = text.lower()
    needle = word.lower()

    spans = []
    start = haystack.find(needle)
    while start != -1:
        end = start + len(word)
        spans.append((start, end))
        # Resume after the current match so matches never overlap.
        start = haystack.find(needle, end)
    return spans

# Dedup keys (aspect, or aspect/opinion pair) that were already handled.
seen_aspects = set()

for label_entry in data['labels']:
    aspect_word, opinion_word, polarity = (
        label_entry[field] for field in ('aspect', 'opinion', 'polarity')
    )

    key = (
        (aspect_word.lower(), opinion_word.lower())
        if include_opinions
        else aspect_word.lower()
    )
    if key in seen_aspects:
        continue
    seen_aspects.add(key)

    for asp_start, asp_end in find_all_spans(data['text'], aspect_word):
        # Tokens with a non-empty offset range lying fully inside the
        # character span of this aspect occurrence.
        covered = [
            idx
            for idx, (tok_start, tok_end) in enumerate(offset_mapping)
            if tok_start != tok_end and asp_start <= tok_start and tok_end <= asp_end
        ]
        if not covered:
            continue

        start_token_idx, end_token_idx = covered[0], covered[-1]
        aspect_spans.append([start_token_idx, end_token_idx])
        polarities.append(polarity)

        # BIO tagging: first token is B-ASP, the rest of the span is I-ASP.
        token_labels[start_token_idx] = label2id['B-ASP']
        token_labels[start_token_idx + 1:end_token_idx + 1] = (
            [label2id['I-ASP']] * (end_token_idx - start_token_idx)
        )

# Position 0 and every padded position get -100 instead of a tag id.
token_labels[0] = -100
token_labels = [
    label if mask == 1 else -100
    for label, mask in zip(token_labels, attention_mask)
]

# Polarity strings -> class ids.
polarities = list(map(sentiment2id.__getitem__, polarities))

print("Tokens:", tokens)
print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)
print("Aspect Spans:", aspect_spans)
print("Polarities:", polarities)
print("Token Labels:", [id2label[label] for label in token_labels if label != -100])

output = dict(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_labels=token_labels,
    aspect_spans=aspect_spans,
    polarities=polarities,
    original_text=data['text'],
)


Tokens: ['[CLS]', 'now', 'i', "'", 'm', 'really', 'bum', '##med', 'that', 'i', 'have', 'a', 'very', 'nice', 'looking', 'chrome', '##book', 'with', 'a', 'beautiful', 'screen', 'that', 'is', 'totally', 'un', '##usa', '##ble', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [None]:
def tokenize_and_align_v2(data, include_opinions=False, max_length=128):
    """Tokenize one ACOS record and project its aspect terms onto tokens.

    Args:
        data: Record dict with a 'text' string and a 'labels' list. Each label
            needs 'aspect' and 'polarity' (and 'opinion' when
            `include_opinions` is True).
        include_opinions: When False (default), each distinct aspect term is
            processed once and takes the polarity of its first label. When
            True, labels are de-duplicated on the (aspect, opinion) pair
            instead, so the same aspect span may be emitted more than once.
        max_length: Sequence length the tokenizer pads/truncates to.

    Returns:
        dict with keys:
            'input_ids', 'attention_mask': tokenizer output, `max_length` long.
            'labels': per-token tag ids from `label2id`; -100 at position 0
                and at every position whose attention mask is not 1.
            'aspects_index': list of [first_token_idx, last_token_idx]
                (inclusive) for every aspect occurrence found in the text.
            'aspects_sentiment': `sentiment2id` ids, parallel to
                'aspects_index'.

    Raises:
        KeyError: If a polarity string is not a key of `sentiment2id`.
    """
    text = data['text']
    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    offset_mapping = encoding['offset_mapping']
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    token_labels = [label2id['O']] * len(input_ids)
    aspect_spans = []
    polarities = []

    # Lower-case the text once; every aspect is searched in the same string.
    lowered_text = text.lower()

    def find_all_spans(word):
        """Character (start, end) spans of each non-overlapping match of `word`."""
        if not word:
            # str.find("") matches everywhere without advancing, which would
            # make the search loop below spin forever.
            return []
        needle = word.lower()
        spans = []
        start = lowered_text.find(needle)
        while start != -1:
            end = start + len(word)
            spans.append((start, end))
            start = lowered_text.find(needle, end)
        return spans

    seen_aspects = set()

    for label_entry in data['labels']:
        aspect_word = label_entry['aspect']
        polarity = label_entry['polarity']

        if include_opinions:
            key = (aspect_word.lower(), label_entry['opinion'].lower())
        else:
            key = aspect_word.lower()

        if key in seen_aspects:
            continue
        seen_aspects.add(key)

        for asp_start, asp_end in find_all_spans(aspect_word):
            # Tokens with a non-empty offset range that lie entirely inside
            # the character span; empty ranges are skipped.
            covered = [
                idx
                for idx, (tok_start, tok_end) in enumerate(offset_mapping)
                if tok_start != tok_end
                and tok_start >= asp_start
                and tok_end <= asp_end
            ]
            if not covered:
                # No token lies fully inside the span (e.g. the match fell in
                # the truncated part of the text).
                continue

            start_token_idx, end_token_idx = covered[0], covered[-1]
            aspect_spans.append([start_token_idx, end_token_idx])
            polarities.append(polarity)

            # Also label tokens for the extraction task (BIO scheme).
            token_labels[start_token_idx] = label2id['B-ASP']
            for i in range(start_token_idx + 1, end_token_idx + 1):
                token_labels[i] = label2id['I-ASP']

    # Exclude position 0 and all padded positions from the tagging labels.
    # NOTE(review): the trailing [SEP] token presumably has attention_mask == 1
    # and therefore keeps label "O" rather than -100 - confirm this is intended.
    token_labels[0] = -100
    token_labels = [
        label if mask == 1 else -100
        for label, mask in zip(token_labels, attention_mask)
    ]

    polarities = [sentiment2id[polarity] for polarity in polarities]
    output = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': token_labels,
        'aspects_index': aspect_spans,
        'aspects_sentiment': polarities,
    }
    return output


In [102]:
# Tokenize every record of every split, collect the rows in a plain list, and
# build the frame with a single constructor call. This avoids creating and
# stacking one single-row DataFrame per record, and replaces three copy-pasted
# loops with one.
records = []
for split_name, split_lines in (
    ("train", train_lines),
    ("test", test_lines),
    ("val", val_lines),
):
    for line in split_lines:
        tokenized = tokenize_and_align_v2(line)
        records.append({
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": tokenized["labels"],
            "aspects_index": tokenized["aspects_index"],
            "aspects_sentiment": tokenized["aspects_sentiment"],
            "type": split_name,
        })

# Explicit schema: column dtypes no longer depend on per-row inference, so a
# record with an empty aspect list still yields list[list[i64]] / list[i64].
df = pl.DataFrame(
    records,
    schema={
        "input_ids": pl.List(pl.Int64),
        "attention_mask": pl.List(pl.Int64),
        "labels": pl.List(pl.Int64),
        "aspects_index": pl.List(pl.List(pl.Int64)),
        "aspects_sentiment": pl.List(pl.Int64),
        "type": pl.Utf8,
    },
)

In [103]:
# Spot-check 10 randomly drawn rows (unseeded, so the sample differs per run).
df.sample(10)

input_ids,attention_mask,labels,aspects_index,aspects_sentiment,type
list[i64],list[i64],list[i64],list[list[i64]],list[i64],str
"[101, 2043, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[9, 11], [3, 3]]","[0, 0]","""test"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 2], [7, 7]]","[1, 1]","""train"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 2], [20, 25]]","[0, 2]","""train"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 3]]",[1],"""train"""
"[101, 5341, … 0]","[1, 1, … 0]","[-100, 1, … -100]","[[1, 2]]",[1],"""val"""
"[101, 2096, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[7, 7]]",[1],"""test"""
"[101, 2023, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[17, 22]]",[0],"""train"""
"[101, 5983, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[5, 5]]",[1],"""val"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 2], [7, 7]]","[1, 1]","""test"""
"[101, 2049, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[4, 4], [32, 32], [37, 37]]","[0, 0, 1]","""train"""


In [108]:
# Number of rows belonging to the test split.
len(df.filter(pl.col("type")== 'test'))

1827

In [109]:
# Keep only rows for which at least one aspect span was aligned to tokens.
temp=df.filter(pl.col("aspects_index").list.len() > 0)
temp

input_ids,attention_mask,labels,aspects_index,aspects_sentiment,type
list[i64],list[i64],list[i64],list[list[i64]],list[i64],str
"[101, 9078, … 0]","[1, 1, … 0]","[-100, 1, … -100]","[[1, 2]]",[2],"""train"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[19, 20]]",[0],"""train"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 2]]",[0],"""train"""
"[101, 2255, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[13, 14]]",[0],"""train"""
"[101, 2823, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[21, 21]]",[0],"""train"""
…,…,…,…,…,…
"[101, 1045, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[4, 5]]",[1],"""val"""
"[101, 2079, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[8, 8]]",[0],"""val"""
"[101, 1996, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[2, 2], [7, 7]]","[0, 0]","""val"""
"[101, 9467, … 0]","[1, 1, … 0]","[-100, 0, … -100]","[[9, 9], [14, 15]]","[0, 0]","""val"""


In [110]:
# Row count of the filtered frame.
len(temp)

7705

In [111]:
# Persist the filtered frame (all splits, distinguished by the "type" column).
temp.write_parquet("../data/processed/df_aspect_pos.parquet")