In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, get_linear_schedule_with_warmup

from models.model import SpanAsteModel
from models.metrics import SpanEvaluator, metrics
from models.losses import log_likelihood
from utils.tager import SpanLabel, RelationLabel, SentimentTriple, SentenceTagger

# Untuk mencegah hasil yang berbeda setiap kali dijalankan
def set_seed(seed):
    print(f"Setting random seed to {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    print("Random seed set successfully")

set_seed(42)

# Menggunakan GPU jika tersedia
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Kelas dataset kustom untuk memproses data triplet
class TripletsDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length
        print(f"Dataset created with {len(sentences)} sentences")
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        # Format data input: text####[([target_indices], [opinion_indices], sentiment)]
        sentence_data = self.sentences[idx]
        
        print(f"\nProcessing sentence: {sentence_data}")
        
        # Memisahkan text dan anotasi
        parts = sentence_data.split("####")
        text = parts[0]
        triplets_str = parts[1]
        
        # Parse triplets
        triplets = eval(triplets_str)
        print(f"Extracted triplets: {triplets}")
        
        # Tokenisasi input text
        tokens = text.split()
        print(f"Tokens: {tokens}")
        
        # Tokenize menggunakan BERT tokenizer
        inputs = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True)
        
        input_ids = inputs.input_ids
        attention_mask = inputs.attention_mask
        token_type_ids = inputs.token_type_ids
        
        print(f"Input IDs: {input_ids[:10]}...")
        print(f"Attention Mask: {attention_mask[:10]}...")
        
        # Konversi triplets ke format yang dibutuhkan model
        sentiment_triples = []
        for triplet in triplets:
            aspect_indices = triplet[0]
            opinion_indices = triplet[1]
            sentiment = triplet[2]
            
            # Mendapatkan indeks kata berdasarkan tokenisasi BERT
            # Dalam kasus real, ini memerlukan pemetaan yang tepat dari indeks token asal ke subwords BERT
            # Untuk sederhananya, kita menggunakan offset yang tetap sebagai contoh
            # Catatan: Dalam implementasi real perlu pemetaan yang tepat
            
            # Map target indices ke token BERT
            words = text.split()
            
            # Mendapatkan offset token untuk aspect dan opinion
            aspect_start_offset = 1  # Akun untuk token [CLS]
            for i in range(min(aspect_indices)):
                aspect_start_offset += 1
                
            aspect_end_offset = aspect_start_offset + len(aspect_indices) - 1
            
            opinion_start_offset = 1  # Akun untuk token [CLS]
            for i in range(min(opinion_indices)):
                opinion_start_offset += 1
                
            opinion_end_offset = opinion_start_offset + len(opinion_indices) - 1
            
            print(f"Aspect: {[words[i] for i in aspect_indices]} at positions {aspect_indices}")
            print(f"Opinion: {[words[i] for i in opinion_indices]} at positions {opinion_indices}")
            print(f"Mapped to BERT tokens - Aspect: [{aspect_start_offset}, {aspect_end_offset}], Opinion: [{opinion_start_offset}, {opinion_end_offset}]")
            
            # Konversi sentimen ke format numerik
            relation = {"POS": "POS", "NEG": "NEG", "NEU": "NEU"}
            sentiment_value = relation.get(sentiment, sentiment)
            
            # Buat objek SentimentTriple
            triple = SentimentTriple(
                aspect=[aspect_start_offset, aspect_end_offset],
                opinion=[opinion_start_offset, opinion_end_offset],
                sentiment=sentiment_value
            )
            sentiment_triples.append(triple)
        
        # Buat objek SentenceTagger untuk mendapatkan spans dan relations
        sentence_tagger = SentenceTagger(sentiment_triples)
        spans, span_labels = sentence_tagger.spans
        relations, relation_labels = sentence_tagger.relations
        
        print(f"Spans: {spans}")
        print(f"Span labels: {span_labels}")
        print(f"Relations: {relations}")
        print(f"Relation labels: {relation_labels}")
        
        seq_len = len([i for i in input_ids if i != 0])
        
        return input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len

# Fungsi kolasi untuk batch processing
def collate_fn(batch):
    input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len = zip(*batch)
    return input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len

# Fungsi untuk gold label (Mengidentifikasi label yang benar)
def gold_labels(span_indices, spans, span_labels):
    """
    Mengorganisir gold labels dan indices
    """
    print("\nCreating gold labels:")
    gold_indices, gold_labels = [], []
    for batch_idx, indices in enumerate(span_indices):
        gold_ind, gold_lab = [], []
        print(f"  Batch {batch_idx} - Processing {len(indices)} indices")
        for indice in indices:
            if indice in spans[batch_idx]:
                ix = spans[batch_idx].index(indice)
                gold_lab.append(span_labels[batch_idx][ix])
                print(f"    Found span {indice} with label {span_labels[batch_idx][ix]}")
            else:
                gold_lab.append(0)
                print(f"    No match for span {indice}, assigning label 0")
            gold_ind.append(indice)
        gold_indices.append(gold_ind)
        gold_labels.append(gold_lab)
    
    print(f"  Created {len(gold_indices)} gold indices groups")
    return gold_indices, gold_labels

# Fungsi evaluasi
def evaluate(model, metric, data_loader, device):
    """
    Mengevaluasi model pada dataset
    """
    print("\n=== Evaluation Started ===")
    model.eval()
    metric.reset()
    with torch.no_grad():
        for batch_ix, batch in enumerate(data_loader):
            print(f"\nEvaluating batch {batch_ix+1}")
            input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len = batch
            
            # Konversi ke tensor dan pindahkan ke device
            input_ids = torch.tensor(input_ids, device=device)
            attention_mask = torch.tensor(attention_mask, device=device)
            token_type_ids = torch.tensor(token_type_ids, device=device)
            
            print(f"Batch statistics - Sequences: {len(input_ids)}, Max length: {max(seq_len)}")
            
            # Forward pass
            spans_probability, span_indices, relations_probability, candidate_indices = model(
                input_ids, attention_mask, token_type_ids, seq_len)
            
            print(f"Generated {len(span_indices)} span indices")
            print(f"Spans probability shape: {spans_probability.shape}")
            print(f"Relations probability shape: {relations_probability.shape}")
            
            # Mendapatkan gold labels untuk spans dan relations
            gold_span_indices, gold_span_labels = gold_labels(span_indices, spans, span_labels)
            gold_relation_indices, gold_relation_labels = gold_labels(candidate_indices, relations, relation_labels)
            
            # Menghitung metrik
            num_correct, num_infer, num_label = metric.compute(relations_probability.cpu(),
                                                               torch.tensor(gold_relation_labels))
            
            print(f"Metrics - Correct: {num_correct}, Inferred: {num_infer}, Label: {num_label}")
            metric.update(num_correct, num_infer, num_label)
    
    precision, recall, f1 = metric.accumulate()
    print(f"Evaluation Results - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    model.train()
    return precision, recall, f1

# Contoh data
examples = [
    "apaan sih baru di download nggak bisa dibuka.####[([4], [5, 6, 7], 'NEG')]",
    "kalo ga bisa bikin aplikasi ga ush bikin nyusahin doang.####[([4], [1, 2, 3], 'NEG')]",
    "tolong google berikan opsi untuk 0 bintang , aplikasi asu.####[([8], [9], 'NEG')]",
    "scannya lamaaa.####[([0], [1], 'NEG')]",
    "aplikasi yang sangat bermanfaat bagi masyarakat kota yogyskarta.####[([0], [2, 3], 'POS')]"
]

# Inisialisasi tokenizer BERT
print("\n=== Initializing BERT Tokenizer ===")
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
print(f"Tokenizer vocabulary size: {len(tokenizer.vocab)}")

# Persiapkan dataset
print("\n=== Preparing Dataset ===")
dataset = TripletsDataset(examples, tokenizer, max_length=128)
train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Inisialisasi model
print("\n=== Initializing Span-ASTE Model ===")
target_dim = len(SpanLabel)
relation_dim = len(RelationLabel)
print(f"Target dimension: {target_dim}, Relation dimension: {relation_dim}")

model = SpanAsteModel(
    'indobenchmark/indobert-base-p2',
    target_dim,
    relation_dim,
    ffnn_hidden_dim=150,
    span_width_embedding_dim=20,
    span_maximum_length=8,
    span_pruned_threshold=0.5,
    pair_distance_embeddings_dim=128,
    device=device
)
model.to(device)

print("\nModel architecture:")
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Parameter pelatihan
num_epochs = 3
learning_rate = 5e-5
weight_decay = 0.01
warmup_proportion = 0.1

# Optimizer dan scheduler
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}
]

optimizer = optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=int(total_steps * warmup_proportion),
    num_training_steps=total_steps
)

# Metrik evaluasi
metric = SpanEvaluator()

# Pelatihan model
print("\n=== Starting Training ===")
global_step = 0
best_f1 = 0.0
training_loss = []

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    model.train()
    epoch_loss = 0.0
    
    for batch_idx, batch in enumerate(train_dataloader):
        print(f"\nTraining batch {batch_idx+1}/{len(train_dataloader)}")
        input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len = batch
        
        # Konversi ke tensor dan pindahkan ke device
        input_ids = torch.tensor(input_ids, device=device)
        attention_mask = torch.tensor(attention_mask, device=device)
        token_type_ids = torch.tensor(token_type_ids, device=device)
        
        print(f"Batch input shapes - IDs: {input_ids.shape}, Mask: {attention_mask.shape}")
        
        # Forward pass
        spans_probability, span_indices, relations_probability, candidate_indices = model(
            input_ids, attention_mask, token_type_ids, seq_len)
        
        print(f"Forward pass complete - Spans: {spans_probability.shape}, Relations: {relations_probability.shape}")
        
        # Mendapatkan gold labels
        gold_span_indices, gold_span_labels = gold_labels(span_indices, spans, span_labels)
        gold_relation_indices, gold_relation_labels = gold_labels(candidate_indices, relations, relation_labels)
        
        # Hitung loss
        loss_ner = log_likelihood(spans_probability, span_indices, gold_span_indices, gold_span_labels)
        loss_relation = log_likelihood(relations_probability, candidate_indices, gold_relation_indices, gold_relation_labels)
        
        # Total loss
        loss = 0.2 * loss_ner + loss_relation
        print(f"Loss calculation - NER: {loss_ner.item():.4f}, Relation: {loss_relation.item():.4f}, Total: {loss.item():.4f}")
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        # Simpan loss
        epoch_loss += loss.item()
        training_loss.append(loss.item())
        
        global_step += 1
        print(f"Step {global_step} completed - Learning rate: {scheduler.get_last_lr()[0]:.6f}")
        
        # Evaluasi pada setiap 2 batch (untuk contoh saja)
        if global_step % 2 == 0:
            print("\nPerforming evaluation after 2 batches")
            precision, recall, f1 = evaluate(model, metric, train_dataloader, device)
            
            print(f"Evaluation metrics - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
            
            if f1 > best_f1:
                best_f1 = f1
                print(f"New best F1 score: {best_f1:.4f}")
                
                # Simpan model
                # torch.save(model.state_dict(), "best_model.pt")
    
    # Evaluasi pada akhir epoch
    print(f"\n=== End of Epoch {epoch+1} ===")
    precision, recall, f1 = evaluate(model, metric, train_dataloader, device)
    
    print(f"Epoch {epoch+1} summary:")
    print(f"  Average loss: {epoch_loss / len(train_dataloader):.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1: {f1:.4f}")

print("\n=== Training Complete ===")
print(f"Best F1 score: {best_f1:.4f}")

# Inferensi pada contoh baru
print("\n=== Running Inference on New Example ===")
# Isi dengan contoh baru
new_example = "bukannya mempermudah , malah mempersulit . setelah mendaftar tidak bisa langsung login malah suruh nunggu aktivasi dari pihak desa , permasalahannya butuh berapa minggu untuk menunggu aktivasi dari desa ? katanya ngurus apa suruh online , giliran udah daftar online , eh masih aja nunggu .####[([11], [8, 9], 'NEG')]"

new_dataset = TripletsDataset([new_example], tokenizer, max_length=128)
new_dataloader = DataLoader(new_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    for batch in new_dataloader:
        input_ids, attention_mask, token_type_ids, spans, relations, span_labels, relation_labels, seq_len = batch
        
        # Konversi ke tensor dan pindahkan ke device
        input_ids = torch.tensor(input_ids, device=device)
        attention_mask = torch.tensor(attention_mask, device=device)
        token_type_ids = torch.tensor(token_type_ids, device=device)
        
        # Forward pass
        spans_probability, span_indices, relations_probability, candidate_indices = model(
            input_ids, attention_mask, token_type_ids, seq_len)
        
        # Print prediksi spans dengan probabilitas tertinggi
        for batch_idx in range(spans_probability.size(0)):
            print(f"\nTop predicted spans for batch {batch_idx+1}:")
            probs, indices = torch.topk(spans_probability[batch_idx, :, 1:], k=3, dim=1)  # Ambil top-3 spans yang bukan O
            probs = probs.cpu().numpy()
            indices = indices.cpu().numpy() + 1  # Tambahkan 1 karena kita mengabaikan label O
            
            for i, (prob, idx) in enumerate(zip(probs[:5], indices[:5])):  # Hanya tampilkan 5 spans pertama
                span = span_indices[batch_idx][i]
                label = SpanLabel(idx[0]).name if idx[0] < len(SpanLabel) else "Unknown"
                print(f"  Span {span} - Label: {label}, Probability: {prob[0]:.4f}")
        
        # Print prediksi relations dengan probabilitas tertinggi
        for batch_idx in range(relations_probability.size(0)):
            print(f"\nTop predicted relations for batch {batch_idx+1}:")
            probs, indices = torch.topk(relations_probability[batch_idx, :, 1:], k=3, dim=1)  # Ambil top-3 relations yang bukan INVALID
            probs = probs.cpu().numpy()
            indices = indices.cpu().numpy() + 1  # Tambahkan 1 karena kita mengabaikan label INVALID
            
            for i, (prob, idx) in enumerate(zip(probs[:5], indices[:5])):  # Hanya tampilkan 5 relations pertama
                if i < len(candidate_indices[batch_idx]):
                    relation = candidate_indices[batch_idx][i]
                    label = RelationLabel(idx[0]).name if idx[0] < len(RelationLabel) else "Unknown"
                    print(f"  Relation {relation} - Label: {label}, Probability: {prob[0]:.4f}")

print("\n=== Debug Process Complete ===")

  from .autonotebook import tqdm as notebook_tqdm


Setting random seed to 42
Random seed set successfully
Using device: cuda

=== Initializing BERT Tokenizer ===
Tokenizer vocabulary size: 30521

=== Preparing Dataset ===
Dataset created with 5 sentences

=== Initializing Span-ASTE Model ===
Target dimension: 3, Relation dimension: 4
Using BertModel for indobenchmark/indobert-base-p2

Model architecture:
bert.embeddings.word_embeddings.weight: torch.Size([50000, 768])
bert.embeddings.position_embeddings.weight: torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight: torch.Size([2, 768])
bert.embeddings.LayerNorm.weight: torch.Size([768])
bert.embeddings.LayerNorm.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight: torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias: torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight: torch.Size([768, 7

RuntimeError: selected index k out of range