In [None]:
# # Param
# test = "True"

In [None]:
# `test` is a run parameter whose default is shown, commented out, in the cell above
# (presumably injected by a notebook runner -- confirm). Fall back to that default when
# it has not been defined, so the notebook also runs on a fresh kernel.
try:
    test
except NameError:
    test = "True"

# Only the string "False" or a value equal to boolean False switches test mode off;
# every other value keeps the quick test configuration on.
is_test = test not in ("False", False)

In [2]:
# Dataset identifier: passed to the project DataLoader and used as the first
# argument of join_path when the trained models are saved.
dataset = "data"

In [3]:
from utils.data_loader import DataLoader

In [4]:
# Build the project-local loader (utils.data_loader.DataLoader) for the configured dataset.
# This cell must run before the later `from torch.utils.data import Dataset, DataLoader`
# cell, which rebinds the global name `DataLoader`.
# NOTE(review): `data_loadet` looks like a typo for `data_loader`; the name is kept
# because the next cell refers to it.
data_loadet = DataLoader(dataset)

In [5]:
# Load the two labelled tables used for training. Later cells read the columns
# 'text', 'score' and 'keyword' (keyword table) / 'abstract_concept' (abstract table).
keyword_wl_df = data_loadet.get_keyword_wl_data()
abstract_wl_df = data_loadet.get_abstract_wl_data()

In [6]:
# Sanity check: display the shape of both loaded tables.
keyword_wl_df.shape, abstract_wl_df.shape

((6250, 3), (1250, 3))

In [None]:
# Per-class sample size for the keyword data: the size of the smallest 'score'
# class, capped at 15_000 rows per class.
kw_class_counts = keyword_wl_df.value_counts('score')
kw_min_samp = min(int(kw_class_counts.min()), 15_000)
# Last expression of the cell, so the notebook displays the final (capped) value.
kw_min_samp

692

In [None]:
# Per-class sample size for the abstract data: the size of the smallest 'score'
# class, capped at 15_000 rows per class.
ab_class_counts = abstract_wl_df.value_counts('score')
ab_min_samp = min(int(ab_class_counts.min()), 15_000)
# Last expression of the cell, so the notebook displays the final (capped) value.
ab_min_samp

404

In [14]:
# Draw the same number of rows from every 'score' class so the classes are balanced.
# Test runs take at most 10 rows per class (fewer if the smallest class is smaller,
# which would otherwise make sampling without replacement fail); full runs take the
# capped smallest-class size computed above.
kw_per_class = min(10, kw_min_samp) if is_test else kw_min_samp
ab_per_class = min(10, ab_min_samp) if is_test else ab_min_samp

# A fixed random_state makes the draw reproducible across kernel restarts; 42 matches
# the seed train_nli passes to train_test_split.
# NOTE: this cell rebinds keyword_wl_df / abstract_wl_df, so re-running it samples from
# the already-sampled frames; re-run the loading cell first to start from the full data.
keyword_wl_df = keyword_wl_df.groupby('score').sample(kw_per_class, random_state=42)
abstract_wl_df = abstract_wl_df.groupby('score').sample(ab_per_class, random_state=42)

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [10]:
class NLIDataset(Dataset):
    """Torch dataset of (text, hypothesis) pairs with one label per pair.

    All pairs are tokenized once, up front, in the constructor. The tokenizer is
    called with ``truncation=True``, ``padding=True`` and ``max_length=max_len``.
    Each item is a dict holding one tensor per tokenizer output field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, texts, hypotheses, labels, tokenizer, max_len=128):
        """Tokenize every text/hypothesis pair and keep the labels alongside."""
        self.encodings = tokenizer(
            texts,
            hypotheses,
            truncation=True,
            padding=True,
            max_length=max_len,
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields and label of sample ``idx`` as tensors."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

In [11]:
def compute_accuracy(model, dataloader, device):
    """Return the accuracy of ``model`` over every batch in ``dataloader``.

    The model is switched to eval mode (and left in it) and gradients are disabled.
    Each batch is a dict of tensors that is moved to ``device`` and passed to the
    model as keyword arguments; the predicted class is the argmax of the logits
    along dimension 1 and is compared with the batch's ``'labels'`` entry.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for raw_batch in dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(inputs['labels'].cpu().numpy())
    return accuracy_score(all_labels, all_preds)

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU, and print the choice.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [12]:
def train_nli(
    df, text_column='text', hypothesis_column='hypothesis', label_column='score',
    model_name='bert-base-uncased',
    epochs=10, batch_size=16, lr=2e-5, max_len=128, patience=2
):
    """Fine-tune a BERT sequence-pair classifier on ``df`` with early stopping.

    Args:
        df: Table with one row per (text, hypothesis, label) example.
        text_column: Column holding the first sequence of each pair.
        hypothesis_column: Column holding the second sequence of each pair.
        label_column: Column holding the class label of each pair.
        model_name: Pretrained checkpoint used for both tokenizer and model.
        epochs: Maximum number of training epochs.
        batch_size: Batch size of the train and validation loaders.
        lr: Learning rate passed to AdamW.
        max_len: ``max_len`` handed to ``NLIDataset`` for tokenization.
        patience: Number of consecutive epochs without a validation-loss
            improvement after which training stops.

    Returns:
        ``(model, tokenizer)``. The model carries the weights of the epoch with the
        lowest validation loss, both when training stops early and when it runs
        through all epochs. After at least one epoch the model is in eval mode.

    Notes:
        10% of ``df`` is held out for validation with ``random_state=42``. The
        classifier head is created with ``num_labels=2``. A summary line with
        losses and accuracies is printed after every epoch.
    """
    import copy

    # Imported locally under an alias because the notebook also imports a project
    # class named ``DataLoader`` (utils.data_loader); the global name belongs to
    # whichever import cell ran last, and this function needs the PyTorch one.
    from torch.utils.data import DataLoader as TorchDataLoader

    # Split into train/validation sets
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    # Tokenizer & Datasets
    tokenizer = BertTokenizer.from_pretrained(model_name)
    train_dataset = NLIDataset(train_df[text_column].tolist(), train_df[hypothesis_column].tolist(), train_df[label_column].tolist(), tokenizer, max_len)
    val_dataset = NLIDataset(val_df[text_column].tolist(), val_df[hypothesis_column].tolist(), val_df[label_column].tolist(), tokenizer, max_len)

    train_loader = TorchDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = TorchDataLoader(val_dataset, batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    # Early stopping variables
    best_val_loss = float('inf')
    no_improve_epochs = 0
    best_model_state = None

    for epoch in range(epochs):
        # === TRAIN ===
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Extra pass over the training data in eval mode to measure train accuracy
        train_acc = compute_accuracy(model, train_loader, device)

        # === VALIDATION ===
        model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_val_loss += loss.item()

        # Losses are averaged per batch, not per sample
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        val_acc = compute_accuracy(model, val_loader, device)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        # === EARLY STOPPING ===
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            no_improve_epochs = 0
            # state_dict() hands back references to the live parameter tensors, which
            # later optimizer steps overwrite in place; deep-copy to keep a real snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping triggered after {patience} epochs without val_loss improvement.")
                break

    # Restore the best snapshot whether training stopped early or used up all epochs;
    # otherwise a final, worse epoch would be what gets returned.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, tokenizer


In [13]:
import os
from transformers import BertTokenizer, BertForSequenceClassification

def save_model(model, tokenizer, path):
    """Write ``model`` and then ``tokenizer`` to the directory ``path``.

    The directory (including missing parents) is created if it does not exist yet.
    Both objects are saved through their own ``save_pretrained(path)`` method.
    """
    os.makedirs(path, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(path)

def load_model(path):
    """Load the classifier and its tokenizer from ``path``.

    Returns:
        ``(model, tokenizer)`` as a ``BertForSequenceClassification`` and a
        ``BertTokenizer``, both created with ``from_pretrained(path)``.
    """
    return (
        BertForSequenceClassification.from_pretrained(path),
        BertTokenizer.from_pretrained(path),
    )

In [14]:
# Training schedule: a short 2-epoch run in test mode, otherwise up to 10 epochs.
# `patience` is the early-stopping patience handed to train_nli.
patience = 3
n_epochs = 2 if is_test else 10

In [15]:
# Train the keyword scorer: each example pairs the 'text' column with the 'keyword'
# column of keyword_wl_df and is labelled by 'score'. Batch size and learning rate
# override the train_nli defaults (16 and 2e-5).
keyword_model, keyword_tokenizer = train_nli(
    keyword_wl_df,
    text_column='text',
    hypothesis_column='keyword',
    label_column='score',
    epochs=n_epochs,
    batch_size=32,
    lr=3e-5,
    patience=patience
)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1 | Train Loss: 0.7121 | Val Loss: 0.7347 | Train Acc: 0.5444 | Val Acc: 0.3000


Epoch 2 - Training: 100%|██████████| 3/3 [00:13<00:00,  4.52s/it]


Epoch 2 | Train Loss: 0.6877 | Val Loss: 0.7023 | Train Acc: 0.6111 | Val Acc: 0.4000


In [16]:
# Train the abstract-concept scorer: each example pairs the 'text' column with the
# 'abstract_concept' column of abstract_wl_df and is labelled by 'score'. Batch size
# and learning rate override the train_nli defaults (16 and 2e-5).
abstract_model, abstract_tokenizer = train_nli(
    abstract_wl_df,
    text_column='text',
    hypothesis_column='abstract_concept',
    label_column='score',
    epochs=n_epochs,
    batch_size=32,
    lr=3e-5,
    patience=patience
)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Epoch 1 | Train Loss: 0.7030 | Val Loss: 0.6497 | Train Acc: 0.6778 | Val Acc: 0.6000


Epoch 2 - Training: 100%|██████████| 3/3 [00:14<00:00,  4.79s/it]


Epoch 2 | Train Loss: 0.6387 | Val Loss: 0.5844 | Train Acc: 0.8000 | Val Acc: 0.8000


In [17]:
from utils.data_io import join_path

In [18]:
# Persist the keyword scorer (model + tokenizer) to the location built by join_path
# from dataset, 'scorer_model' and 'keyword_scorer'.
# NOTE(review): assumes join_path joins its arguments into one path -- confirm in utils.data_io.
save_model(keyword_model, keyword_tokenizer, join_path(dataset, 'scorer_model', 'keyword_scorer'))

In [19]:
# Persist the abstract-concept scorer (model + tokenizer) to the location built by
# join_path from dataset, 'scorer_model' and 'abstract_scorer'.
# NOTE(review): assumes join_path joins its arguments into one path -- confirm in utils.data_io.
save_model(abstract_model, abstract_tokenizer, join_path(dataset, 'scorer_model', 'abstract_scorer'))