In [1]:
import torch
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 128

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
env = ".env"

try:
    with open(env, "r") as file:
        AUTH_TOKEN = file.read()
except FileNotFoundError:
    print(f"The file {env} does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")


# MRC_PATH = 'nguyenvulebinh/vi-mrc-base'
PHOBERT_PATH = 'vinai/phobert-base'

In [4]:
TRAIN_PATH = 'data/sentiment_analysis_data/train.jsonl'
DEV_PATH = 'data/sentiment_analysis_data/dev.jsonl'
TEST_PATH = 'data/sentiment_analysis_data/test.jsonl'

# Prepare material

In [5]:
# function read jsonl file as dataframe
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    data = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                json_obj = json.loads(line)
                data.append(json_obj)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON: {e}")

    df = pd.DataFrame(data)

    return df

In [6]:
# load train and dev data
df_train = read_jsonl_to_dataframe(TRAIN_PATH)
df_dev = read_jsonl_to_dataframe(DEV_PATH)

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(df_train['sentiment'].values)

In [8]:
def split_text(text):
    return text.split()

df_train.text.map(split_text).map(len).describe()

count    24771.000000
mean         6.477615
std          5.680464
min          1.000000
25%          3.000000
50%          5.000000
75%          8.000000
max         80.000000
Name: text, dtype: float64

In [9]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_PATH, use_auth_token=AUTH_TOKEN)



In [10]:
# number of tokens of pre-trained tokenizer
len(tokenizer)

64001

# Prepare data

In [46]:
class SentimentAnalysisDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=MAX_LEN):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        label = self.data.sentiment[index]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }
        
    
    def __len__(self):
        return self.len

In [47]:
# create train and dev dataset
train_dataset = SentimentAnalysisDataset(df_train, tokenizer)
dev_dataset = SentimentAnalysisDataset(df_dev, tokenizer)

In [22]:
def print_sample(dataset, index):
    
    sample = dataset[index]
    ids = sample['ids']
    mask = sample['mask']
    token_type_ids = sample['token_type_ids']
    label = sample['target']

    print(f"""index: {index}
ids: {ids}
mask: {mask}
token_type_ids: {token_type_ids}
label: {label}""")

print_sample(train_dataset, 1)

index: 1
ids: tensor([   0, 1901, 4752,    2,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])
mask: tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [48]:
# function to create dataloader
def create_data_loader(datasets, params):

    return DataLoader(
        datasets,
        **params
    )

# define dataloader params
train_params = {
    'batch_size': 16,
    'shuffle': True,
    'num_workers': 0
}

dev_params = {
    'batch_size': 8,
    'shuffle': False,
    'num_workers': 0
}

# create dataloader
train_dataloader = create_data_loader(train_dataset, train_params)
dev_dataloader = create_data_loader(dev_dataset, dev_params)

# Define model

Here we define the model, BertForTokenClassification, and load it with the pretrained weights of "bert-base-uncased". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head of top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell below.

Then, we move the model to the GPU.

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(PHOBERT_PATH,
                                                   num_labels=label_encoder.classes_.shape[0],
                                                   use_auth_token=AUTH_TOKEN)

model.to(device)
model.eval()

In [35]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


def loss_fn(logits, targets):
    """
    outputs: (batch_size, num_labels)
    targets: (batch_size,)
    """


    # reshape
    logits = logits.view(-1, logits.shape[-1])
    targets = targets.view(-1)

    # compute cross entropy loss
    return nn.CrossEntropyLoss()(logits, targets)

def accuracy_f1(logits, targets):
    """
    outputs: (batch_size, num_labels)
    targets: (batch_size, 1)
    """
    # reshape to (batch_size * seq_len, num_labels)
    logits = logits.view(-1, logits.shape[-1])
    targets = targets.view(-1)

    # compute accuracy
    preds = torch.argmax(logits, dim=1)
    acc = accuracy_score(targets.cpu().numpy(), preds.cpu().numpy())

    # compute f1 score
    f1 = f1_score(targets.cpu().numpy(), preds.cpu().numpy(), average='macro')

    return acc, f1

In [54]:
# sample batch
sample_batch = next(iter(train_dataloader))

out = model(
    input_ids=sample_batch['ids'].to(device),
    attention_mask=sample_batch['mask'].to(device),
    token_type_ids=sample_batch['token_type_ids'].to(device)
)

loss = loss_fn(out.logits, sample_batch['targets'].to(device))
acc, f1 = accuracy_f1(out.logits, sample_batch['targets'].to(device))

print(f"loss: {loss}")
print(f"acc: {acc}")
print(f"f1: {f1}")

loss: 1.1246957778930664
acc: 0.3125
f1: 0.24603174603174605


In [42]:
# optimizer
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim

def optimizer_scheduler(model, num_train_steps, lr=5e-5):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", 'LayerNorm.bias', "LayerNorm.weight"]
    optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

    opt = optim.AdamW(optimizer_parameters, lr=lr)
    sch = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(0.05*num_train_steps),
        num_training_steps=num_train_steps,
        last_epoch=-1,
    )
    return opt, sch

In [43]:
# prarams for training
epochs = 15
accumulation_steps = 4

total_steps = len(train_dataloader) * epochs // accumulation_steps
optimizer, scheduler = optimizer_scheduler(model, total_steps)

In [56]:
def train(epoch):

    loss, accuracy, f1_score = 0, 0, 0
    train_steps = len(train_dataloader) // accumulation_steps
    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), total=train_steps):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        # forward pass
        outputs = model(ids, mask, token_type_ids=token_type_ids)
        logits = outputs.logits

        # compute loss
        batch_loss = loss_fn(logits, targets)

        if step % 100 == 0:
            print(f"Batch loss of epoch {epoch} at step {step}: {batch_loss.item()}")
        
        # backward pass
        batch_loss /= accumulation_steps
        batch_loss.backward()

        # update weights
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        # compute accuracy and f1 score
        batch_acc, batch_f1 = accuracy_f1(logits, targets)

        loss += batch_loss.item()
        accuracy += batch_acc.item()
        f1_score += batch_f1     

    return loss / train_steps, accuracy / train_steps, f1_score / train_steps

def evaluate(epoch):

    with torch.no_grad():
        loss, accuracy, f1_score = 0, 0, 0
        dev_steps = len(dev_dataloader)
        model.eval()
        for step, batch in tqdm(enumerate(dev_dataloader), total=dev_steps):

            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            # forward pass
            outputs = model(ids, mask, token_type_ids=token_type_ids)
            logits = outputs.logits

            # compute loss
            batch_loss = loss_fn(logits, targets)

            # compute accuracy and f1 score
            batch_acc, batch_f1 = accuracy_f1(logits, targets)

            loss += batch_loss.item()
            accuracy += batch_acc.item()
            f1_score += batch_f1     

    return loss / dev_steps, accuracy / dev_steps, f1_score / dev_steps

In [57]:
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_loss, train_acc, train_f1 = train(epoch)
    print(f"Train loss: {train_loss}, Train accuracy: {train_acc}, Train F1 score: {train_f1}")
    dev_loss, dev_acc, dev_f1 = evaluate(epoch)
    print(f"Dev loss: {dev_loss}, Dev accuracy: {dev_acc}, Dev F1 score: {dev_f1}")

Epoch 1 of 10


  0%|          | 0/387 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacty of 2.00 GiB of which 0 bytes is free. Of the allocated memory 5.23 GiB is allocated by PyTorch, and 79.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# save model
model.save_pretrained('model/sentiment_analysis_bert_base')

# End