In [64]:
!pip install transformers



In [65]:
from google.colab import files

uploaded = files.upload()


# Import lib

In [66]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import json
import os
from sklearn.preprocessing import LabelEncoder

# Processing Dataset

In [67]:
class ArticleDataset(Dataset):
    def __init__(self, articles, labels, tokenizer, max_length):
        self.articles = articles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'article_text': article,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Model

In [68]:
class ArticleClassifier(nn.Module):
    def __init__(self, n_classes):
        super(ArticleClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

# Load dataset and setup hyperparameters

In [69]:
print(os.getcwd())

/content


In [70]:
with open("raw_data_vietnambiz_1.json", 'r', encoding='utf-8') as f:
    articles = json.load(f)

In [71]:
labels = set()

for item in articles:
    label = item.get("Label")
    if label:
        labels.add(label)
print(len(labels))

352


In [72]:
articles = []
for item in articles:
    articles.append(item.get("Content"))

In [73]:
le = LabelEncoder()
labels = le.fit_transform([i for i in labels])
labels = [i for i in labels]

In [74]:
MAX_LENGTH = 512  # Adjust as needed
BATCH_SIZE = 32
N_CLASSES = len(labels)  # Replace with your actual number of classes
EPOCHS = 3

# Load phoBERT

In [75]:
# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Create dataset

In [76]:
# Create PyTorch Dataset
dataset = ArticleDataset(articles, labels, tokenizer, MAX_LENGTH)

In [77]:
# Create PyTorch DataLoader
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [78]:
print(data_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f7e5d1a8c40>


# Initialize Model

In [79]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [80]:
# Initialize our classifier
model = ArticleClassifier(N_CLASSES)
model = model.to(device)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Setup Loss function and optimizer

In [81]:
# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


# Train Model

In [84]:
# Training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()


Epoch 1/3
----------
Epoch 2/3
----------
Epoch 3/3
----------
