In [1]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [2]:
%cd VietInfoUITNLP/vietinfo_uitnlp_api

/content/drive/MyDrive/VietInfoUITNLP/vietinfo_uitnlp_api


In [3]:
!unzip vietinfo_uitnlp_data.zip

Archive:  vietinfo_uitnlp_data.zip
   creating: vietinfo_uitnlp_data/
   creating: vietinfo_uitnlp_data/env_docker_cpu/
  inflating: vietinfo_uitnlp_data/env_docker_cpu/odbcinst.ini  
  inflating: vietinfo_uitnlp_data/env_docker_cpu/Dockerfile  
   creating: vietinfo_uitnlp_data/data/
   creating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/vanbansudung_ids.txt  
   creating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/part_1/
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/part_1/documents.json  
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/part_1/test.json  
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/part_1/dev.json  
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_4/part_1/train.json  
   creating: vietinfo_uitnlp_data/data/sogtvt_new_fold_5/
  inflating: vietinfo_uitnlp_data/data/sogtvt_new_fold_5/vanbansudung_ids.txt  
   creating: vietinfo_uitnlp_data/data/sogtvt_new_fold_5/part_1/

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, Sampler
import numpy as np
import pandas as pd
import os
from torch import nn
from tqdm import tqdm

In [None]:
def pad(tensors, padding_value=0):
    """Stack variable-sized tensors into one batch tensor, padding the tail.

    Each tensor is copied into the leading corner of its own slot in the
    output; positions it does not cover hold ``padding_value``.

    Args:
        tensors: Non-empty sequence of tensors. The number of dimensions is
            taken from ``tensors[0]``; sizes along each dimension may differ.
        padding_value: Fill value for positions not covered by a tensor.

    Returns:
        A tensor of shape ``(len(tensors), max_size_0, max_size_1, ...)``
        created from ``tensors[0]`` (same dtype and device).

    Raises:
        IndexError: If ``tensors`` is empty.
    """
    first = tensors[0]
    max_sizes = [max(tensor.size(dim) for tensor in tensors)
                 for dim in range(first.dim())]
    out_tensor = first.new_full([len(tensors)] + max_sizes, padding_value)
    for row, tensor in enumerate(tensors):
        # Index with a tuple: a *list* of slices is a deprecated way of
        # spelling a multidimensional index.
        region = (row,) + tuple(slice(0, extent) for extent in tensor.size())
        out_tensor[region] = tensor
    return out_tensor


class TextDataset(Dataset):
    """Map-style dataset that exposes the rows of a DataFrame as tuples.

    Attributes:
        df: The wrapped DataFrame.
        col_names: Column labels of ``df``; every item lists its fields in
            this order.
        col_text_names: The column labels that start with ``'text_'``.
        pad_index: Padding id kept for consumers of the dataset (the dataset
            itself never pads).
    """

    def __init__(self, df, pad_index):
        super().__init__()
        self.df = df
        self.col_names = df.columns
        self.col_text_names = [col_name for col_name in self.col_names if col_name.startswith('text_')]
        self.pad_index = pad_index

    def __getitem__(self, index):
        """Return the row at position ``index`` as a tuple ordered like ``col_names``.

        A concrete tuple is returned instead of a lazy generator so that a
        sample can be pickled (needed once ``num_workers > 0``) and so the
        row is looked up only once rather than once per column.
        """
        row = self.df.iloc[index]
        return tuple(row[col_name] for col_name in self.col_names)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    @property
    def loader(self):
        """The data loader previously attached through the ``loader`` setter.

        Raises:
            AttributeError: If no loader has been attached yet.
        """
        if hasattr(self, 'data_loader'):
            return self.data_loader
        raise AttributeError('no data loader has been attached to this dataset')

    @loader.setter
    def loader(self, data_loader):
        self.data_loader = data_loader

    @classmethod
    def collate_fn(cls, batch):
        """Transpose a list of row tuples into one tuple per column.

        Returns a materialised tuple (not a generator) so the collated batch
        can be sent between processes.
        """
        return tuple(zip(*batch))


class TextDataLoader(DataLoader):
    """DataLoader that converts collated column fields into a dict of tensors.

    The column names and the padding id are read from the dataset the loader
    was built on.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        source = self.dataset
        self.col_names = source.col_names
        self.pad_index = source.pad_index

    def __iter__(self):
        """Yield one ``{column name: tensor}`` dict per batch.

        Columns whose name starts with ``'text_a'`` are padded with ``pad``;
        the ``'label_a'`` column is turned into a tensor. Both are moved to
        CUDA when it is available, otherwise to the CPU. Every other column
        is left out of the yielded dict.
        """
        for fields in super().__iter__():
            target = 'cuda' if torch.cuda.is_available() else 'cpu'
            tensors = {}
            # Fields arrive in the same order as the dataset's columns.
            for values, name in zip(fields, self.col_names):
                if name.startswith('text_a'):
                    tensors[name] = pad(values, self.pad_index).to(target)
                    continue
                if name == 'label_a':
                    tensors[name] = torch.tensor(values).to(target)

            yield tensors

In [None]:
def read_uit_vsfc(path):
    """Load one CSV split and reduce its labels to a binary ``label_a`` column.

    Rows whose ``label`` equals 1 are dropped, the remaining label 2 is
    re-mapped to 1 (label 0 is kept), and the column is renamed to
    ``label_a``. The surviving rows keep their original index values.
    NOTE(review): label 1 is presumably the neutral class - confirm against
    the dataset description.

    Args:
        path: Anything ``pandas.read_csv`` accepts; the file must contain a
            ``label`` column.

    Returns:
        A new DataFrame with ``label`` replaced by the binary ``label_a``.
    """
    df = pd.read_csv(path)
    # Work on an explicit copy of the filtered rows and assign the replaced
    # column back: ``df['label'].replace(..., inplace=True)`` mutates a
    # temporary object and is a silent no-op under pandas Copy-on-Write.
    df = df[df['label'] != 1].copy()
    df['label'] = df['label'].replace(2, 1)
    return df.rename(columns={'label': 'label_a'})

# Load the three splits with the binary relabelling applied.
df_train = read_uit_vsfc('train.csv')
df_dev = read_uit_vsfc('dev.csv')
df_test = read_uit_vsfc('test.csv')

# The special tokens take the first vocabulary ids, in this order.
idx2word = ['<s>', '</s>', '<pad>', '<unk>']
word2idx = dict(zip(idx2word, range(len(idx2word))))

# Frequency of every space-separated token in the training sentences.
train_words = {}
for sentence in df_train['sentence']:
    for word in sentence.split(' '):
        train_words[word] = train_words.get(word, 0) + 1

# Only tokens seen more than once in training receive their own id; ids are
# handed out in order of first appearance.
for word in train_words:
    if train_words[word] > 1 and word not in word2idx:
        word2idx[word] = len(word2idx)
        idx2word.append(word)

In [None]:
# Add a ``text_a`` column of token-id tensors to every split. Tokens are
# obtained by splitting on single spaces; tokens without a vocabulary entry
# are mapped to the id of '<unk>'.
for frame in (df_train, df_dev, df_test):
    frame['text_a'] = frame['sentence'].apply(
        lambda inp: torch.tensor([word2idx.get(word, word2idx['<unk>'])
                                  for word in inp.split(' ')])
    )

In [None]:
# Training hyper-parameters.
learning_rate = 1e-3  # step size handed to the optimizer
batch_size = 64       # samples per mini-batch for every loader
epochs = 5            # full passes over the training split

# Fix the torch RNG so weight initialisation and training-set shuffling are
# repeatable after Restart Kernel -> Run All.
seed = 42
torch.manual_seed(seed)

In [None]:
def _make_split(frame, shuffle):
    """Wrap ``frame`` in a TextDataset and build a TextDataLoader over it.

    Returns the ``(dataset, loader)`` pair. Batches use the notebook-level
    ``batch_size`` and are loaded in the main process.
    """
    dataset = TextDataset(df=frame, pad_index=word2idx['<pad>'])
    loader = TextDataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            collate_fn=dataset.collate_fn,
                            num_workers=0)
    return dataset, loader


# Only the training split is shuffled; dev and test keep their row order.
dataset_train, dataloader_train = _make_split(df_train, shuffle=True)
dataset_dev, dataloader_dev = _make_split(df_dev, shuffle=False)
dataset_test, dataloader_test = _make_split(df_test, shuffle=False)

In [None]:
class Model(nn.Module):
    """Hand-written tanh recurrent network with a sigmoid binary classifier.

    The hidden state is twice as wide as the token embedding. ``config``
    must provide ``'num_embeddings'``, ``'embedding_dim'`` and
    ``'pad_index'``.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        embedding_dim = config['embedding_dim']
        hidden_dim = 2 * embedding_dim

        self.embedding = nn.Embedding(config['num_embeddings'], embedding_dim)

        # Recurrence: h_t = tanh(W_x x_t + W_h h_{t-1} + b)
        self.W_x = nn.Linear(embedding_dim, hidden_dim)
        self.W_h = nn.Linear(hidden_dim, hidden_dim)
        self.b = nn.parameter.Parameter(torch.rand(hidden_dim, dtype=torch.float32))
        self.act_funct = nn.Tanh()

        # Learnable initial hidden state, shared by every sequence.
        self.h_0 = nn.parameter.Parameter(torch.zeros(hidden_dim, dtype=torch.float32))

        self.cls = nn.Linear(hidden_dim, 1)

    def forward(self, batch):
        """Score every sequence in ``batch['text_a']``.

        Args:
            batch: Mapping whose ``'text_a'`` entry holds token ids of shape
                ``(batch_size, max_seq_length)``.

        Returns:
            Tensor of shape ``(batch_size, 1)`` with sigmoid outputs.

        The hidden state used for classification is the one at position
        ``(number of non-pad tokens) - 1``.
        NOTE(review): this assumes padding only ever trails the real tokens.
        """
        token_ids = batch['text_a']
        n_rows = token_ids.shape[0]
        hidden_dim = 2 * self.config['embedding_dim']

        state = self.h_0.expand(n_rows, hidden_dim)
        embedded = self.embedding(token_ids)
        bias = self.b.expand(n_rows, hidden_dim)

        # Unroll the recurrence over the padded length, keeping every state.
        states = []
        for step in range(token_ids.shape[1]):
            state = self.act_funct(self.W_x(embedded[:, step]) + self.W_h(state) + bias)
            states.append(state)
        states = torch.stack(states, dim=1)

        # Count of non-pad tokens per row.
        lens = (token_ids != self.config['pad_index']).sum(dim=1)

        final_states = torch.stack(
            [states[row, end - 1, :] for row, end in enumerate(lens)]
        )

        return torch.sigmoid(self.cls(final_states))

In [None]:
# Settings read by ``Model``: vocabulary size, embedding width and the id
# that marks padding positions.
config = dict(
    num_embeddings=len(word2idx),
    embedding_dim=25,
    pad_index=word2idx['<pad>'],
)

In [None]:
# TextDataLoader moves every batch to CUDA when it is available, so the model
# has to live on that same device or the forward pass fails with a device
# mismatch. The move happens before the optimizer collects the parameters.
model = Model(config).to('cuda' if torch.cuda.is_available() else 'cpu')
# Binary cross-entropy on the probabilities the model outputs.
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
size = len(dataloader_train.dataset)

for epoch in range(epochs):
    seen = 0  # training samples consumed so far in this epoch
    for batch_idx, batch in enumerate(dataloader_train):
        model.train()
        # Squeeze only the trailing dimension so a final batch holding a
        # single sample keeps shape (1,) and still matches its target.
        out = model(batch).squeeze(-1)
        loss = loss_func(out, batch['label_a'].float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # ``len(batch)`` would count the dict's keys, not its samples.
        seen += len(batch['label_a'])

        # Report dev accuracy once every 100 steps (and on the first step).
        if batch_idx % 100 == 0:
            nb_correct = 0
            nb_total = 0
            model.eval()
            # No gradients are needed while scoring the dev split.
            with torch.no_grad():
                for dev_batch in dataloader_dev:
                    pred = model(dev_batch).squeeze(-1) > 0.5
                    nb_correct += (dev_batch['label_a'] == pred).sum().item()
                    nb_total += len(dev_batch['label_a'])

            current = seen
            # Guard against an empty dev split.
            acc = nb_correct / max(nb_total, 1)
            print(f"acc: {acc:>7f}  [{current:>5d}/{size:>5d}]")