In [1]:
!git clone https://github.com/dqxiu/ParaSCI.git

fatal: destination path 'ParaSCI' already exists and is not an empty directory.


In [None]:
!pip install pandas torch transformers datasets

In [2]:
import pandas as pd

In [3]:
import torch

In [4]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

In [5]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [64]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

with open('./ParaSCI/Data/ParaSCI-ACL/train/train.src') as file:
    train_source = file.readlines()

with open('./ParaSCI/Data/ParaSCI-ACL/train/train.tgt') as file:
    train_target = file.readlines()
    
with open('./ParaSCI/Data/ParaSCI-ACL/test/test.src') as file:
    test_source = file.readlines()

with open('./ParaSCI/Data/ParaSCI-ACL/test/test.tgt') as file:
    test_target = file.readlines()
    
with open('./ParaSCI/Data/ParaSCI-ACL/val/val.src') as file:
    val_source = file.readlines()

with open('./ParaSCI/Data/ParaSCI-ACL/val/val.tgt') as file:
    val_target = file.readlines()

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_source+train_target+test_source+test_target+val_source+val_target))

from datasets import Dataset, DatasetDict, load_metric
import numpy as np

train_dataset = Dataset.from_dict({
    'id': np.arange(len(train_source)),
    'paraphrase': [{'input': data[0], 'output': data[1]} for data in zip(train_source, train_target)],
})
test_dataset = Dataset.from_dict({
        'id': np.arange(len(test_source)),
    'paraphrase': [{'input': data[0], 'output': data[1]} for data in zip(test_source, test_target)],
})
val_dataset = Dataset.from_dict({
    'id': np.arange(len(val_source)),
    'paraphrase': [{'input': data[0], 'output': data[1]} for data in zip(val_source, val_target)]
})
raw_dataset = DatasetDict()
raw_dataset['train'] = train_dataset
raw_dataset['test'] = test_dataset
raw_dataset['val'] = val_dataset

In [65]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-base', return_tensors="pt")

In [66]:
max_input_length = 64
max_target_length = 64


def preprocess_function(examples):
    inputs = [ex["input"] for ex in examples["paraphrase"]]
    targets = [ex["output"] for ex in examples["paraphrase"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [67]:
ntokens = tokenizer.vocab_size  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [68]:
from transformers import DataCollator

batch_size = 512

In [69]:
train_dataloader = torch.utils.data.DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True
)

In [70]:
test_dataloader = torch.utils.data.DataLoader(
    dataset=tokenized_datasets['test'],
    batch_size=batch_size,
    shuffle=True
)

In [76]:
import copy
import time
from tqdm import tqdm

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
num_batches = len(train_dataloader) / batch_size
def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 20
    start_time = time.time()

    for i,batch in tqdm(enumerate(train_dataloader)):
        sources, mask, targets = batch['input_ids'], batch['attention_mask'], batch['labels']
        output = model(torch.stack(sources), torch.stack(mask))
        loss = criterion(output.view(-1, ntokens), torch.stack(targets).view(-1))

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = cur_loss
            print(f'| epoch {epoch} | {i}/{num_batches} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i,batch in tqdm(enumerate(test_dataloader)):
            sources, mask, targets = torch.stack(batch['input_ids']), torch.stack(batch['attention_mask']), torch.stack(batch['labels'])
            output = model(sources, mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets.view(-1)).item()
    return total_loss / (len(test_dataloader) - 1)


In [None]:
best_val_loss = float('inf')
epochs = 20
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model)
    val_ppl = val_loss
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()


21it [08:06, 21.53s/it]

| epoch 1 | 20/0.111328125 batches | lr 1.79 | ms/batch 24333.47 | loss  2.69 | ppl     2.69


41it [15:30, 21.81s/it]

| epoch 1 | 40/0.111328125 batches | lr 1.79 | ms/batch 22172.27 | loss  2.53 | ppl     2.53


57it [20:56, 22.04s/it]
5it [00:31,  6.36s/it]


-----------------------------------------------------------------------------------------
| end of epoch 1 | time: 1288.10s | valid loss 1787.97 | valid ppl  1787.97
-----------------------------------------------------------------------------------------


21it [17:31, 23.32s/it] 

| epoch 2 | 20/0.111328125 batches | lr 1.70 | ms/batch 52587.18 | loss  2.65 | ppl     2.65


41it [38:55, 21.75s/it] 

| epoch 2 | 40/0.111328125 batches | lr 1.70 | ms/batch 64168.70 | loss  2.53 | ppl     2.53


57it [44:26, 46.79s/it]
5it [00:35,  7.20s/it]


-----------------------------------------------------------------------------------------
| end of epoch 2 | time: 2703.12s | valid loss 1779.74 | valid ppl  1779.74
-----------------------------------------------------------------------------------------


21it [07:40, 21.49s/it]

| epoch 3 | 20/0.111328125 batches | lr 1.62 | ms/batch 23025.37 | loss  2.61 | ppl     2.61


41it [14:59, 21.92s/it]

| epoch 3 | 40/0.111328125 batches | lr 1.62 | ms/batch 21945.72 | loss  2.50 | ppl     2.50


57it [20:39, 21.74s/it]
5it [00:33,  6.62s/it]


-----------------------------------------------------------------------------------------
| end of epoch 3 | time: 1272.69s | valid loss 1771.52 | valid ppl  1771.52
-----------------------------------------------------------------------------------------


21it [07:34, 21.94s/it]

| epoch 4 | 20/0.111328125 batches | lr 1.54 | ms/batch 22702.77 | loss  2.61 | ppl     2.61


41it [14:58, 22.52s/it]

| epoch 4 | 40/0.111328125 batches | lr 1.54 | ms/batch 22224.24 | loss  2.49 | ppl     2.49


57it [20:40, 21.76s/it]
5it [00:38,  7.76s/it]


-----------------------------------------------------------------------------------------
| end of epoch 4 | time: 1279.15s | valid loss 1750.70 | valid ppl  1750.70
-----------------------------------------------------------------------------------------


21it [07:50, 22.10s/it]

| epoch 5 | 20/0.111328125 batches | lr 1.46 | ms/batch 23529.50 | loss  2.60 | ppl     2.60


41it [15:12, 22.18s/it]

| epoch 5 | 40/0.111328125 batches | lr 1.46 | ms/batch 22117.70 | loss  2.47 | ppl     2.47


57it [20:56, 22.04s/it]
5it [00:34,  6.80s/it]


-----------------------------------------------------------------------------------------
| end of epoch 5 | time: 1290.62s | valid loss 1752.40 | valid ppl  1752.40
-----------------------------------------------------------------------------------------


21it [07:55, 22.78s/it]

| epoch 6 | 20/0.111328125 batches | lr 1.39 | ms/batch 23764.15 | loss  2.57 | ppl     2.57


41it [15:38, 22.96s/it]

| epoch 6 | 40/0.111328125 batches | lr 1.39 | ms/batch 23170.19 | loss  2.47 | ppl     2.47


57it [21:26, 22.57s/it]
5it [00:36,  7.36s/it]


-----------------------------------------------------------------------------------------
| end of epoch 6 | time: 1323.47s | valid loss 1754.74 | valid ppl  1754.74
-----------------------------------------------------------------------------------------


21it [07:56, 22.65s/it]

| epoch 7 | 20/0.111328125 batches | lr 1.32 | ms/batch 23833.43 | loss  2.57 | ppl     2.57


41it [17:24, 26.18s/it]

| epoch 7 | 40/0.111328125 batches | lr 1.32 | ms/batch 28386.24 | loss  2.45 | ppl     2.45


57it [23:02, 24.25s/it]
5it [00:33,  6.60s/it]


-----------------------------------------------------------------------------------------
| end of epoch 7 | time: 1415.54s | valid loss 1730.45 | valid ppl  1730.45
-----------------------------------------------------------------------------------------


21it [39:14, 69.04s/it] 

| epoch 8 | 20/0.111328125 batches | lr 1.25 | ms/batch 117724.03 | loss  2.56 | ppl     2.56


31it [1:34:06, 497.56s/it]

In [None]:
test_loss = evaluate(best_model)
test_ppl = test_loss
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)


In [None]:
inp = tokenizer('foobarbazbub')

In [None]:
out = best_model(torch.tensor(inp['input_ids']).unsqueeze(0), torch.tensor(inp['attention_mask']).unsqueeze(0))

In [None]:
tokenizer.decode(out.argmax(dim=2).squeeze())