# Solution parts

- https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- https://coderzcolumn.com/tutorials/artificial-intelligence/how-to-use-glove-embeddings-with-pytorch

- https://ojs.aaai.org/index.php/AAAI/article/view/12047


In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

In [2]:
# Run everything on the CPU.
# NOTE: `torch.device("cuda") or torch.device("cpu")` is not a usable fallback:
# the left operand is always truthy, so the `or` never reaches the CPU device.
# To pick a GPU conditionally, branch on `torch.cuda.is_available()` instead.
DEVICE = torch.device("cpu")
DEVICE

device(type='cpu')

## Dataset preparation

In [7]:
# Tokenizer shared by every sentence in this notebook; the printed example shows
# that it lower-cases the text and splits punctuation into separate tokens.
tokenizer = get_tokenizer("basic_english")
print(tokenizer("Hello, world?"))

# Pre-trained GloVe vectors, cached under ../data (the first run downloads the
# archive, see the progress log in the cell output).
global_vectors = GloVe(cache="../data")

# Sanity check: one embedding row per token. lower_case_backup=True retries the
# lower-cased token when the original casing is missing from the vocabulary.
global_vectors.get_vecs_by_tokens(tokenizer("Hello, world?"), lower_case_backup=True)

['hello', ',', 'world', '?']


../data\glove.840B.300d.zip: 2.18GB [08:04, 4.50MB/s]                                
100%|█████████▉| 2196016/2196017 [06:17<00:00, 5810.16it/s]


In [22]:
# Toy samples, one tuple per pair: (target sentence, candidate sentence, score).
# The scores used here lie in [0, 1]; presumably 1.0 means "same content" and
# 0.0 means "unrelated" — TODO confirm the intended scale.
data = [
    ("hello, world!", "hi world", 1.0),
    ("hello, world!", "cat eats beetle", 0.0),
    ("sky is blue", "horizon is not red", 0.5),
    ("sky is red", "horizon is not blue", 0.5),
]

In [23]:
class PlagiarismDataset(Dataset):
    """Map-style dataset of scored sentence pairs.

    Args:
        data: Sequence of ``(target, candidate, score)`` tuples, where the
            first two items are sentences and ``score`` is a number.

    Indexing returns ``(target, candidate, score)`` for the requested sample.
    """

    def __init__(self, data: list[tuple[str, str, float]]):
        rows = list(data)

        # Build each column from its own values. Routing the scores through a
        # single mixed-type array would stringify them, and parsing them back
        # as float16 loses precision (0.1 becomes ~0.09998).
        self.targets = np.array([row[0] for row in rows], dtype=str)
        self.candidates = np.array([row[1] for row in rows], dtype=str)
        self.scores = np.array([row[2] for row in rows], dtype=np.float32)

        # Raw 2-D view of the input, kept for callers that read `.data`.
        # An empty input gets an explicit (0, 3) shape so it stays 2-D.
        self.data = np.array(rows) if rows else np.empty((0, 3), dtype=str)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.scores)

    def __getitem__(self, idx):
        """Return ``(target, candidate, score)`` for ``idx``.

        ``idx`` may be a tensor; it is converted to plain Python values before
        indexing the underlying NumPy arrays.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.targets[idx], self.candidates[idx], self.scores[idx]

In [24]:
# Wrap the toy samples and check the size and the first (target, candidate, score) item.
dataset = PlagiarismDataset(data)
print(len(dataset))
print(dataset[0])

4
('hello, world!', 'hi world', 1.0)


In [119]:
def text_pipeline(x):
    """Tokenize ``x`` and return the GloVe vectors looked up for its tokens.

    Uses the notebook-level ``tokenizer`` and ``global_vectors``; tokens missing
    in their original casing are retried lower-cased.
    """
    tokens = tokenizer(x)
    vectors = global_vectors.get_vecs_by_tokens(tokens, lower_case_backup=True)
    return vectors


def collate_batch(batch):
    """Turn a list of ``(target, candidate, score)`` samples into tensors.

    Each sentence is embedded with ``text_pipeline``. The per-sample embeddings
    are concatenated along the first dimension, so with more than one sample
    per batch the sentences end up in one continuous sequence.

    Returns:
        ``(targets, candidates, scores)`` as float tensors moved to ``DEVICE``.
    """
    target_vecs = [text_pipeline(target) for target, _, _ in batch]
    candidate_vecs = [text_pipeline(candidate) for _, candidate, _ in batch]
    raw_scores = [score for _, _, score in batch]

    targets = torch.cat(target_vecs).float()
    candidates = torch.cat(candidate_vecs).float()
    scores = torch.tensor(raw_scores).float()
    return targets.to(DEVICE), candidates.to(DEVICE), scores.to(DEVICE)


# batch_size=1 on purpose: collate_batch concatenates the token vectors of all
# samples in a batch along dim 0, so a larger batch would merge sentences.
# shuffle=False keeps the iteration order equal to the order of `data`.
data_loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_batch,
)
# Peek at the first collated batch: (target vectors, candidate vectors, score).
next(iter(data_loader))

(tensor([[ 0.2523,  0.1018, -0.6748,  ...,  0.1787, -0.5192,  0.3359],
         [-0.0828,  0.6720, -0.1499,  ..., -0.1918, -0.3785, -0.0659],
         [-0.0067,  0.2224,  0.2771,  ...,  0.0594,  0.0014,  0.0987],
         [-0.2655,  0.3353,  0.2186,  ..., -0.1786, -0.0629,  0.1623]]),
 tensor([[ 2.8796e-02,  4.1306e-01, -4.6690e-01, -7.8175e-02,  3.7058e-01,
           1.2867e-01,  4.7714e-01, -9.2372e-01, -6.7789e-02,  6.2381e-01,
          -2.9670e-01, -4.4328e-01, -8.4224e-02, -3.1270e-01, -1.8197e-01,
           3.2360e-01, -7.7793e-02,  1.3314e+00, -1.5676e-01,  1.2857e-01,
           4.3474e-02,  7.9883e-02,  1.1311e-02,  1.4428e-01,  1.7653e-01,
          -2.2321e-01, -4.2480e-02,  2.1707e-03, -4.7640e-02,  3.8532e-01,
          -5.9911e-02,  1.8338e-01, -1.9145e-01, -1.3184e-01, -2.2440e-01,
          -3.4313e-01, -1.9527e-01,  2.0129e-01, -2.8915e-01, -2.0750e-01,
           1.9230e-01, -4.3318e-01, -3.5914e-02, -1.7492e-01,  5.1793e-03,
           4.1998e-01,  1.0637e-01,  1.

## LSTM

In [41]:
# Model sizes. The LSTM input width follows the dimensionality of the GloVe vectors.
EMBED_DIM = global_vectors.dim
LSTM_LAYERS = 2  # number of stacked LSTM layers
HIDDEN_SIZE = EMBED_DIM  # hidden state is as wide as one embedding

In [72]:
# Sentence encoder: a unidirectional, multi-layer LSTM over token embeddings.
# batch_first is left at its default, so inputs are sequence-major.
lstm = nn.LSTM(EMBED_DIM, HIDDEN_SIZE, num_layers=LSTM_LAYERS, bidirectional=False).to(
    DEVICE
)

In [73]:
def lstm_loop(sent, lstm):
    """Encode one sentence and return the LSTM's final hidden state.

    The whole sequence is run through ``lstm`` starting from a zero state, so
    the recurrent state is carried from token to token and the result depends
    on every token rather than only on the last one.

    Args:
        sent: Token embeddings of one sentence, either a tensor whose first
            dimension is the sequence length or an iterable of per-token
            tensors. Each token is flattened to one feature vector.
        lstm: The ``nn.LSTM`` module used as the encoder.

    Returns:
        The final hidden state ``h_n`` for the unbatched sequence, one row per
        layer (and direction). For an empty sentence this is the all-zero
        initial state.
    """
    steps = sent if torch.is_tensor(sent) else list(sent)

    if len(steps) == 0:
        # Nothing to encode: return the zero initial state, shaped and placed
        # like the hidden state the module itself would produce.
        reference = next(lstm.parameters())
        directions = 2 if lstm.bidirectional else 1
        return torch.zeros(
            (lstm.num_layers * directions, lstm.hidden_size),
            dtype=reference.dtype,
            device=reference.device,
        )

    sequence = steps if torch.is_tensor(steps) else torch.stack(steps)
    # One row per token; a 2-D input is treated by nn.LSTM as a single
    # unbatched sequence.
    sequence = sequence.reshape(len(steps), -1)

    # Omitting the initial state makes nn.LSTM start from zeros and thread
    # (h, c) through every time step internally.
    _, (h, _) = lstm(sequence)
    return h

In [None]:
# One by one: feed the sequence a single step at a time, passing the
# (hidden, cell) state returned by each call into the next one.

torch.manual_seed(42)
# 300 features per step — presumably chosen to match EMBED_DIM; TODO confirm.
inputs = [torch.randn(300).to(DEVICE) for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
# Both tensors are 2-D (layers, hidden) because each step below is fed as an
# unbatched sequence of length 1.
h_c = (
    torch.zeros(LSTM_LAYERS, HIDDEN_SIZE).to(DEVICE),
    torch.zeros(LSTM_LAYERS, HIDDEN_SIZE).to(DEVICE),
)
for i in inputs:
    out, h_c = lstm(i.view(1, -1), h_c)
    # First three values of the output, of the first layer's hidden state and
    # of the first layer's cell state after this step.
    print(out[0, :3], h_c[0][0, :3], h_c[1][0, :3])

In [None]:
# All at once: run the full sequence through the LSTM in a single call.

# Arrange the per-step vectors as (sequence length, batch of 1, features).
inputs2 = torch.stack(inputs).unsqueeze(1)

# Zero initial hidden and cell states, one slice per layer for the single batch item.
state_shape = (LSTM_LAYERS, 1, HIDDEN_SIZE)
h_c = tuple(torch.zeros(*state_shape).to(DEVICE) for _ in range(2))

out, hidden = lstm(inputs2, h_c)
# First three features of the output at the last time step.
print(out[-1, 0, :3])

## Regression

In [77]:
class CNet(nn.Module):
    """Regression head: one linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output values; each lies in (0, 1).
        hidden_dim: Accepted for interface compatibility; the current
            architecture does not use it.
    """

    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = 16) -> None:
        super().__init__()

        projection = nn.Linear(input_dim, output_dim)
        squash = nn.Sigmoid()
        self.net = nn.Sequential(projection, squash)

    def forward(self, x):
        """Apply the linear projection and the sigmoid to ``x``."""
        prediction = self.net(x)
        return prediction

In [134]:
# The head consumes the final hidden states of both sentences, flattened:
# 2 sentences x LSTM_LAYERS x HIDDEN_SIZE features, mapped to a single score.
cnet = CNet(2 * LSTM_LAYERS * HIDDEN_SIZE, 1).to(DEVICE)
# Mean squared error between the predicted and the reference score.
loss_fn = F.mse_loss

## Loop

In [133]:
# Learning rate shared by both optimizers below.
LEARNING_RATE = 1e-2

# The encoder and the regression head are optimized by separate Adam instances;
# both must be zeroed and stepped on every training iteration.
optimizer_lstm = optim.Adam(
    lstm.parameters(),
    lr=LEARNING_RATE,
    # eps=1e-3
)

optimizer_cnet = optim.Adam(
    cnet.parameters(),
    lr=LEARNING_RATE,
    # eps=1e-3
)

In [113]:
# One manual training step on a single sample.

# Clear gradients left over from a previous run of this cell.
optimizer_lstm.zero_grad()
optimizer_cnet.zero_grad()

# A fresh iterator over an unshuffled loader always yields the first sample,
# so re-running this cell keeps training on the same pair.
target_sent, candidate_sent, score = next(iter(data_loader))


# Encode both sentences with the same LSTM (shared weights).
target_hidden = lstm_loop(target_sent, lstm)
candidate_hidden = lstm_loop(candidate_sent, lstm)

# Stack the two hidden states along dim 0; they are flattened into one feature
# row just before the regression head.
inp = torch.cat([target_hidden, candidate_hidden]).to(DEVICE)

out = cnet(inp.view(1, -1))

# Drop the trailing output dimension so the prediction matches `score`'s shape.
print(out.squeeze(-1))
print(score)

loss = loss_fn(out.squeeze(-1), score)
print(loss.item())

# Backpropagate through both the head and the encoder, then update each.
loss.backward()

optimizer_lstm.step()
optimizer_cnet.step()

tensor([1.0000], grad_fn=<SqueezeBackward1>)
tensor([1.])
1.2789769243681803e-13


  target_list.append(torch.tensor(text_pipeline(_target)))
  candidate_list.append(torch.tensor(text_pipeline(_candidate)))
