In [None]:
import math
import torch
from torch import nn
import json
import numpy as np

class TransformerModel(nn.Module):
    """Causally masked Transformer language model built on ``nn.TransformerEncoder``.

    Token ids are embedded and scaled by ``sqrt(d_model)``, positional
    encodings are added, the result goes through ``nlayers`` batch-first
    encoder layers and is finally projected to ``ntoken`` logits per position.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Arguments:
            ntoken: vocabulary size (embedding rows and number of output logits)
            d_model: embedding / hidden width
            nhead: number of attention heads per encoder layer
            d_hid: width of the feed-forward sub-layer
            nlayers: number of stacked encoder layers
            dropout: dropout probability used by the positional encoding and the encoder layers
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(num_embeddings=ntoken, embedding_dim=d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output projection; zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        """
        Arguments:
            src: Tensor of token ids, shape ``[batch_size, seq_len]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``. When omitted, a square
                causal mask is generated (masked positions are ``-inf``, the
                others ``0.0``).

        Returns:
            output Tensor of shape ``[batch_size, seq_len, ntoken]``
        """
        src = src.to(torch.long)
        src = self.embedding(src) * math.sqrt(self.d_model)
        # The positional encoder indexes positions along dim 0 (sequence-first),
        # while everything else here is batch-first. Feeding it the batch-first
        # tensor directly would add one encoding per *batch row*, constant over
        # the sequence, so swap the axes around the call.
        # NOTE: checkpoints trained before this fix saw the per-batch-row
        # encoding; they still load but may need further training.
        src = self.pos_encoder(src.transpose(0, 1)).transpose(0, 1).contiguous()
        if src_mask is None:
            # Build the mask on the input's own device rather than relying on a
            # notebook-level ``device`` variable being defined.
            src_mask = nn.Transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    The table is stored in the buffer ``pe`` with shape ``[max_len, 1, d_model]``:
    even feature indices hold sines, odd feature indices hold cosines.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Arguments:
            d_model: feature width of the tensors that will be encoded (even or odd)
            dropout: dropout probability applied after the encoding is added
            max_len: number of positions for which encodings are precomputed
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd slot than even slot, so drop
        # the surplus frequency instead of failing with a shape mismatch.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``.
                Positions are taken from dim 0, so a batch-first tensor must be
                transposed before it is passed in.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)} of the positional encoding"
            )
        # pe[:seq_len] is [seq_len, 1, d_model] and broadcasts over the batch axis.
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [None]:
import tqdm
from tqdm import tqdm


# Data location. The Colab/Drive folder is kept for reference and then
# overridden, so files are read relative to the current working directory.
root = 'drive/MyDrive/Colab_Notebooks/Torch/'
root = ''
filepath = f'{root}numerized_dataset512.json'

# Run on the GPU when one is available; token tensors are built as int64.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.long

# hyperparameters
BATCH_SIZE = 32
EPOCHS = 100
LEARNING_RATE = 1e-5
LOSS_EVERY = 100

# get vocab size: every entry of the token map, plus
# +1 for padding token, +1 for start token
with open(f'{root}token_map.json', 'r') as f:
    token_map = json.load(f)
VOCAB_SIZE = len(token_map) + 2
print(VOCAB_SIZE)
# SEQ_LENGTH: to change seq_length, run preprocess_dataset with desired SEQ_LENGTH

# load dataset
def load_dataset(filepath):
    '''
    Read a JSON dataset file and keep the first element of every sequence pair.

    :param filepath: path to a JSON file holding 'train', 'val' and 'test' lists of pairs
    :return: each train/val/test data is an array of sequences [[...], [...] ...]
    '''
    with open(filepath, 'r') as handle:
        splits = json.load(handle)

    def first_of_each(split_name):
        # Only element 0 of each pair is used.
        return [pair[0] for pair in splits[split_name]]

    train_data = first_of_each('train')
    val_data = first_of_each('val')
    test_data = first_of_each('test')
    return train_data, val_data, test_data

# Holds the lazily created default optimizer together with the model and
# learning rate it was built for, so it can be rebuilt when either changes.
_OPTIMIZER_CACHE = {}


def _default_optimizer():
    """Return an Adam optimizer for the global ``model``, reusing it between calls."""
    cache = _OPTIMIZER_CACHE
    if cache.get('model') is not model or cache.get('lr') != LEARNING_RATE:
        cache['model'] = model
        cache['lr'] = LEARNING_RATE
        cache['optimizer'] = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    return cache['optimizer']


def train_batch(batch, targets, optimizer=None):
    '''
    Run one optimisation step of the global ``model`` on a single batch.

    :param batch: sequence shifted right, shape (batch_size, seq_length)
    :param targets: unshifted sequence, shape (batch_size, seq_length)
    :param optimizer: optimizer to step; when omitted, a shared Adam optimizer
        (lr=LEARNING_RATE) is created once and reused on later calls
    :return: (loss, accuracy) - the mean cross-entropy of the batch, detached
        from the autograd graph, and the fraction of correctly predicted tokens
    '''
    # CrossEntropyLoss expects input (N, num_classes, d1) and target (N, d1),
    # so the class dimension of the logits has to be moved to dim 1. With the
    # default reduction it already returns a scalar mean.
    loss_fn = nn.CrossEntropyLoss()
    if optimizer is None:
        # Adam keeps running moment estimates per parameter; building a new
        # optimizer for every batch would throw them away on each step.
        optimizer = _default_optimizer()

    logits = model(batch) # logits have shape (batch_size, seq_length, vocab_size)
    loss = loss_fn(logits.permute(0, 2, 1), targets)

    accuracy = torch.eq(logits.argmax(dim=2, keepdim=False), targets).float().sum() / targets.numel()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Detach so callers can accumulate the value without holding on to the graph.
    return loss.detach(), accuracy

def get_input_sequences(batch):
    '''
    Build the model inputs for a batch of target sequences.

    :param batch: shape (batch_size, seq_length)
    Every token id is lowered by one, the last token of each sequence is
    dropped and a start token (VOCAB_SIZE-1) is prepended, so the result has
    the same shape as the input. The returned array is floating point.
    '''
    tokens = np.array(batch)
    tokens -= 1
    n_rows, _ = tokens.shape
    # One start-token column, built as floats like the rest of the result.
    start_column = np.full((n_rows, 1), VOCAB_SIZE - 1, dtype=np.float64)
    # Shift right: start token first, then everything except the final token.
    return np.hstack((start_column, tokens[:, :-1]))

#==============================================================================================================

def _batch_tensors(batch):
    """Turn a list of raw sequences into (inputs, targets) tensors on ``device``."""
    inputs = torch.tensor(get_input_sequences(batch), dtype=dtype, device=device)
    targets = torch.tensor((np.array(batch)-1), dtype=dtype, device=device)
    return inputs, targets


def _evaluate(sequences):
    """Return (mean loss, mean token accuracy) over the full batches of ``sequences``.

    Runs the global ``model`` in eval mode without gradients. Both values are
    NaN when ``sequences`` holds fewer than BATCH_SIZE items.
    """
    n_batches = len(sequences)//BATCH_SIZE
    if n_batches == 0:
        return float('nan'), float('nan')

    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0.0
    total_accuracy = 0.0
    model.eval()
    with torch.no_grad():
        for j in range(n_batches):
            inputs, targets = _batch_tensors(sequences[j * BATCH_SIZE:(j + 1) * BATCH_SIZE])
            logits = model(inputs)  # logits have shape (batch_size, seq_length, vocab_size)
            # CrossEntropyLoss wants the class dimension at dim 1.
            total_loss += loss_fn(logits.permute(0, 2, 1), targets).item()
            total_accuracy += (torch.eq(logits.argmax(dim=2, keepdim=False), targets).float().sum()
                               / targets.numel()).item()
    return total_loss / n_batches, total_accuracy / n_batches


train_data, val_data, test_data = load_dataset(filepath)
num_iterations = len(train_data)//(BATCH_SIZE)
EPOCH = 15  # epoch number of the checkpoint that training resumes from
model = TransformerModel(ntoken=VOCAB_SIZE, d_model=512, nhead=4, d_hid=1024, nlayers=4, dropout=0.1)
model.to(device)
# Map the checkpoint onto whichever device was selected, so the notebook also
# runs on machines without CUDA.
state_dict = torch.load(f'test8_epoch{EPOCH}_sd.pth', map_location=device)
model.load_state_dict(state_dict)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(num_params)

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    epoch_accuracy = 0.0
    progress = tqdm(range(num_iterations), desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False)
    for i in progress:
        batch = train_data[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        inputs, targets = _batch_tensors(batch)
        batch_loss, batch_accuracy = train_batch(inputs, targets)

        # .item() keeps plain floats, so no autograd graph is retained per batch.
        epoch_loss += batch_loss.item()
        epoch_accuracy += batch_accuracy.item()

        if i % LOSS_EVERY == 0 and i > 0:
            progress.set_description(
                f'Epoch {epoch+1}/{EPOCHS} loss: {epoch_loss/(i+1):.4f} acc: {epoch_accuracy/(i+1):.4f}')

    # Average over the whole epoch. max() guards the no-full-batch case.
    train_loss = epoch_loss / max(num_iterations, 1)
    train_accuracy = epoch_accuracy / max(num_iterations, 1)

    # get validation stats on epoch end
    val_loss, val_accuracy = _evaluate(val_data)

    print(f'\nEpoch {epoch+1}/{EPOCHS}. loss:{train_loss:.4f}; val loss: {val_loss:.4f}; acc: {train_accuracy:.4f}; val acc: {val_accuracy:.4f}')
    # save model; the file name continues the epoch count of the loaded checkpoint
    model_path = f'epoch{epoch+1+EPOCH}_sd.pth'
    torch.save(model.state_dict(), model_path)

417
8838561


  0%|          | 0/972 [00:00<?, ?it/s]


Epoch 1/100. loss:1.7151; val loss: 1.6134; acc: 0.5247; val acc: 0.5496

Epoch 2/100. loss:1.7142; val loss: 1.6145; acc: 0.5248; val acc: 0.5491

Epoch 3/100. loss:1.7140; val loss: 1.6132; acc: 0.5249; val acc: 0.5498

Epoch 4/100. loss:1.7136; val loss: 1.6131; acc: 0.5251; val acc: 0.5493

Epoch 5/100. loss:1.7128; val loss: 1.6114; acc: 0.5252; val acc: 0.5504

Epoch 6/100. loss:1.7124; val loss: 1.6119; acc: 0.5253; val acc: 0.5503

Epoch 7/100. loss:1.7117; val loss: 1.6113; acc: 0.5253; val acc: 0.5501

Epoch 8/100. loss:1.7114; val loss: 1.6101; acc: 0.5255; val acc: 0.5507

Epoch 9/100. loss:1.7107; val loss: 1.6104; acc: 0.5257; val acc: 0.5499

Epoch 10/100. loss:1.7102; val loss: 1.6101; acc: 0.5258; val acc: 0.5501

Epoch 11/100. loss:1.7094; val loss: 1.6094; acc: 0.5259; val acc: 0.5508

Epoch 12/100. loss:1.7090; val loss: 1.6091; acc: 0.5260; val acc: 0.5503

Epoch 13/100. loss:1.7085; val loss: 1.6094; acc: 0.5261; val acc: 0.5502

Epoch 14/100. loss:1.7079; val lo

In [None]:

# Rebuild the model and load a saved checkpoint for sampling.
model = TransformerModel(ntoken=VOCAB_SIZE, d_model=512, nhead=4, d_hid=1024, nlayers=4, dropout=0.1)
model.to(device)
# Map onto the selected device so this also works without CUDA.
state_dict = torch.load('test4_epoch21_sd.pth', map_location=device)
model.load_state_dict(state_dict)

# Start from the start token (VOCAB_SIZE-1, the same id the training inputs
# are prefixed with) and sample 50 tokens autoregressively.
inputs = [[VOCAB_SIZE - 1]]
outputs = []
model.eval()
with torch.no_grad():
  for _ in range(50):
    pred_logits = model(torch.tensor(inputs, dtype=torch.long, device=device))
    # Only the logits of the last position predict the next token.
    pred_probs = torch.softmax(pred_logits[0, -1], dim=-1)
    # .item() turns the sampled index into a plain int so the prompt list
    # stays convertible with torch.tensor on the next iteration.
    pred_index = torch.multinomial(pred_probs, num_samples=1).item()
    inputs[0].append(pred_index)
    outputs.append(pred_index)
  print(outputs)


In [None]:
import numpy as np
import torch
def create_mask(type, d, device=None):
  '''
  Build a 0/1 mask; entry [i][j] is 1 where the listed condition holds.

  :param type {1, 2, 3, 4}
  type 1: for input layer    - shape (2d, d), 1 where 2*j + 1 < i
  type 2: for hidden layers  - shape (d, d),  1 where j <= i (lower triangular)
  type 3: for output layer   - shape (d, 2d), 1 where j <= 2*i + 1
  type 4: matrix for converting x -> y - shape (2d, d), 1 where 2*j + 1 == i
  :param d: base dimension of the mask
  :param device: device for the tensor masks (types 1-3). When omitted, a
    module-level ``device`` variable is used if one exists, otherwise the CPU.
  :return: float32 torch tensor for types 1-3, integer numpy array for type 4
  :raises ValueError: if ``type`` is not one of 1, 2, 3, 4
  '''
  if device is None:
    # Do not fail with a NameError when no global ``device`` has been defined.
    device = globals().get('device', 'cpu')

  def index_grid(n_rows, n_cols):
    # Column vector of row indices and row vector of column indices; comparing
    # them broadcasts to the full (n_rows, n_cols) grid.
    return torch.arange(n_rows).unsqueeze(1), torch.arange(n_cols).unsqueeze(0)

  if type == 1:
    i, j = index_grid(2 * d, d)
    keep = 2 * j + 1 < i
  elif type == 2:
    i, j = index_grid(d, d)
    keep = j <= i
  elif type == 3:
    i, j = index_grid(d, 2 * d)
    keep = j <= 2 * i + 1
  elif type == 4:
    i, j = index_grid(2 * d, d)
    # 2*j + 1 == i already implies that i is odd and 2*j <= i.
    return (2 * j + 1 == i).numpy().astype(int)
  else:
    raise ValueError(f"unknown mask type {type!r}; expected 1, 2, 3 or 4")

  return keep.to(torch.float32).to(device)

create_mask(2, 6)

[[1 0 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 1 0 0 0]
 [1 1 1 1 0 0]
 [1 1 1 1 1 0]
 [1 1 1 1 1 1]]


NameError: name 'device' is not defined

In [None]:
import torch
# Scratch check: show a small integer range, a slice of it, and its dtype.
x = torch.arange(5)
print(x, x[2:4], x.dtype, sep='\n')

tensor([0, 1, 2, 3, 4])
tensor([2, 3])
torch.int64
