In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import math

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [3]:
dropout_prob = 0.2
class Embedding(nn.Module):

  def __init__(self, vocab_size, embed_size):
    super().__init__()
    self.token_embeddings = nn.Embedding(vocab_size, embed_size)
  
  def forward(self, input):
    out = self.token_embeddings(input)
    # print(out.shape)
    return out

class PositionalEmbedding(nn.Module):
  """For every token in context length we will generate a positional encoding"""

  def __init__(self, context_len, embed_size):
    super().__init__()
    pos_encoding = torch.zeros(context_len, embed_size)
    for pos in range(context_len):
      for i in range(0,embed_size,2):
        pos_encoding[pos,i] = math.sin(pos / (10000 ** ((2 * i)/embed_size)))
        pos_encoding[pos, i+1] = math.cos(pos / (10000 ** ((2 * (i + 1))/embed_size)))
    pos_encoding = pos_encoding.unsqueeze(dim=0)
    pos_encoding = pos_encoding
    # self.dropout = nn.Dropout(dropout_prob)
    self.register_buffer('pe',pos_encoding)
  
  def forward(self, input):
    context_len = input.shape[1]
    # print(context_len)
    # print(input.shape)
    # print(self.pe[:,:context_len].shape)
    input = input + self.pe[:,:context_len,:]
    # input = input + torch.autograd.Variable(self.pe[:,:context_len],requires_grad=False)
    return input
#------self attention with single head----------#
class Head(nn.Module):

  def __init__(self,head_size,embed_size,context_length):
    super().__init__()

    self.query = nn.Linear(embed_size, head_size, bias=False)
    self.key = nn.Linear(embed_size, head_size, bias=False)
    self.value = nn.Linear(embed_size, head_size, bias=False)
    self.dropout = nn.Dropout(dropout_prob)
    self.register_buffer('tril',torch.tril(torch.ones(context_length, context_length)))
  
  def forward(self, q, k, v, pad_mask=None):
    batch_size, context_len, embed_size = q.shape
    q = self.query(q)
    k = self.key(k)
    v = self.value(v)
    # print(q.shape, k.shape, v.shape)
    weights = q @ k.transpose(-2,-1) * embed_size**-0.5
    if pad_mask is not None:
      weigths = weights.masked_fill(pad_mask[:,None, None, :] == 0, float('-inf'))
    #--this is the causal mask--#
    weights = weights.masked_fill(self.tril[:context_len,:context_len]==0, float('-inf'))
    # print(weights[0])
    weights = F.softmax(weights, dim=-1)
    weights = self.dropout(weights)
    out = weights @ v
    return out
#-----multihead attention------#
class MultiHeadAttention(nn.Module):

  def __init__(self, num_heads, head_size,embed_size,context_length):
    super().__init__()
    self.multiheads = nn.ModuleList([Head(head_size,embed_size,context_length) for _ in range(num_heads)])
    self.projection = nn.Linear(embed_size, embed_size)
    self.dropout = nn.Dropout(dropout_prob)
  
  def forward(self,q,k,v,pad_mask=None):
    out = torch.cat([h(q,k,v,pad_mask) for h in self.multiheads], dim = -1)
    return self.dropout(self.projection(out))
#---------Feedforward--------------#
class FeedForward(nn.Module):
  """Self attention while calculates the interactions among the tokens the feedforward will train the model on
  individual tokens and try to extract the information individually"""
  def __init__(self,embed_size):
    super().__init__()
    self.neural_net = nn.Sequential(
        nn.Linear(embed_size, 4 * embed_size), # multiplying by 4 as per the paper 'attention is all you need', this expands the hidden layer
        nn.GELU(),
        nn.Linear(4 * embed_size, embed_size), #--projection layer 
        nn.Dropout(dropout_prob)
    )
  def forward(self, x):
    return self.neural_net(x)




In [4]:
class DecoderBlock(nn.Module):

  def __init__(self, num_heads, embed_size,context_length):
    super().__init__()
    head_size = embed_size // num_heads
    self.masked_multiheads = MultiHeadAttention(num_heads, head_size,embed_size,context_length)
    self.multiheads = MultiHeadAttention(num_heads, head_size,embed_size,context_length)
    self.feedforward = FeedForward(embed_size)
    self.ln1 = nn.LayerNorm(embed_size)
    self.ln2 = nn.LayerNorm(embed_size)
    self.ln3 = nn.LayerNorm(embed_size)
    self.dropout1 = nn.Dropout(dropout_prob)
    self.dropout2 = nn.Dropout(dropout_prob)
    self.dropout3 = nn.Dropout(dropout_prob)

  def forward(self,input,pad_mask=None):
    #adding residual connection
    input = input + self.dropout1(self.masked_multiheads(self.ln1(input),self.ln1(input),self.ln1(input), pad_mask = pad_mask))
    # targets += self.dropout1(self.masked_multiheads(self.ln1(targets),self.ln1(targets),self.ln1(targets), mask = True))
    # if out_from_encoder is not None:
    #   targets += self.dropout2(self.multiheads(self.ln2(targets),self.ln2(out_from_encoder),self.ln2(out_from_encoder),mask=True))
    out  = input + self.dropout3(self.feedforward(self.ln3(input)))
    return out

class Decoder(nn.Module):

  def __init__(self, num_blocks, context_length, embed_size, num_heads, head_size, vocab_size):
    super().__init__()
    self.embeddings = Embedding(vocab_size,embed_size)
    self.position_embeddings = PositionalEmbedding(context_length, embed_size)
    self.decoder_blocks = nn.Sequential(*[DecoderBlock(num_heads, embed_size,context_length) for _ in range(num_blocks)])
    self.ln1 = nn.LayerNorm(embed_size)
    self.linear_layer = nn.Linear(embed_size, vocab_size)
  
  def forward(self, input, pad_mask = None):
    embed_output = self.embeddings(input)
    pos_out = self.position_embeddings(embed_output) #----these are our inputs to the block
    for block in self.decoder_blocks:
      pos_out = block(pos_out, pad_mask)
    out = self.ln1(pos_out)
    out = self.linear_layer(out)
    # out_probs = F.softmax(self.linear_layer(pos_out), dim = -1)
    return out

    




# Testing the decoder

In [5]:
num_blocks, context_length, embed_size, num_heads, head_size, vocab_size, batch_size,num_clases = 2, 1024, 64, 4, 16, 20000, 64,5

In [6]:
model = Decoder(num_blocks, context_length, embed_size, num_heads, head_size, vocab_size)

In [9]:
model.to(device)

Decoder(
  (embeddings): Embedding(
    (token_embeddings): Embedding(20000, 64)
  )
  (position_embeddings): PositionalEmbedding()
  (decoder_blocks): Sequential(
    (0): DecoderBlock(
      (masked_multiheads): MultiHeadAttention(
        (multiheads): ModuleList(
          (0-3): 4 x Head(
            (query): Linear(in_features=64, out_features=16, bias=False)
            (key): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (multiheads): MultiHeadAttention(
        (multiheads): ModuleList(
          (0-3): 4 x Head(
            (query): Linear(in_features=64, out_features=16, bias=False)
            (key): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_feature

In [10]:
x = torch.randint(0,20000,(8,512))
x = x.to(device)

In [11]:
mask = torch.ones((8,512))
mask[:,256:] = 0
mask = mask.to(device)

In [12]:
y = model(x, mask)

In [13]:
y.shape

torch.Size([8, 512, 20000])

In [None]:
!pip install transformers datasets

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
from datasets import load_dataset
raw_dataset = load_dataset("glue","sst2")

In [None]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True)
tokenized_datasets = raw_dataset.map(tokenize_fn,batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [None]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence","idx","label"])

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(tokenized_datasets['train'],
                          shuffle = True,
                          batch_size = 32,
                          collate_fn = data_collator)

In [None]:
for batch in train_loader:
  for k, v in batch.items():
    print(k,v.shape)
  break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids torch.Size([32, 55])
attention_mask torch.Size([32, 55])


In [None]:
tokenizer.pad_token_id

0

In [None]:
model = Decoder(num_blocks = 4, 
                embed_size = 512, 
                num_heads = 8, 
                head_size=16, 
                vocab_size=tokenizer.vocab_size,
                context_length=tokenizer.max_model_input_sizes[checkpoint])
model.to(device)

Decoder(
  (embeddings): Embedding(
    (token_embeddings): Embedding(28996, 512)
  )
  (position_embeddings): PositionalEmbedding()
  (decoder_blocks): Sequential(
    (0): DecoderBlock(
      (masked_multiheads): MultiHeadAttention(
        (multiheads): ModuleList(
          (0-7): 8 x Head(
            (query): Linear(in_features=512, out_features=64, bias=False)
            (key): Linear(in_features=512, out_features=64, bias=False)
            (value): Linear(in_features=512, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (multiheads): MultiHeadAttention(
        (multiheads): ModuleList(
          (0-7): 8 x Head(
            (query): Linear(in_features=512, out_features=64, bias=False)
            (key): Linear(in_features=512, out_features=64, bias=False)
            (value): Linear(in

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.AdamW(model.parameters())

In [None]:
from datetime import datetime
import numpy as np
# model training

def train(model, criterion, optimizer, train_loader, epochs):
  train_losses = np.zeros(epochs)
  # test_losses = np.zeros(epochs)

  for i in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    n_train = 0
    for batch in train_loader:
      batch = {k: v.to(device) for k , v in batch.items()}
      optimizer.zero_grad()
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets, shifts=-1, dims=1)
      targets[:,-1] = tokenizer.pad_token_id
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs.transpose(2,1), targets)
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())
    train_loss = np.mean(train_loss) #average loss

    # model.eval()
    # test_loss = 0
    # n_test = 0
    # for batch in valid_loader:
    #    batch = {k: v.to(device) for k , v in batch.items()}
    #    outputs = model(batch['input_ids'], batch['attention_mask'])
    #    loss = criterion(outputs, batch['labels'])
    #    test_loss += loss.item()*batch['input_ids'].size(0)
    #    n_test += batch['input_ids'].size(0)
    # test_loss = test_loss/n_test #average loss
    train_losses[i] = train_loss
    # test_losses[i] = test_loss

    duration = datetime.now() - t0
    print(f"Epoch {i+1}/{epochs}, \
    Train Loss : {train_loss:.4f}, \
    Duration: {duration}")
  return train_losses

In [None]:
train_losses = train(
    model,
    criterion,
    optimizer,
    train_loader,
    epochs=15
)

Epoch 1/15,     Train Loss : 4.7669,     Duration: 0:01:41.757727
Epoch 2/15,     Train Loss : 3.2761,     Duration: 0:01:41.396047
Epoch 3/15,     Train Loss : 2.5829,     Duration: 0:01:41.460361
Epoch 4/15,     Train Loss : 2.2159,     Duration: 0:01:42.170310
Epoch 5/15,     Train Loss : 2.0006,     Duration: 0:01:41.544539
Epoch 6/15,     Train Loss : 1.8538,     Duration: 0:01:41.895752
Epoch 7/15,     Train Loss : 1.7507,     Duration: 0:01:41.546609
Epoch 8/15,     Train Loss : 1.6694,     Duration: 0:01:41.817958
Epoch 9/15,     Train Loss : 1.6087,     Duration: 0:01:41.712565
Epoch 10/15,     Train Loss : 1.5590,     Duration: 0:01:41.943966
Epoch 11/15,     Train Loss : 1.5179,     Duration: 0:01:42.014242
Epoch 12/15,     Train Loss : 1.4837,     Duration: 0:01:41.843878
Epoch 13/15,     Train Loss : 1.4519,     Duration: 0:01:41.737299
Epoch 14/15,     Train Loss : 1.4285,     Duration: 0:01:41.832658
Epoch 15/15,     Train Loss : 1.4051,     Duration: 0:01:41.962416


In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
chars = sorted(list(set(text)))
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
def encode(text):
  return [stoi[ch] for ch in text]
def decode(nums):
  return ''.join([itos[i] for i in nums])
vocab_size = len(chars)
data = torch.tensor(encode(text), dtype= torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
def get_batch(split):
  data = train_data if split == 'train' else val_data
  # generates 4 random starting indexes from 0 to len(data)-block_size
  ix = torch.randint(len(data)-context_length, (batch_size,))
  # stack all the data of contextual block size 8 on top of one another
  x = torch.stack([data[i:i+context_length] for i in ix])
  y = torch.stack([data[i+1:i+context_length+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x,y 

In [None]:
valid_loader = DataLoader(tokenized_datasets['validation'],
                          batch_size = 32,
                          collate_fn = data_collator)

In [None]:
model.eval()
for batch in valid_loader:
  batch = {k: v.to(device) for k, v in batch.items()}
  outputs = model(batch['input_ids'], batch['attention_mask'])
  break


    



In [None]:
outputs.shape

torch.Size([32, 51, 28996])

In [None]:
torch.argmax(outputs, axis=-1).shape

torch.Size([32, 51])

In [None]:
prdiction_ids = torch.argmax(outputs, axis=-1)

In [None]:
tokenizer.decode(prdiction_ids[0])

"a's a pretty, funny funny story. [SEP] with of of a a a a a a a a a a a a a a a a a a a a a a apparent apparent apparent apparent a a a........"

In [None]:
tokenizer.decode(batch['input_ids'][0])

"[CLS] it's a charming and often affecting journey. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

# generator

In [None]:
prompt = " it is"
tokenized_prompt = tokenizer(prompt, return_tensors='pt')
tokenized_prompt

{'input_ids': tensor([[ 101, 1122, 1110,  102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [None]:
outputs = model(
    tokenized_prompt['input_ids'][:, :-1].to(device),
    tokenized_prompt['attention_mask'][:,:-1].to(device)
)
outputs.shape

torch.Size([1, 3, 28996])

In [None]:
prediction_ids = torch.argmax(outputs[:,-1,:], axis=-1)

In [None]:
tokenizer.decode(prediction_ids[0])

'a'

In [None]:
prompt = " it is a hi ##lar in"
tokenized_prompt = tokenizer(prompt, return_tensors='pt')
tokenized_prompt

{'input_ids': tensor([[  101,  1122,  1110,   170, 20844,   108,   108,  2495,  1197,  1107,
           102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
outputs = model(
    tokenized_prompt['input_ids'][:, :-1].to(device),
    tokenized_prompt['attention_mask'][:,:-1].to(device)
)
outputs.shape

torch.Size([1, 10, 28996])

In [None]:
prediction_ids = torch.argmax(outputs[:,-1,:], axis=-1)
tokenizer.decode(prediction_ids[0])

'a'