<a href="https://colab.research.google.com/github/dotsnangles/Vanila-Transformer-Implementation/blob/main/Torch_Modules_for_Transformer_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn

In [None]:
nn.Linear

m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())

torch.Size([128, 30])


In [None]:
torch.reshape

a = torch.arange(4.)
torch.reshape(a, (2, 2))
b = torch.tensor([[0, 1], [2, 3]])
torch.reshape(b, (-1,))

tensor([0, 1, 2, 3])

In [None]:
torch.einsum
torch.matmul

# vector x vector
tensor1 = torch.randn(3)
tensor2 = torch.randn(3)
s = torch.matmul(tensor1, tensor2).size()
m = torch.matmul(tensor1, tensor2)
print(tensor1)
print(tensor2)
print(m)
print('------------------------------------------------------------')

# matrix x vector
tensor1 = torch.randn(3, 4)
tensor2 = torch.randn(4)
s = torch.matmul(tensor1, tensor2).size()
m = torch.matmul(tensor1, tensor2)
print(tensor1)
print(tensor2)
print(m)
print('------------------------------------------------------------')

# batched matrix x broadcasted vector
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(4)
s = torch.matmul(tensor1, tensor2).size()
m = torch.matmul(tensor1, tensor2)
print(tensor1)
print(tensor2)
print(m)
print('------------------------------------------------------------')

# batched matrix x batched matrix
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(10, 4, 5)
s = torch.matmul(tensor1, tensor2).size()
m = torch.matmul(tensor1, tensor2)
print(tensor1)
print(tensor2)
print(m)
print('------------------------------------------------------------')

# batched matrix x broadcasted matrix
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(4, 5)
s = torch.matmul(tensor1, tensor2).size()
m = torch.matmul(tensor1, tensor2)
print(tensor1)
print(tensor2)
print(m)

In [None]:
torch.Tensor.masked_fill_

# Parameters
# mask (BoolTensor) – the boolean mask

# value (float) – the value to fill in with

# masked_fill_demo.py

import torch as T
import numpy as np
device = T.device("cpu")

def my_masker(tsr, msk, v):
  res = tsr.clone()
  for i in range(len(tsr)):
    for j in range(len(tsr[0])):
      if msk[i][j] == 0:
        res[i][j] = v
  return res

print("\nBegin masked_fill() demo ")

data = np.array([[1.0, 2.0, 3.0],
                 [4.0, 5.0, 6.0]], dtype=np.float32)
tsr = T.tensor(data, dtype=T.float32).to(device)

print("\nThe tensor is:")
print(tsr)

msk = np.array([[0, 1, 1],
                [1, 0, 1]], dtype=np.uint8)
msk = T.tensor(msk, dtype=T.uint8)
print("\nThe mask is: ")
print(msk)

T.set_printoptions(precision=1)
result = tsr.masked_fill(msk == 0, 9.9)
print("\nThe result of mask_fill(msk==0, 9.9) is: ")
print(result)

res = my_masker(tsr, msk, 9.9)
print("\nThe result using custom masking function is: ")
print(res)

print("\nEnd demo ")


Begin masked_fill() demo 

The tensor is:
tensor([[1., 2., 3.],
        [4., 5., 6.]])

The mask is: 
tensor([[0, 1, 1],
        [1, 0, 1]], dtype=torch.uint8)

The result of mask_fill(msk==0, 9.9) is: 
tensor([[9.9, 2.0, 3.0],
        [4.0, 9.9, 6.0]])

The result using custom masking function is: 
tensor([[9.9, 2.0, 3.0],
        [4.0, 9.9, 6.0]])

End demo 


In [None]:
torch.softmax

m = nn.Softmax(dim=1)
input = torch.randn(2, 3)
print(input)
output = m(input)
print(output)

tensor([[ 0.7, -0.2, -1.1],
        [-0.8, -0.0,  1.1]])
tensor([[0.6, 0.3, 0.1],
        [0.1, 0.2, 0.7]])


In [86]:
nn.Embedding

# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)

# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
x = embedding(input)

# example with padding_idx
embedding = nn.Embedding(10, 3, padding_idx=0)
input = torch.LongTensor([[0,2,0,5]])
embedding(input)

# example of changing `pad` vector
padding_idx = 0
embedding = nn.Embedding(3, 3, padding_idx=padding_idx)
embedding.weight
with torch.no_grad():
    embedding.weight[padding_idx] = torch.ones(3)
embedding.weight

in tensor([[1, 2, 4, 5],
        [4, 3, 2, 9]])
out tensor([[[-0.6, -1.1, -0.6],
         [ 0.8,  1.0, -0.1],
         [ 2.4,  0.6, -0.2],
         [ 2.2,  0.5, -0.7]],

        [[ 2.4,  0.6, -0.2],
         [-1.4,  0.9,  0.1],
         [ 0.8,  1.0, -0.1],
         [ 1.1, -2.5, -0.8]]], grad_fn=<EmbeddingBackward0>)


Parameter containing:
tensor([[ 1.0,  1.0,  1.0],
        [ 0.5, -1.0,  1.9],
        [ 1.0, -0.6,  0.9]], requires_grad=True)

In [None]:
nn.LayerNorm

# NLP Example
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)

# Activate module
layer_norm(embedding)

# Image Example
N, C, H, W = 20, 5, 10, 10
input = torch.randn(N, C, H, W)
# print(input)

# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
layer_norm = nn.LayerNorm([C, H, W])
output = layer_norm(input)
# print(output)

In [None]:
nn.ReLU

m = nn.ReLU()
input = torch.randn(2)
print(input)
output = m(input)
print(output)

# An implementation of CReLU - https://arxiv.org/abs/1603.05201

# m = nn.ReLU()
# input = torch.randn(2).unsqueeze(0)
# output = torch.cat((m(input),m(-input)))

tensor([-1.0414,  0.3116])
tensor([0.0000, 0.3116])


In [None]:
nn.Dropout

m = nn.Dropout(p=0.2)
input = torch.randn(20, 16)
output = m(input)
print(output)

tensor([[-1.6298, -0.1889, -0.0000,  0.5942, -0.0000,  1.2120,  0.6604, -1.2961,
         -0.2244, -1.5138, -0.0000, -0.3063,  0.7182, -2.2613, -0.1167,  0.8874],
        [-2.1384, -0.8085, -2.2098, -1.2808,  0.2853,  0.0000,  1.3715, -1.3260,
          0.2927, -1.0185,  1.8577,  0.9353,  0.9600,  0.0000, -0.7526, -1.4569],
        [ 0.0000,  0.0000, -1.1167, -0.2852, -1.1784,  1.1591, -0.0000, -0.0599,
          0.0000, -1.7613, -0.7031,  0.5004,  1.0777,  0.5122, -0.5795,  0.0000],
        [ 0.0000,  1.1698,  0.6372,  0.0000, -0.5218,  1.2359,  1.3700, -1.4986,
          1.9762, -0.4013,  0.4246, -1.2062, -0.6150,  0.6805, -0.2744, -0.1185],
        [ 0.0000,  0.2685,  0.5638, -0.0879,  0.0000,  0.7736, -0.6908, -1.2601,
         -0.4170, -0.0000,  0.0076, -0.7303,  2.2090,  0.5476,  0.3615,  1.0004],
        [ 0.0000, -0.0000,  0.0000,  3.6464,  0.0474, -0.1398, -0.8299,  0.4616,
          1.2149,  3.3246, -1.7838,  0.0414, -0.0000, -0.0963,  0.5303, -0.4797],
        [-0.3271, -0.5

In [None]:
nn.ModuleList

# Holds submodules in a list.

# ModuleList can be indexed like a regular Python list, but modules it contains are properly registered, and will be visible by all Module methods.

class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        for i, l in enumerate(self.linears):
            x = self.linears[i // 2](x) + l(x)
        return x

torch.nn.modules.container.ModuleList

In [None]:
torch.arange()

torch.arange(5)
torch.arange(1, 4)
torch.arange(1, 2.5, 0.5)

<function _VariableFunctionsClass.arange>

In [None]:
torch.Tensor.expand

x = torch.tensor([[1], [2], [3]])
x.size()
x.expand(3, 4)
x.expand(-1, 4)   # -1 means not changing the size of that dimension

tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])

In [None]:
x = torch.arange(0, 6)
print(x)
x.expand(3, 6)

tensor([0, 1, 2, 3, 4, 5])


tensor([[0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 4, 5]])

In [None]:
torch.Tensor.unsqueeze

x = torch.tensor([1, 2, 3, 4])
torch.unsqueeze(x, 0)
torch.unsqueeze(x, 1)

tensor([[1],
        [2],
        [3],
        [4]])

In [None]:
torch.tril

a = torch.randn(3, 3)
a
torch.tril(a)

b = torch.randn(4, 6)
b
torch.tril(b, diagonal=1)
torch.tril(b, diagonal=-1)
torch.tril(b, diagonal=0)

tensor([[ 0.2433,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.4033,  0.7351,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9757,  0.6241, -1.5771,  0.0000,  0.0000,  0.0000],
        [-0.0357, -0.7708,  1.1375, -0.9052,  0.0000,  0.0000]])

In [None]:
torch.tril(b, diagonal=0)

tensor([[ 0.2433,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.4033,  0.7351,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9757,  0.6241, -1.5771,  0.0000,  0.0000,  0.0000],
        [-0.0357, -0.7708,  1.1375, -0.9052,  0.0000,  0.0000]])

In [None]:
torch.tril(b, diagonal=1)

tensor([[ 0.2433,  0.5793,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.4033,  0.7351, -0.6321,  0.0000,  0.0000,  0.0000],
        [-0.9757,  0.6241, -1.5771,  1.4008,  0.0000,  0.0000],
        [-0.0357, -0.7708,  1.1375, -0.9052, -0.2328,  0.0000]])

In [None]:
torch.tril(b, diagonal=-1)

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.4033,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9757,  0.6241,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0357, -0.7708,  1.1375,  0.0000,  0.0000,  0.0000]])