In [None]:
# Name: Devashish Mayur Potnis
# Class: BE-AIML

In [5]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask):
        batch_size = query.shape[0]

        # Linear transformations
        Q = self.query(query)
        K = self.key(key)
        V = self.value(value)

        # Split into heads
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled Dot-Product Attention
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))

        attention = torch.nn.functional.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)

        # Reshape and concatenate
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.d_model)

        # Final linear layer
        x = self.fc_out(x)

        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Self-attention
        attention = self.self_attention(x, x, x, mask)
        x = x + self.dropout(attention)
        x = self.layer_norm1(x)

        # Feedforward
        ffn_output = self.ffn(x)
        x = x + self.dropout(ffn_output)
        x = self.layer_norm2(x)

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, n_layers, max_seq_length, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, n_heads, ff_dim, dropout)
            for _ in range(n_layers)
        ])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.position_embedding = nn.Embedding(max_seq_length, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        positions = torch.arange(0, x.size(1)).expand(x.size(0), x.size(1)).to(self.device)
        x = x + self.position_embedding(positions)
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        return x

# Example usage:
d_model = 512
n_heads = 8
ff_dim = 2048
n_layers = 6
max_seq_length = 100
dropout = 0.1

# Create transformer encoder
transformer_encoder = TransformerEncoder(d_model, n_heads, ff_dim, n_layers, max_seq_length, dropout)

# Dummy input
input_data = torch.rand((16, 100, d_model))

# Mask for padding
padding_mask = (input_data.sum(dim=-1) != 0).unsqueeze(1).unsqueeze(2)

# Forward pass
output_data = transformer_encoder(input_data, padding_mask)
print(output_data)
print("Output shape:", output_data.shape)


tensor([[[-8.7619e-02,  2.0156e-01,  2.8862e-01,  ..., -1.2939e+00,
           5.1404e-01,  1.4258e+00],
         [-3.0341e-02,  8.7924e-01, -9.7820e-01,  ...,  1.2920e+00,
           2.9295e-01, -1.3277e+00],
         [-1.5473e+00, -1.0971e-01,  4.6799e-01,  ...,  6.9329e-02,
           9.3166e-01, -1.7259e+00],
         ...,
         [ 1.8971e-01, -1.0820e+00, -1.0425e+00,  ...,  8.1433e-01,
           3.5505e-01, -1.1333e+00],
         [-7.5087e-01, -2.2452e-02, -9.4044e-01,  ...,  9.3356e-01,
           4.1222e+00, -8.2847e-01],
         [-1.1959e+00, -3.3969e-02,  1.0555e+00,  ..., -1.1676e+00,
           1.1279e+00, -1.2758e+00]],

        [[-1.0027e+00, -6.4236e-01, -4.9098e-01,  ..., -2.2374e+00,
           2.7833e-01,  1.6787e+00],
         [ 2.9975e-01,  3.5225e-01,  2.0555e-01,  ...,  5.5566e-01,
           4.9051e-01, -1.9905e+00],
         [-1.3381e+00,  1.5946e-01,  4.2815e-02,  ..., -2.7201e-01,
           1.3507e+00, -1.8895e+00],
         ...,
         [-8.8799e-01, -1

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        # Define linear layers for queries, keys, and values
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)

        # Linear layer for the output of the attention heads
        self.linear_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # Linear transformation for queries, keys, and values
        Q = self.linear_q(query)
        K = self.linear_k(key)
        V = self.linear_v(value)

        # Split into multiple heads
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim)

        # Transpose to prepare for attention calculation
        Q = Q.permute(0, 2, 1, 3)
        K = K.permute(0, 2, 1, 3)
        V = V.permute(0, 2, 1, 3)

        # Calculate scaled dot-product attention
        scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention = F.softmax(scores, dim=-1)
        x = torch.matmul(attention, V)

        # Concatenate attention heads
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.d_model)

        # Linear transformation for the output
        x = self.linear_out(x)

        return x


class FeedForward(nn.Module):
    def __init__(self, d_model, ff_hidden_dim):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, ff_hidden_dim)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(ff_hidden_dim, d_model)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class TransformerLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_hidden_dim):
        super(TransformerLayer, self).__init__()

        # Multi-Head Self-Attention
        self.self_attention = MultiHeadAttention(d_model, n_heads)

        # Feedforward Neural Network
        self.feedforward = FeedForward(d_model, ff_hidden_dim)

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask):
        # Multi-Head Self-Attention
        attention_output = self.self_attention(x, x, x, mask)
        x = x + self.dropout(attention_output)
        x = self.norm1(x)

        # Feedforward Neural Network
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm2(x)

        return x


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, ff_hidden_dim, n_layers):
        super(Transformer, self).__init__()

        # Stack multiple transformer layers
        self.layers = nn.ModuleList([TransformerLayer(d_model, n_heads, ff_hidden_dim) for _ in range(n_layers)])

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x


# Example usage
d_model = 512
n_heads = 8
ff_hidden_dim = 2048
n_layers = 6

# Create transformer model
transformer = Transformer(d_model, n_heads, ff_hidden_dim, n_layers)

# Dummy input
input_data = torch.rand((32, 10, d_model))  # (batch_size, sequence_length, d_model)

# Mask for padding tokens
mask = (input_data != 0).unsqueeze(1).unsqueeze(2)  # Assuming 0 represents padding tokens

# Forward pass
output = transformer(input_data, mask)
print("Output shape:", output.shape)


RuntimeError: The size of tensor a (512) must match the size of tensor b (10) at non-singleton dimension 4

In [None]:
# Definition: A deep learning model architecture based on self-attention, widely used in NLP tasks.
# Key Components:
# Self-Attention: Computes attention scores among input tokens.
# Positional Encoding: Adds positional information to input tokens.
# Multi-Head Attention: Allows the model to focus on different parts of the input simultaneously.
# Feed-Forward Networks: Apply additional transformation layers.
# Layer Normalization: Normalizes the output of each layer to stabilize training.