<a href="https://colab.research.google.com/github/gnoejh/ict1022/blob/main/Transformer/3_sequence_sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Sequence-to-Sequence Models

![type_sequence_Modeling](type_sequence_Modeling.png)



In [36]:
# %pip install torch torchvision # Uncomment if running on Colab
import torch
import torch.nn as nn
import torch.optim as optim

### 1. One-to-One


**Description**: A single input corresponds to a single output. This is typical in simple classification tasks.

**Example**: Image Classification

In this example, we'll simulate an image classification task using PyTorch. Although it's a simplified example, the idea is to classify a single image into one of multiple categories.
    

In [37]:

import torch
import torch.nn as nn
import torch.optim as optim

# Simulated input image (1, 3, 32, 32) -> Batch size of 1, 3 channels (RGB), 32x32 pixels
# torch.randn fills the tensor with random values, so this stands in for a real image.
image = torch.randn(1, 3, 32, 32)

# Simple CNN for classification
class SimpleCNN(nn.Module):
    """Small convolutional classifier for 3-channel images.

    Two 3x3 convolutions (padding 1, so spatial size is preserved) are each
    followed by ReLU and a 2x2 max-pool.  The linear head is sized for
    32 * 8 * 8 flattened features, i.e. it expects 32x32 inputs.

    Args:
        num_classes: Number of output logits produced per image.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # 8 * 8 because the two 2x2 max-pools shrink a 32x32 input to 8x8
        self.fc1 = nn.Linear(32 * 8 * 8, num_classes)

    def forward(self, x):
        """Return one row of ``num_classes`` logits per input image."""
        # Same conv -> ReLU -> pool stage applied twice, in order.
        for conv in (self.conv1, self.conv2):
            x = torch.max_pool2d(torch.relu(conv(x)), 2)
        flat = x.view(x.shape[0], -1)  # keep the batch axis, flatten the rest
        return self.fc1(flat)

# Instantiate and run model
num_classes = 10  # e.g., 10 classes
model = SimpleCNN(num_classes)
# The model is never trained in this cell, so the logits come from the
# randomly initialised weights and only illustrate the output shape.
output = model(image)
print("One-to-One Model Output (Logits):", output)
    

One-to-One Model Output (Logits): tensor([[ 0.3612,  0.0005, -0.0102, -0.0513,  0.2687, -0.0353, -0.3247,  0.0128,
          0.1543,  0.2045]], grad_fn=<AddmmBackward0>)


### 2. One-to-Many


**Description**: A single input produces a sequence of outputs. This is useful in tasks where a single piece of information is expanded into a sequence.

**Example**: Image Captioning

In this example, we simulate a basic image captioning model. An image is used as input to generate a caption (sequence of words) describing the image.
    

In [38]:
import torch
import torch.nn as nn

class ImageCaptioningModel(nn.Module):
    """Toy one-to-many model: a single image in, a sequence of word logits out.

    The image is encoded by one convolution followed by global average
    pooling into a single ``embed_size`` feature vector.  That vector is fed
    to a GRU at every decoding step, and each GRU output is projected onto
    the vocabulary.

    Args:
        vocab_size: Number of words the caption logits range over.
        embed_size: Size of the image feature vector (and GRU input size).
        hidden_size: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.cnn = nn.Conv2d(3, embed_size, kernel_size=3, stride=1, padding=1)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))  # Pooling to reduce spatial dimensions
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, image, max_len=1):
        """Produce caption logits for ``max_len`` decoding steps.

        Args:
            image: Batch of 3-channel images, shape (batch, 3, H, W).
            max_len: Number of caption positions to generate.  The default
                of 1 keeps the original single-step behaviour.

        Returns:
            Logits of shape (batch, vocab_size) when ``max_len == 1``,
            otherwise (batch, max_len, vocab_size).

        Raises:
            ValueError: If ``max_len`` is smaller than 1.
        """
        if max_len < 1:
            raise ValueError("max_len must be at least 1")

        features = torch.relu(self.cnn(image))
        features = self.pool(features).view(image.size(0), 1, -1)  # Pool and flatten
        if max_len > 1:
            # Present the same image feature at every step; the GRU's hidden
            # state is what makes the per-step outputs differ.
            features = features.repeat(1, max_len, 1)

        outputs, _ = self.rnn(features)
        if max_len == 1:
            # Drop the length-1 time axis so existing callers still get (batch, vocab).
            outputs = outputs.squeeze(1)
        return self.fc(outputs)


# Simulated image input and vocabulary
# NOTE: vocab_size, embed_size and hidden_size are reused by the later model cells.
vocab_size = 20
embed_size = 64
hidden_size = 128

# Simulated input image (1, 3, 32, 32) -> Batch size of 1, 3 channels (RGB), 32x32 pixels
image = torch.randn(1, 3, 32, 32)

# `model` is rebound here; the classifier from the previous section is no longer reachable by that name.
model = ImageCaptioningModel(vocab_size, embed_size, hidden_size)
# Untrained weights: the logits below only demonstrate the output shape (1, vocab_size).
caption_output = model(image)
print("One-to-Many Model Output (Caption logits):", caption_output)

One-to-Many Model Output (Caption logits): tensor([[ 0.0638, -0.0109, -0.0022, -0.0155, -0.0134,  0.0366, -0.0775,  0.0764,
         -0.0830,  0.0148,  0.0457, -0.0639,  0.0202, -0.0286,  0.1380,  0.0446,
         -0.0092,  0.0021,  0.0749, -0.1163]], grad_fn=<AddmmBackward0>)


### 3. Many-to-One


**Description**: A sequence of inputs produces a single output. This is useful for summarizing or classifying entire sequences.

**Example**: Sentiment Analysis

In this example, a sequence of words is classified into a single sentiment label, such as positive or negative sentiment.
    

In [39]:

class SentimentAnalysisModel(nn.Module):
    """Many-to-one classifier: a token sequence in, one logit vector out.

    Tokens are embedded, run through a single-layer LSTM, and the LSTM's
    final hidden state is projected to ``output_size`` logits.

    Args:
        vocab_size: Number of distinct token indices.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden state.
        output_size: Number of sentiment classes.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape (batch, output_size) for token indices ``x``."""
        embedded = self.embedding(x)
        # Per-step outputs are ignored; only the final (hidden, cell) state is kept.
        _, final_state = self.rnn(embedded)
        last_hidden = final_state[0]
        # Remove the leading layer axis before the linear head.
        return self.fc(last_hidden.squeeze(0))

# Simulated input sequence
# vocab_size is not defined in this cell; it must already exist from an earlier cell.
sequence = torch.randint(0, vocab_size, (1, 5))  # Batch size of 1, sequence length of 5

output_size = 2  # Binary sentiment classification (positive/negative)
# embed_size and hidden_size likewise come from an earlier cell.
model = SentimentAnalysisModel(vocab_size, embed_size, hidden_size, output_size)
# One logit vector for the whole sequence, regardless of its length.
sentiment_output = model(sequence)
print("Many-to-One Model Output (Sentiment logits):", sentiment_output)
    

Many-to-One Model Output (Sentiment logits): tensor([[-0.0282,  0.0227]], grad_fn=<AddmmBackward0>)


### 4. Many-to-Many (Aligned)


**Description**: Each input in the sequence corresponds to an output in the sequence, maintaining alignment between input and output.

**Example**: Part-of-Speech Tagging

In this example, each word in a sentence is tagged with its part of speech (POS). The input and output sequences have the same length.
    

In [40]:

class POSTaggingModel(nn.Module):
    """Aligned many-to-many tagger: one logit vector per input token.

    Tokens are embedded and passed through a single-layer LSTM; the linear
    head is applied to the LSTM output at every time step, so the output
    sequence has the same length as the input.

    Args:
        vocab_size: Number of distinct token indices.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden state.
        output_size: Number of tag classes.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return tag logits of shape (batch, seq_len, output_size)."""
        # Keep every step's output; the final LSTM state is not needed here.
        per_token, _ = self.rnn(self.embedding(x))
        return self.fc(per_token)

# Simulated input sentence (sequence of word indices)
# vocab_size, embed_size and hidden_size are not defined in this cell; they come from an earlier cell.
sentence = torch.randint(0, vocab_size, (1, 5))  # Batch size of 1, sequence length of 5

output_size = 5  # Number of possible POS tags
model = POSTaggingModel(vocab_size, embed_size, hidden_size, output_size)
# One row of tag logits per input word: the output keeps the input's sequence length.
pos_tags_output = model(sentence)
print("Many-to-Many (Aligned) Model Output (POS Tag logits):", pos_tags_output)
    

Many-to-Many (Aligned) Model Output (POS Tag logits): tensor([[[-0.1526, -0.0360, -0.0051,  0.1109, -0.0153],
         [-0.0389, -0.0623, -0.0084,  0.0766, -0.0446],
         [-0.0445, -0.0105, -0.0255,  0.0409, -0.0622],
         [-0.0868, -0.0670, -0.0670,  0.0085, -0.0557],
         [-0.0679, -0.0116, -0.1109, -0.0032, -0.0228]]],
       grad_fn=<ViewBackward0>)


### 5. Many-to-Many (Non-Aligned)


**Description**: A sequence of inputs produces a sequence of outputs with potentially different lengths. Here, the entire input sequence affects the entire output sequence.

**Example**: Machine Translation

In this example, a sequence in one language (e.g., English) is translated into a sequence in another language (e.g., French) with a potentially different length.
    

In [41]:

class MachineTranslationModel(nn.Module):
    """Encoder-decoder LSTM for non-aligned sequence-to-sequence mapping.

    The source sequence is embedded and summarised by an encoder LSTM.  The
    encoder's final (hidden, cell) state initialises a decoder LSTM that
    reads the embedded target tokens, and every decoder output is projected
    onto the target vocabulary.

    Args:
        src_vocab_size: Number of distinct source token indices.
        tgt_vocab_size: Number of distinct target token indices.
        embed_size: Embedding dimension used for both languages.
        hidden_size: Hidden-state dimension shared by encoder and decoder.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, hidden_size):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.encoder_rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        self.decoder_rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        """Return target-vocabulary logits for every target position.

        Args:
            src: Source token indices, shape (batch, src_len).
            tgt: Target token indices fed to the decoder, shape (batch, tgt_len).

        Returns:
            Logits of shape (batch, tgt_len, tgt_vocab_size).
        """
        src_embedded = self.src_embedding(src)
        # Keep BOTH parts of the encoder's final state.  An LSTM's initial
        # state is an (h_0, c_0) pair; passing the hidden state twice would
        # throw away the encoder's cell state.
        _, (hidden, cell) = self.encoder_rnn(src_embedded)

        tgt_embedded = self.tgt_embedding(tgt)
        decoder_out, _ = self.decoder_rnn(tgt_embedded, (hidden, cell))
        output = self.fc(decoder_out)
        return output

# Simulated input and target sequences
src_vocab_size = 10000
tgt_vocab_size = 10000
# The two sequences deliberately have different lengths (6 vs 8) to show the non-aligned case.
src_sequence = torch.randint(0, src_vocab_size, (1, 6))  # Source sequence
tgt_sequence = torch.randint(0, tgt_vocab_size, (1, 8))  # Target sequence

# embed_size and hidden_size are not defined in this cell; they come from an earlier cell.
model = MachineTranslationModel(src_vocab_size, tgt_vocab_size, embed_size, hidden_size)
# The target tokens are given to the decoder as its inputs; the result holds one logit vector per target position.
translation_output = model(src_sequence, tgt_sequence)
print("Many-to-Many (Non-Aligned) Model Output (Translation logits):", translation_output)
    

Many-to-Many (Non-Aligned) Model Output (Translation logits): tensor([[[ 0.0397, -0.0178, -0.0525,  ..., -0.1130, -0.0455,  0.1471],
         [ 0.1677, -0.0168,  0.0174,  ...,  0.0165, -0.0347,  0.0858],
         [ 0.0698, -0.0674,  0.0535,  ...,  0.0376,  0.0115,  0.0411],
         ...,
         [ 0.0496, -0.0804, -0.0178,  ..., -0.0640, -0.0081,  0.0620],
         [ 0.0917, -0.0692, -0.0396,  ..., -0.0282, -0.1171,  0.0887],
         [ 0.0170, -0.1550, -0.0283,  ..., -0.0355, -0.0411,  0.0718]]],
       grad_fn=<ViewBackward0>)
