<a href="https://colab.research.google.com/github/cj12o/Pytorch/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn

In [27]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a GRU.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        emb_dim: width of each token embedding.
        hidden_dim: number of features in the GRU hidden state.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=emb_dim,
        )
        self.gru = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, src):
        """Return the GRU's final hidden state for ``src``.

        ``src`` is presumably a [batch, seq_len] tensor of token ids
        (the GRU is built with ``batch_first=True``). Only the final hidden
        state is returned; the per-step GRU outputs are discarded.
        """
        embedded = self.embedding(src)
        _, final_hidden = self.gru(embedded)
        return final_hidden
# Note: hidden_dim is a hyperparameter of the GRU:
#   1) it is the number of features in the hidden state of each layer;
#   2) it determines the GRU's capacity to learn and represent sequential data.
# A larger hidden_dim means the hidden state can store more information, potentially allowing the network to
# capture more intricate relationships in the sequence. However, a larger hidden_dim also increases the number of
# parameters in the model, which can lead to increased computational cost
# and potentially overfitting if the dataset is not large enough.

In [32]:
class Decoder(nn.Module):
    """Single-step GRU decoder: one token in, next-token logits out.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: width of each token embedding.
        hidden_dim: number of features in the GRU hidden state.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers=1):
        super().__init__()
        # Only set the attribute on this module; the previous chained
        # assignment also wrote a stray `embedding` attribute onto torch.nn.
        self.embeddings = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        """Decode one time step.

        Args:
            input: token ids for the current step, shape [batch].
                (The name shadows the builtin but is kept so keyword
                callers keep working.)
            hidden: GRU hidden state to continue from (the encoder's final
                state on the first step, the previous decoder state after).

        Returns:
            Tuple ``(hidden, prediction)``: the updated GRU hidden state and
            the logits over the target vocabulary, shape [batch, output_dim].
        """
        step_ids = input.unsqueeze(1)  # [batch, 1]
        embedded = self.embeddings(step_ids)  # [batch, 1, emb_dim]
        # Pass the incoming state to the GRU; without it every step would
        # restart from a zero state and ignore the encoder context.
        output, hidden = self.gru(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))  # [batch, output_dim]
        return hidden, prediction


In [33]:
class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module mapping ``src`` to an initial decoder hidden state.
        decoder: module called as ``decoder(input, hidden)`` and returning
            ``(hidden, prediction)``; it must expose ``fc_out.out_features``
            (used here as the target vocabulary size).
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, target, teacher_forcing_ratio=0.5):
        """Return per-step predictions, shape [batch, target_len, vocab].

        ``target[:, 0]`` is used as the first decoder input (assumed to be a
        start token). Position 0 of the result is left as zeros because the
        loop starts at ``t = 1``.

        At each step, with probability ``teacher_forcing_ratio`` the next
        input is the ground-truth token ``target[:, t]``; otherwise it is the
        decoder's own argmax prediction. The draw is made once per step for
        the whole batch.
        """
        batch_size = src.size(0)
        target_len = target.size(1)
        target_vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(
            batch_size, target_len, target_vocab_size, device=self.device
        )
        hidden = self.encoder(src)
        input = target[:, 0]  # <Start>
        for t in range(1, target_len):
            hidden, prediction = self.decoder(input, hidden)
            # Store and reuse the value the decoder actually returned; the
            # earlier code referenced an undefined name `output` here.
            outputs[:, t] = prediction
            top1 = prediction.argmax(1)
            input = target[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs

In [37]:
class Seq2seq(nn.Module):
    """Tie an encoder and a step-wise decoder together.

    ``decoder`` is called as ``decoder(input, hidden)`` and must return
    ``(hidden_state, prediction)``; its ``fc_out.out_features`` is read to
    size the output buffer.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, target, teacher_forcing_ratio=0.5):
        """Decode ``target.size(1) - 1`` steps and return all predictions.

        The result has shape [batch, target_len, vocab]. Slot 0 stays zero:
        decoding starts from ``target[:, 0]`` (assumed to be the start token)
        and the first prediction is written at index 1.
        """
        n_batch, n_steps = src.size(0), target.size(1)
        vocab_size = self.decoder.fc_out.out_features
        predictions = torch.zeros(n_batch, n_steps, vocab_size).to(self.device)

        state = self.encoder(src)
        step_input = target[:, 0]
        for step in range(1, n_steps):
            # The decoder hands back its new state first, then the scores.
            state, scores = self.decoder(step_input, state)
            predictions[:, step] = scores  # both sides are [batch, vocab]

            # Teacher forcing: feed the ground truth or the greedy guess.
            greedy_token = scores.argmax(1)
            use_ground_truth = torch.rand(1).item() < teacher_forcing_ratio
            if use_ground_truth:
                step_input = target[:, step]
            else:
                step_input = greedy_token

        return predictions

# %%
# The subsequent code remains the same
# Smoke test: build a small model and push one random batch through it.
INPUT_DIM = 1000   # source vocabulary size
OUTPUT_DIM = 1000  # target vocabulary size
EMB_DIM = 256
HIDDEN_DIM = 512
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

enc = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM).to(DEVICE)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM).to(DEVICE)
model = Seq2seq(enc, dec, DEVICE).to(DEVICE)

# Random token ids: 32 source sequences of length 10 and 32 target
# sequences of length 12.
src = torch.randint(INPUT_DIM, (32, 10)).to(DEVICE)
trg = torch.randint(OUTPUT_DIM, (32, 12)).to(DEVICE)

output = model(src, trg)
print(output.size())

cpu
torch.Size([32, 12, 1000])
