In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Sample sentences
sentences = [
    "the cat sits on the mat",
    "a dog is playing outside",
    "he is reading a book",
    "she loves to eat pasta",
    "children are going to school"
]

# Synthetic noise: drop or shuffle words
def corrupt(sentence):
    words = sentence.split()
    if random.random() > 0.5:
        # drop a word
        if len(words) > 1:
            words.pop(random.randint(0, len(words)-1))
    else:
        # shuffle
        random.shuffle(words)
    return ' '.join(words)

# Vocabulary
all_words = list(set(" ".join(sentences).split()))
word2idx = {w: i+1 for i, w in enumerate(all_words)}
word2idx['<pad>'] = 0
idx2word = {i: w for w, i in word2idx.items()}

vocab_size = len(word2idx)
max_len = max(len(s.split()) for s in sentences)

# Encode sentence
def encode(sentence):
    words = sentence.split()
    ids = [word2idx.get(w, 0) for w in words]
    return ids + [0] * (max_len - len(ids))

# Decode indices
def decode(indices):
    return ' '.join([idx2word[i.item()] for i in indices if i.item() != 0])

# Dataset
inputs, targets = [], []
for s in sentences:
    noisy = corrupt(s)
    inputs.append(encode(noisy))
    targets.append(encode(s))

inputs = torch.tensor(inputs)
targets = torch.tensor(targets)

# Model
class DenoisingAE(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, target):
        embed = self.embedding(x)
        _, (h, c) = self.encoder(embed)
        target_embed = self.embedding(target)
        output, _ = self.decoder(target_embed, (h, c))
        return self.out(output)

model = DenoisingAE(vocab_size).train()
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Training loop
for epoch in range(100):
    optimizer.zero_grad()
    output = model(inputs, targets)
    loss = criterion(output.view(-1, vocab_size), targets.view(-1))
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(inputs, targets).argmax(dim=-1)
for i in range(len(sentences)):
    input_sent = decode(inputs[i])
    output_sent = decode(outputs[i])
    target_sent = decode(targets[i])

    print(f"\nNoisy:        {input_sent}")
    print(f"Reconstructed:{output_sent}")
    print(f"Target:       {target_sent}")


Epoch 20 Loss: 0.0012
Epoch 40 Loss: 0.0001
Epoch 60 Loss: 0.0001
Epoch 80 Loss: 0.0001
Epoch 100 Loss: 0.0001

Noisy:        the cat on the mat
Reconstructed:the cat sits on the mat
Target:       the cat sits on the mat

Noisy:        a is playing outside
Reconstructed:a dog is playing outside outside
Target:       a dog is playing outside

Noisy:        he reading a book
Reconstructed:he is reading a book book
Target:       he is reading a book

Noisy:        she to eat pasta
Reconstructed:she loves to eat pasta pasta
Target:       she loves to eat pasta

Noisy:        going to are children school
Reconstructed:children are going to school school
Target:       children are going to school
