<a href="https://colab.research.google.com/github/coursewarefactory/NLP_training/blob/main/Simple_Training_NLP_to_Tranlsate_one_word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

#1. Import necessary libraries:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

#2. Load the data:
# Each entry is a (source word, target word) translation pair.
word_pairs = [("love", "кохання")]

#3. Data preprocessing:
# Normalise both words of every pair to lowercase.
word_pairs = [(source.lower(), target.lower()) for source, target in word_pairs]

# Create the vocabulary and assign each word a unique index.
# Indices follow first appearance (all source words, then all target words).
# dict.fromkeys de-duplicates while keeping that order, so the mapping is the
# same on every run; enumerating a set of strings is not, because string
# hashing is randomised per interpreter process.
ordered_words = [pair[0] for pair in word_pairs] + [pair[1] for pair in word_pairs]
vocab = {word: index for index, word in enumerate(dict.fromkeys(ordered_words))}

#4. Create the Encoder and Decoder structure:

class Encoder(nn.Module):
    """Single-step GRU encoder.

    Embeds a token index and runs the embedding through a GRU as one time
    step with batch size one.
    """

    def __init__(self, input_size, embedding_size, hidden_size):
        """Create the embedding table and the GRU.

        Args:
            input_size: number of rows in the embedding table.
            embedding_size: width of each embedding vector (the GRU input size).
            hidden_size: width of the GRU hidden state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size)

    def forward(self, input):
        """Encode one token.

        Args:
            input: tensor of token indices; expected to hold exactly one index,
                since everything is flattened into a single time step.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        token_vector = self.embedding(input)
        # The GRU is built with its default layout, so it expects
        # (seq_len, batch, features); present the embedding as 1 x 1 x features.
        gru_input = token_vector.view(1, 1, -1)
        return self.gru(gru_input)

class Decoder(nn.Module):
    """Single-step GRU decoder that scores every vocabulary entry.

    Embeds one token index, advances the GRU by one step from the supplied
    hidden state, and projects the GRU output onto ``output_size`` classes.
    """

    def __init__(self, embedding_size, hidden_size, output_size):
        """Create the embedding table, GRU, projection and normalisation.

        Args:
            embedding_size: width of each embedding vector (the GRU input size).
            hidden_size: width of the GRU hidden state.
            output_size: number of output classes; also the number of rows in
                the embedding table.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Log-probabilities rather than probabilities: nn.CrossEntropyLoss
        # applies log-softmax to its input itself, so feeding it plain softmax
        # probabilities (all in [0, 1]) squashes the loss and its gradients.
        # Log-softmax is idempotent, so this output gives the correct loss under
        # both nn.CrossEntropyLoss and nn.NLLLoss, and argmax is unchanged.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: tensor of token indices; expected to hold exactly one index,
                since everything is flattened into a single time step.
            hidden: GRU hidden state to start from.

        Returns:
            ``(output, hidden)`` where ``output`` holds the log-probability of
            each class (one row, ``output_size`` columns) and ``hidden`` is the
            updated GRU hidden state.
        """
        # The GRU uses its default (seq_len, batch, features) layout.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        # output[0] drops the length-one sequence axis before the projection.
        output = self.softmax(self.out(output[0]))
        return output, hidden

#5. Embedding layer:
# Model dimensions: one embedding row per vocabulary entry; embedding vectors
# and GRU hidden states are both 100 wide.
vocab_size = len(vocab)
embedding_size = 100
hidden_size = 100

# Build the encoder first, then the decoder. Both share the same vocabulary
# size, embedding width and hidden width.
encoder = Encoder(
    input_size=vocab_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
)
decoder = Decoder(
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=vocab_size,
)

#6. Training the model:
# Define the loss function and the optimizer.
# One SGD optimizer updates the encoder and decoder parameters together.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(list(encoder.parameters()) + list(decoder.parameters()), lr=0.1)

# Index fed to the decoder as its first input. Inference (section 7) starts
# decoding from index 0, so training uses the same index; feeding the target
# word here instead would hand the decoder the answer and train it on an input
# it never receives at test time.
start_index = 0

# Train the model for 100 epochs
for epoch in range(100):
    for input_word, target_word in word_pairs:
        # Encode the input word; the encoder's hidden state seeds the decoder.
        encoder_input = torch.tensor([vocab[input_word]])
        encoder_output, encoder_hidden = encoder(encoder_input)

        # Run one decoding step from the start index.
        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([[start_index]])
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # Score the prediction against the target word's index, then take one
        # gradient step. Gradients are cleared first so they do not accumulate
        # across iterations.
        target_index = torch.tensor([vocab[target_word]])
        loss = criterion(decoder_output, target_index)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

#7. Testing the model:
# Translate the training word "love". Only words present in `vocab` can be
# encoded; any other word (e.g. "passion") would raise KeyError on lookup.
# Gradients are not needed for inference, so skip building the autograd graph.
with torch.no_grad():
    encoder_input = torch.tensor([vocab["love"]])
    encoder_output, encoder_hidden = encoder(encoder_input)
    decoder_hidden = encoder_hidden
    decoder_input = torch.tensor([[0]])  # Index 0 serves as the start-of-sequence input
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

# Get output word from decoder: map the highest-scoring index back to its word.
index_to_word = {index: word for word, index in vocab.items()}
output_word = index_to_word[decoder_output.argmax().item()]
print(output_word)  # Expected output: кохання