# LSTM
### Fetching and Organizing Data

In [8]:
from datasets import load_dataset
from tqdm import tqdm
import regex as re
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import gensim.downloader as api
from torch.utils.data import DataLoader, Dataset

# Make CUDA the default device for tensors/modules created from here on.
# The rest of the notebook (data-loader generator, model placement) assumes a GPU.
torch.set_default_device("cuda")

# Lemmatizer setup: fetch the NLTK resources once per environment
# (the log below shows already-present packages being reported as up to date).
import nltk
nltk.download('punkt_tab')      
nltk.download('wordnet')    
nltk.download('omw-1.4') 
nltk.download('averaged_perceptron_tagger_eng')
# NOTE(review): word_tokenize is imported but the visible cells tokenize with a
# regex instead — confirm whether it (and punkt_tab / the tagger) is still needed.
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer



# Load the `coastalcph/tydi_xor_rc` dataset through `datasets.load_dataset`.
dataset = load_dataset("coastalcph/tydi_xor_rc")

# Only rows whose `lang` field is one of these codes are kept.
languages = ['ar', 'ko', 'te']


def _is_target_language(example):
    """Return True when the example's `lang` value is listed in `languages`."""
    return example['lang'] in languages


# Filter each split, then convert it to a pandas DataFrame for later slicing.
train_df = dataset["train"].filter(_is_target_language).to_pandas()
val_df = dataset["validation"].filter(_is_target_language).to_pandas()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aarus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\aarus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\aarus/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\aarus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


### GloVe Embeddings and Lemmatization

In [None]:

# Load the pretrained GloVe vectors via gensim's downloader API.
print("Loading GloVe embeddings (glove-wiki-gigaword-300)...")
# `w2v` is the name the embedding helpers below read; `glove` is kept as an alias.
w2v = glove = api.load("glove-wiki-gigaword-300")
print(f"Loaded {len(glove)} word vectors with dimensionality {glove.vector_size}")

Loading GloVe embeddings (glove-wiki-gigaword-300)...
Loaded 400000 word vectors with dimensionality 300


In [10]:
# Shared WordNet lemmatizer; sentence2vec applies it to every token before the
# embedding lookup.
lemmatizer = WordNetLemmatizer()

def Tokenize(sentence):
    """Lower-case `sentence` and return its word-character runs as a NumPy array.

    Anything that is not a word character (punctuation, whitespace) acts as a
    separator and is dropped.
    """
    lowered = sentence.lower()
    tokens = re.findall(r"\w+", lowered)
    return np.array(tokens)

def word2vec(word):
    """Look up `word` in the loaded embedding table `w2v`.

    Returns the stored vector when `word` is a known key, otherwise None.
    """
    if word not in w2v.key_to_index:
        return None
    return w2v[word]

def sentence2vec(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is tokenized, each token is lemmatized and looked up with
    `word2vec`; tokens without a vector are ignored. When no token has a
    vector, a zero vector of length `w2v.vector_size` is returned instead.
    """
    known_vectors = []
    for token in Tokenize(sentence):
        vector = word2vec(lemmatizer.lemmatize(token))
        if vector is not None:
            known_vectors.append(vector)

    # Nothing to average: fall back to an all-zero embedding.
    if not known_vectors:
        return np.zeros(w2v.vector_size)

    return np.mean(known_vectors, axis=0)

### DataSet Creation

In [11]:
class SentenceDataset(Dataset):
    """Dataset of sentence embeddings in which each item is paired with itself.

    Every sample is returned as `(embedding, embedding)`, i.e. the input also
    serves as the training target.
    """

    def __init__(self, sentences: np.ndarray, name):
        """Convert `sentences` to a float tensor and print how many were loaded.

        Args:
            sentences: array-like of embeddings, one row per sentence.
            name: label used in the load message.
        """
        self.name = name
        self.sentences = torch.FloatTensor(sentences)
        print(f"Loaded {name} with {len(self.sentences)} sentences")

    def __len__(self):
        """Number of stored sentence embeddings."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the embedding at `idx` as an `(input, target)` pair."""
        embedding = self.sentences[idx]
        return embedding, embedding
    

def CreateDataLoader(sentences, name, batch_size=32, shuffle=True):
    """Embed `sentences` and wrap them in a DataLoader.

    Args:
        sentences: iterable of raw sentence strings.
        name: label shown in the progress bar and in the dataset's load message.
        batch_size: number of samples per batch (default 32, as before).
        shuffle: whether the loader reshuffles the data each epoch (default True,
            as before).

    Returns:
        A DataLoader over a SentenceDataset built from the sentence embeddings.
    """
    embeddings = np.array([
        sentence2vec(s) for s in tqdm(sentences, desc=f"Embedding {name}", leave=False)
    ])
    dataset = SentenceDataset(embeddings, name)
    # Put the shuffling generator on the current default tensor device rather
    # than hard-coding 'cuda': with the notebook's CUDA default this is still a
    # CUDA generator, but the function no longer fails when the default is CPU.
    generator = torch.Generator(device=torch.empty(0).device)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, generator=generator)


def CreateDataLoaders(train_df, val_df, field, lang, name):
    """Build the training and validation loaders for one group of languages.

    Args:
        train_df: DataFrame holding the training split.
        val_df: DataFrame holding the validation split.
        field: column whose text is embedded.
        lang: collection of language codes; rows whose `lang` value is in it are kept.
        name: label prefix; " Training" / " Validation" is appended per split.

    Returns:
        `(train_dl, val_dl)`.
    """
    def _select(df):
        # Keep only the requested languages, then pick the text column.
        return df[df['lang'].isin(lang)][field]

    train_dl = CreateDataLoader(_select(train_df), f"{name} Training")
    val_dl = CreateDataLoader(_select(val_df), f"{name} Validation")
    return train_dl, val_dl

# Per-language loaders built from the `question` column.
ar_train, ar_val = CreateDataLoaders(train_df, val_df, field="question", lang=["ar"], name="Arabic")
ko_train, ko_val = CreateDataLoaders(train_df, val_df, field="question", lang=["ko"], name="Korean")
te_train, te_val = CreateDataLoaders(train_df, val_df, field="question", lang=["te"], name="Telugu")
# The "English" loaders use the `context` column of all three languages' rows
# (presumably the contexts are English text — confirm against the dataset card).
en_train, en_val = CreateDataLoaders(train_df, val_df, field="context", lang=["ar", "ko", "te"], name="English")




                                                                   

Loaded Arabic Training with 2558 sentences


                                                                    

Loaded Arabic Validation with 415 sentences


                                                                   

Loaded Korean Training with 2422 sentences


                                                                    

Loaded Korean Validation with 356 sentences


                                                                   

Loaded Telugu Training with 1355 sentences


                                                                    

Loaded Telugu Validation with 384 sentences


                                                                                 

Loaded English Training with 6335 sentences


                                                                                   

Loaded English Validation with 1155 sentences




### Model Creation

In [12]:
class LSTMEmbeddingModel(nn.Module):
    """LSTM that maps an embedding vector back onto a vector of the same size.

    Each input vector is treated as a sequence of length one, run through a
    stacked LSTM and projected back to `input_dim` with a linear layer. It is
    trained with a cosine-embedding loss against the provided targets.
    """

    def __init__(self, input_dim, hidden_dim=256, num_layers=2, dropout=0.3):
        """Build the LSTM stack and the output projection.

        Args:
            input_dim: size of the input (and output) vectors.
            hidden_dim: size of the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
            dropout: dropout applied between LSTM layers.
        """
        super().__init__()

        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and just emits a warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=lstm_dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, input_dim)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Run one batch through the network.

        Args:
            x: batch of vectors, expected as (batch, input_dim).
            hidden: optional initial LSTM state; None lets the LSTM use zeros.

        Returns:
            `(output, hidden)` where `output` has the sequence axis removed
            again and `hidden` is the LSTM's final state.
        """
        x = x.unsqueeze(1)  # add a sequence axis of length 1 for batch_first LSTM
        lstm_out, hidden = self.lstm(x, hidden)
        output = self.fc(lstm_out)
        output = output.squeeze(1)  # drop the length-1 sequence axis

        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero `(h0, c0)` pair for `batch_size`, on the model's device."""
        device = next(self.parameters()).device
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

    def trainSelf(self, dataloader, epochs=10, learning_rate=0.001, display=False, feedback=False):
        """Train the model in place with Adam and a cosine-embedding loss.

        Args:
            dataloader: iterable of `(data, targets)` batches that supports `len()`.
            epochs: number of passes over `dataloader`.
            learning_rate: Adam learning rate.
            display: currently unused; kept so existing callers keep working.
            feedback: when True, print the average loss after every epoch.

        Returns:
            List with the average batch loss of each epoch (`inf` for an epoch
            in which the dataloader yielded no batches).
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        criterion = nn.CosineEmbeddingLoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        self.train()
        losses = []
        num_batches = len(dataloader)

        for epoch in range(epochs):
            total_loss = 0

            for data, targets in dataloader:
                data, targets = data.to(device), targets.to(device)

                optimizer.zero_grad()
                output, _ = self(data)

                # CosineEmbeddingLoss expects a target of 1 or -1 for similarity/dissimilarity
                target_labels = torch.ones(data.size(0), device=device)  # make embeddings similar
                loss = criterion(output, targets, target_labels)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=5)
                optimizer.step()

                total_loss += loss.item()

            # Same convention as perplexity(): no data means an infinite loss
            # rather than a ZeroDivisionError.
            avg_loss = total_loss / num_batches if num_batches > 0 else float('inf')
            losses.append(avg_loss)
            if feedback:
                print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')

        return losses

    def perplexity(self, dataloader):
        """Return exp(mean per-sample cosine-embedding loss) over `dataloader`.

        The model is switched to eval mode and gradients are disabled. With no
        samples the mean loss is taken as `inf`, so the result is `inf`.
        """
        total_loss = 0
        total_samples = 0
        criterion = nn.CosineEmbeddingLoss(reduction='sum')
        device = next(self.parameters()).device  # Get device from model parameters

        self.eval()
        with torch.no_grad():
            for data, targets in dataloader:
                data, targets = data.to(device), targets.to(device)
                output, _ = self(data)
                target_labels = torch.ones(data.size(0), device=device)
                loss = criterion(output, targets, target_labels)
                total_loss += loss.item()
                total_samples += data.size(0)

        avg_loss = total_loss / total_samples if total_samples > 0 else float('inf')
        perplexity = math.exp(avg_loss)
        return perplexity

### Model Training

In [13]:

def TrainAndEvaluateModel(train_dl, val_dl, title, epochs=20, learning_rate=0.001,
                          input_dim=300, hidden_dim=256, num_layers=2, dropout=0.3):
    """Train a fresh LSTMEmbeddingModel and print its validation perplexity.

    Args:
        train_dl: loader of training batches.
        val_dl: loader of validation batches.
        title: label printed in the result banner.
        epochs: number of training epochs (default 20, as before).
        learning_rate: optimizer learning rate (default 0.001, as before).
        input_dim: embedding size fed to the model (default 300, as before).
        hidden_dim: LSTM hidden size (default 256, as before).
        num_layers: number of LSTM layers (default 2, as before).
        dropout: dropout between LSTM layers (default 0.3, as before).
    """
    model = LSTMEmbeddingModel(input_dim=input_dim, hidden_dim=hidden_dim,
                               num_layers=num_layers, dropout=dropout)

    # Training. trainSelf already moves the model to CUDA when it is available
    # (CPU otherwise), so no unconditional move to "cuda" is done here.
    model.trainSelf(train_dl, epochs=epochs, learning_rate=learning_rate)

    # Validation
    perplexity = model.perplexity(val_dl)

    print(f"============ { title } ============")
    print(f"Perplexity: {perplexity:.4f}")


In [15]:
# Train and evaluate one model per corpus, in the same order as before.
for title, train_dl, val_dl in (
    ("Arabic", ar_train, ar_val),
    ("Korean", ko_train, ko_val),
    ("Telugu", te_train, te_val),
    ("English", en_train, en_val),
):
    TrainAndEvaluateModel(train_dl, val_dl, title=title)

Perplexity: 1.7146
Perplexity: 2.6645
Perplexity: 2.4619
Perplexity: 1.0109
