## ELMo Implementation

# 1. Marathi

In [None]:
!pip install indic-nlp-library wandb tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np
from indicnlp.tokenize import indic_tokenize
from torch.nn.utils.rnn import pad_sequence

In [None]:
import fasttext
import fasttext.util
# Smoke test: load the pre-trained Marathi fastText model and look up one word.
ft_model = fasttext.load_model('/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin')
word = "नृत्य"
# The message announces a shape, so report the vector's shape rather than
# dumping every component of the vector.
print("Embedding Shape is {}".format(ft_model.get_word_vector(word).shape))

In [None]:
import os
import re
from indicnlp.tokenize import indic_tokenize
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import fasttext
from tqdm import tqdm

# Directory scanned for the '.txt' corpus files that make up the training data.
folder_path = '/kaggle/input/medium-marathi-dataset'
# Binary fastText model that supplies the input word vectors.
model_path = '/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin' 


# Loaded once at module level; the dataset below queries it per token.
ft_model = fasttext.load_model(model_path)

# Vocabulary seeded with the four reserved entries; corpus tokens are numbered from 4.
token_to_index = {'<PAD>': 0, '<UNK>': 1, '<SOS>':2, '<EOS>':3}
next_token_index = 4


def update_indices(token_list, token_to_index):
    """Register every unseen token of ``token_list`` in ``token_to_index``.

    New tokens receive consecutive ids taken from the module-level counter
    ``next_token_index``, which is advanced once per newly added token.
    Tokens already present in the mapping are left untouched.
    """
    global next_token_index
    for candidate in token_list:
        if candidate in token_to_index:
            continue
        token_to_index[candidate] = next_token_index
        next_token_index += 1

# Sentences kept for training, each wrapped as "<SOS> ... <EOS>".
texts = []
# Sentences that tokenize to more than this many tokens are dropped.
threshold = 256

# Sentence boundaries: danda, newline or full stop (consecutive ones collapse).
sentence_splitter = re.compile(r'[।\n\.]+')

# os.listdir() returns entries in arbitrary order; sorting makes the token ids
# assigned below reproducible from run to run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences = [
        "<SOS> " + sentence.strip() + " <EOS>"
        for sentence in sentence_splitter.split(text)
        if sentence.strip()
    ]

    for sentence in sentences:
        tokens = indic_tokenize.trivial_tokenize(sentence, lang='mr')
        if len(tokens) > threshold:
            continue
        # Only sentences that are actually kept contribute vocabulary, so the
        # mapping holds no entries that can never occur in a training target.
        update_indices(tokens, token_to_index)
        texts.append(sentence)

print(f"Number of sentences processed: {len(texts)}")

class MarathiDataset(Dataset):
    """Next-token prediction examples built from marker-wrapped sentences.

    Each item is a pair ``(input_embeddings, target_indices)``: the word
    vectors of every token except the last, and the vocabulary ids of every
    token except the first.

    Args:
        texts: sequence of sentence strings, optionally wrapped as
            ``"<SOS> ... <EOS>"``.
        ft_model: object exposing ``get_word_vector(token)``.
        token_to_index: mapping from token to id; must contain ``'<UNK>'``.
        tokenizer: optional callable ``str -> list[str]``. When omitted,
            ``indic_tokenize.trivial_tokenize(..., lang='mr')`` is used.
    """

    _SOS = '<SOS>'
    _EOS = '<EOS>'

    def __init__(self, texts, ft_model, token_to_index, tokenizer=None):
        self.texts = texts
        self.ft_model = ft_model
        self.token_to_index = token_to_index
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def _tokenize(self, sentence):
        """Tokenize ``sentence`` while keeping the boundary markers whole.

        A punctuation-splitting tokenizer would break ``<SOS>`` / ``<EOS>``
        into ``<``, ``SOS``, ``>`` and the reserved ids would never be used.
        The markers are therefore stripped first and re-attached as single
        tokens after the body has been tokenized.
        """
        body = sentence.strip()
        has_sos = body.startswith(self._SOS)
        if has_sos:
            body = body[len(self._SOS):]
        has_eos = body.endswith(self._EOS)
        if has_eos:
            body = body[:-len(self._EOS)]

        if self.tokenizer is None:
            pieces = indic_tokenize.trivial_tokenize(body, lang='mr')
        else:
            pieces = self.tokenizer(body)
        tokens = [piece for piece in pieces if piece]

        if has_sos:
            tokens.insert(0, self._SOS)
        if has_eos:
            tokens.append(self._EOS)
        return tokens

    def __getitem__(self, idx):
        tokens = self._tokenize(self.texts[idx])
        unk_index = self.token_to_index['<UNK>']

        # Inputs are tokens[:-1]; stacking per-token tensors avoids the slow
        # "tensor from a list of ndarrays" conversion. Needs at least two
        # tokens (torch.stack rejects an empty list).
        vectors = [
            torch.as_tensor(self.ft_model.get_word_vector(token), dtype=torch.float)
            for token in tokens[:-1]
        ]
        input_embeddings = torch.stack(vectors)

        # Targets are the inputs shifted left by one position.
        target_indices = torch.tensor(
            [self.token_to_index.get(token, unk_index) for token in tokens[1:]],
            dtype=torch.long,
        )
        return input_embeddings, target_indices

def collate_fn(batch):
    """Pad a list of ``(inputs, targets)`` pairs into two batch-first tensors.

    Input sequences are padded with ``0.0``; target sequences are padded with
    the id stored under ``'<PAD>'`` in the module-level ``token_to_index``.
    """
    features, labels = zip(*batch)
    padded_features = pad_sequence(features, batch_first=True, padding_value=0.0)
    pad_id = token_to_index['<PAD>']
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=pad_id)
    return padded_features, padded_labels

dataset = MarathiDataset(texts, ft_model, token_to_index)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Sanity check: pull a single batch (if there is one) and report its shapes.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    input_data, targets = first_batch
    print(f"Input batch shape: {input_data.shape}")
    print(f"Target batch shape: {targets.shape}")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ELMoLanguageModel(nn.Module):
    """Two-layer forward and backward LSTM stacks with a learned layer mix.

    Args:
        vocab_size: number of classes produced by each prediction head.
        embedding_dim: size of the last axis of the input embeddings.
        hidden_dim: hidden size of every LSTM. The concatenated
            forward/backward states (``2 * hidden_dim``) are summed with the
            raw input, so ``embedding_dim`` must equal ``2 * hidden_dim``.

    Raises:
        ValueError: if ``embedding_dim != 2 * hidden_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        # Fail fast: otherwise the mismatch only shows up as a shape error
        # inside forward(), or broadcasts silently when embedding_dim == 1.
        if embedding_dim != 2 * hidden_dim:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal 2 * hidden_dim "
                f"({2 * hidden_dim}) so the input can be mixed with the "
                "concatenated LSTM states"
            )

        self.forward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)

        self.forward_pred = nn.Linear(hidden_dim, vocab_size)
        self.backward_pred = nn.Linear(hidden_dim, vocab_size)

        # One mixing weight each for: raw input, layer-1 states, layer-2 states.
        self.gamma = nn.Parameter(torch.ones(3))

    def forward(self, x):
        """Run both directions over ``x`` of shape (batch, seq, embedding_dim).

        Returns:
            forward_predictions: (batch, vocab_size) logits computed from the
                forward stack's output at the last time step.
            backward_predictions: (batch, vocab_size) logits computed from the
                backward stack's output at position 0.
            combined_embeddings: (batch, seq, embedding_dim) gamma-weighted
                sum of the input and the two layers' concatenated states.
        """
        forward_out1, _ = self.forward_lstm1(x)
        forward_out2, _ = self.forward_lstm2(forward_out1)

        # The backward direction reads the sequence reversed along time...
        reversed_embeddings = torch.flip(x, [1])
        backward_out1, _ = self.backward_lstm1(reversed_embeddings)
        backward_out2, _ = self.backward_lstm2(backward_out1)

        # ...and its outputs are flipped back so position t lines up with x[:, t].
        backward_out1 = torch.flip(backward_out1, [1])
        backward_out2 = torch.flip(backward_out2, [1])

        # NOTE(review): index -1 is the last time step of the batch tensor; in
        # a padded batch that is a padding step for shorter sequences.
        forward_predictions = self.forward_pred(forward_out2[:, -1, :])
        backward_predictions = self.backward_pred(backward_out2[:, 0, :])

        layer1 = torch.cat((forward_out1, backward_out1), dim=-1)
        layer2 = torch.cat((forward_out2, backward_out2), dim=-1)
        combined_embeddings = self.gamma[0] * x + self.gamma[1] * layer1 + self.gamma[2] * layer2

        return forward_predictions, backward_predictions, combined_embeddings


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
import wandb
hidden_dim = 150
num_layers = 2  # kept for reference; not passed to the model below
# The "+ 1" must match the value used when the checkpoint is loaded again.
vocab_size = len(token_to_index) + 1

model = ELMoLanguageModel(vocab_size, 300, hidden_dim).to(device)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=token_to_index['<PAD>'])
optimizer = torch.optim.Adam(model.parameters())

# wandb.init(project='Marathi_model_Elmo')


num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    # Running sum of the per-batch losses, kept as a plain float so it is
    # never confused with the tensor that backward() is called on.
    total_loss = 0.0
    for input_data, targets in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        input_data, targets = input_data.to(device), targets.to(device)
        optimizer.zero_grad()
        forward_pred, backward_pred, _ = model(input_data)

        # NOTE(review): the forward head is scored against the target at
        # index 1 and the backward head against the target at index -1;
        # confirm this alignment is intended.
        loss_f = criterion(forward_pred, targets[:, 1])
        loss_b = criterion(backward_pred, targets[:, -1])

        loss = loss_f + loss_b

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Guard against an empty dataloader instead of dividing by zero.
    avg_loss = total_loss / max(len(dataloader), 1)
#     wandb.log({"train_loss": avg_loss})

    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.15f}")

In [None]:
# Persist only the weights (state_dict); note that model_path is rebound here
# from the fastText model path to the checkpoint path.
model_path = './bilm_marathi_model.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)

In [None]:
import json

# Store the vocabulary next to the checkpoint; ensure_ascii=False keeps the
# Devanagari tokens readable in the JSON file.
mappings_path = './marathi_mappings.json'
mappings_payload = {'token_to_index': token_to_index}
with open(mappings_path, 'w', encoding='utf-8') as f:
    json.dump(mappings_payload, f, ensure_ascii=False, indent=4)


## Analogy and Similarity (Marathi)

In [None]:
!pip install indic-nlp-library
!pip install pandas pyarrow
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import re
from indicnlp.tokenize import indic_tokenize

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class ELMoLanguageModel(nn.Module):
    """Two-layer forward and backward LSTM stacks with a learned layer mix.

    Every parameter except the mixing weights ``gamma`` is frozen at
    construction time.

    Args:
        vocab_size: number of classes produced by each prediction head.
        embedding_dim: size of the last axis of the input embeddings.
        hidden_dim: hidden size of every LSTM. The concatenated
            forward/backward states (``2 * hidden_dim``) are summed with the
            raw input, so ``embedding_dim`` must equal ``2 * hidden_dim``.

    Raises:
        ValueError: if ``embedding_dim != 2 * hidden_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        # Fail fast: otherwise the mismatch only shows up as a shape error
        # inside forward(), or broadcasts silently when embedding_dim == 1.
        if embedding_dim != 2 * hidden_dim:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal 2 * hidden_dim "
                f"({2 * hidden_dim}) so the input can be mixed with the "
                "concatenated LSTM states"
            )

        self.forward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)

        self.forward_pred = nn.Linear(hidden_dim, vocab_size)
        self.backward_pred = nn.Linear(hidden_dim, vocab_size)
        # One mixing weight each for: raw input, layer-1 states, layer-2 states.
        self.gamma = nn.Parameter(torch.ones(3))
        self.freeze_parameters()

    def freeze_parameters(self):
        """Disable gradients for every parameter whose name lacks 'gamma'."""
        for name, param in self.named_parameters():
            if 'gamma' not in name:
                param.requires_grad = False

    def forward(self, x):
        """Run both directions over ``x`` of shape (batch, seq, embedding_dim).

        Returns:
            forward_predictions: (batch, vocab_size) logits computed from the
                forward stack's output at the last time step.
            backward_predictions: (batch, vocab_size) logits computed from the
                backward stack's output at position 0.
            combined_embeddings: (batch, seq, embedding_dim) gamma-weighted
                sum of the input and the two layers' concatenated states.
        """
        forward_out1, _ = self.forward_lstm1(x)
        forward_out2, _ = self.forward_lstm2(forward_out1)

        # Backward LM: read the sequence reversed along the time axis.
        reversed_embeddings = torch.flip(x, [1])
        backward_out1, _ = self.backward_lstm1(reversed_embeddings)
        backward_out2, _ = self.backward_lstm2(backward_out1)

        # Flip the outputs back so position t lines up with x[:, t].
        backward_out1 = torch.flip(backward_out1, [1])
        backward_out2 = torch.flip(backward_out2, [1])

        forward_predictions = self.forward_pred(forward_out2[:, -1, :])
        backward_predictions = self.backward_pred(backward_out2[:, 0, :])

        layer1 = torch.cat((forward_out1, backward_out1), dim=-1)
        layer2 = torch.cat((forward_out2, backward_out2), dim=-1)
        combined_embeddings = self.gamma[0] * x + self.gamma[1] * layer1 + self.gamma[2] * layer2

        return forward_predictions, backward_predictions, combined_embeddings


In [None]:
import json
import torch
import pandas as pd
import fasttext
import fasttext.util

def load_model_and_mappings(model_path, mappings_path):
    """Rebuild the ELMo model from a saved state dict and its vocabulary file.

    Args:
        model_path: path of the ``state_dict`` file written by ``torch.save``.
        mappings_path: path of the JSON file holding ``'token_to_index'``.

    Returns:
        ``(model, token_to_index)`` with the model on the module-level
        ``device`` and switched to evaluation mode.
    """
    with open(mappings_path, 'r', encoding='utf-8') as mappings_file:
        vocabulary = json.load(mappings_file)['token_to_index']

    # The "+ 1" mirrors the vocabulary size used elsewhere in this file so the
    # prediction heads match the stored weights.
    n_classes = len(vocabulary) + 1
    network = ELMoLanguageModel(n_classes, 300, 150).to(device)

    # Tensors are deserialized on the CPU and then copied into the model.
    saved_state = torch.load(model_path, map_location=torch.device('cpu'))
    network.load_state_dict(saved_state)
    network.eval()

    return network, vocabulary


# Locations of the saved weights and of the vocabulary saved with them.
model_path = '/kaggle/input/elmo-model-small-dataset/bilm_marathi_model.pth'
mappings_path = '/kaggle/input/elmo-model-small-dataset/marathi_mappings.json'

# `model` is returned in eval mode; `token_to_index` is the stored vocabulary.
model, token_to_index = load_model_and_mappings(model_path, mappings_path)

def cosine_similarity(embedding1, embedding2):
    """Return the mean cosine similarity of two embeddings as a 0-d tensor.

    The similarity is taken along the last (feature) axis and averaged over
    any leading axes. For 1-D vectors this is the plain cosine similarity.
    Reducing along a fixed ``dim=1`` was only correct for 1-D inputs: with
    extra leading singleton axes it reduced over a size-1 axis and yielded
    per-component signs instead of a cosine.
    """
    return torch.nn.functional.cosine_similarity(
        embedding1.unsqueeze(0), embedding2.unsqueeze(0), dim=-1
    ).mean()

def get_word_embedding(word, ft_model, device, elmo_model):
    """Return the contextualized embedding of a single word.

    Args:
        word: string passed to ``ft_model.get_word_vector``.
        ft_model: object exposing ``get_word_vector``.
        device: device the input tensor is moved to before the forward pass.
        elmo_model: callable returning a 3-tuple whose last element is the
            combined embedding.

    Returns:
        Tensor of shape ``(1, dim)``: the model output for a batch of one
        sequence of length one, with the batch axis squeezed away.
    """
    vector = ft_model.get_word_vector(word)
    # Shape (1, 1, dim): one sequence containing one token.
    embeddings_tensor = torch.as_tensor(vector, dtype=torch.float).reshape(1, 1, -1).to(device)
    # Pure lookup: no autograd graph is needed for similarity computations.
    with torch.no_grad():
        _, _, elmo_embeddings = elmo_model(embeddings_tensor)
    return elmo_embeddings.squeeze(0)

def find_analogy(word_a, word_b, word_c, ft_model, device,elmo_model):
    """Solve "a is to b as c is to ?" over the fastText vocabulary.

    The query vector is ``(b - a) + c``; the vocabulary word whose embedding
    has the highest cosine similarity to it is returned. The three query
    words are skipped as candidates, because otherwise one of them is usually
    the nearest neighbour of the query vector.

    Returns:
        The best-matching word, or ``None`` if no candidate was scored.
    """
    # Flatten to 1-D so cosine_similarity compares along the feature axis.
    embedding_a = get_word_embedding(word_a, ft_model, device, elmo_model).flatten()
    embedding_b = get_word_embedding(word_b, ft_model, device, elmo_model).flatten()
    embedding_c = get_word_embedding(word_c, ft_model, device, elmo_model).flatten()

    analogy_vector = (embedding_b - embedding_a) + embedding_c
    query_words = {word_a, word_b, word_c}

    max_similarity = -float('Inf')
    word_d = None

    for word in ft_model.get_words():
        if word in query_words:
            continue
        embedding_d = get_word_embedding(word, ft_model, device, elmo_model).flatten()
        sim = cosine_similarity(analogy_vector, embedding_d).item()
        if sim > max_similarity:
            max_similarity = sim
            word_d = word

    return word_d


def evaluate_pairs(pairs, ft_model, device, elmo_model):
    """Print the cosine similarity of each ``(word1, word2)`` pair.

    Args:
        pairs: iterable of 2-tuples of strings.
        ft_model, device, elmo_model: forwarded to ``get_word_embedding``.
    """
    for word1, word2 in pairs:
        # Flatten to 1-D so cosine_similarity compares along the feature axis
        # instead of a leading singleton axis.
        embedding1 = get_word_embedding(word1, ft_model, device, elmo_model).flatten()
        embedding2 = get_word_embedding(word2, ft_model, device, elmo_model).flatten()
        sim = cosine_similarity(embedding1, embedding2)
        print(f"Cosine similarity between {word1} and {word2} is {sim.item()}")


# Pre-trained Marathi fastText model; supplies the input vectors fed to the ELMo model.
ft_model = fasttext.load_model('/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin')

In [None]:
# Pick the device before anything is moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(token_to_index) + 1
# Evaluate the checkpoint loaded above. Building a new ELMoLanguageModel here
# would score randomly initialised weights instead of the trained ones.
elmo_model = model.to(device)
elmo_model.eval()

word_pairs = [('राजा', 'राणी'), ('राजा', 'दलितांना'), ('माणूस', 'खुर्ची'), ('माणूस', 'उडणे')]
analogies =  [('राजा', 'माणूस', 'राणी'), ('पॅरिस', 'फ्रान्स', 'रोम')]
# NOTE(review): these entries are whole sentences, yet evaluate_pairs hands
# each one to the word-vector lookup as a single string; confirm that this is
# the intended way to compare sentences.
word_pairs_1 = [('सर्वसाधारण माणूस रस्त्यावर चालत आहे', 'एक माणूस रस्त्यावर चालत आहे'), ('सर्वसाधारण माणूस रस्त्यावर चालत आहे', 'मी खात आहे')]

evaluate_pairs(word_pairs, ft_model, device,elmo_model)
evaluate_pairs(word_pairs_1, ft_model, device,elmo_model)

for a, b, c in analogies:
    predicted_d = find_analogy(a, b, c, ft_model, device,elmo_model)
    print(f"{a} is to {b} as {c} is to {predicted_d}")


### ==============================================================