# Word2Vec Bi-LSTM RNN

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import regex as re
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import gensim.downloader as api
from torch.utils.data import TensorDataset, DataLoader

#Lemmatizer
import nltk
nltk.download('punkt_tab')      
nltk.download('wordnet')    
nltk.download('omw-1.4') 
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Question Translation
The pretrained Word2Vec model used below covers English words only, so the Korean, Arabic, and Telugu questions must be translated to English before they can be embedded.

In [None]:
# Load the original dataset and keep only the rows whose `lang` is listed in `languages`;
# both splits are converted to pandas DataFrames.
dataset = load_dataset("coastalcph/tydi_xor_rc")
languages = ["ar", "ko", "te", "en"]
train = dataset["train"].filter(lambda example: example['lang'] in languages).to_pandas()
val = dataset["validation"].filter(lambda example: example['lang'] in languages).to_pandas()

# Use the GPU for acceleration when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the NLLB-200 (distilled, 600M) seq2seq translation model and its tokenizer;
# the model is moved to `device`.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")


def Translate(words, lang):
    """Translate text into English with the NLLB model.

    Args:
        words: A string or a list of strings written in ``lang``.
        lang: Dataset language code: ``"ar"``, ``"ko"``, ``"te"`` or ``"en"``.

    Returns:
        A list of English strings, one per input. English input is returned
        unchanged (as a list) without running the model.

    Raises:
        KeyError: If ``lang`` is not one of the supported language codes.
    """
    if lang == "en":
        # Already English: nothing to translate, so skip the model entirely.
        return [words] if isinstance(words, str) else list(words)

    # NLLB expects FLORES-200 codes. Modern Standard Arabic is "arb_Arab";
    # "ara_Arab" is not in the NLLB vocabulary and would not select a language.
    langs = {"ar": "arb_Arab", "ko": "kor_Hang", "te": "tel_Telu"}

    tokenizer.src_lang = langs[lang]
    inputs = tokenizer(words, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    # Forcing the first generated token to the English tag selects the target language.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    with torch.no_grad():
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=512)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


def TranslateDF(df):
    """Return a copy of ``df`` whose "question" column is translated to English.

    Each question is translated on its own from the language given in the
    row's "lang" column. Rows already marked "en" are kept as they are. If
    translation yields nothing (or an empty string) the original question is
    kept. The input dataframe is not modified.

    Args:
        df: DataFrame with "lang" and "question" columns.

    Returns:
        A new DataFrame with the same rows and a translated "question" column.
    """
    total = len(df)
    translated_questions = []

    # Count rows by position rather than by index label, so the progress
    # message stays correct for filtered or non-integer indexes.
    rows = zip(df["lang"], df["question"])
    for count, (lang, question) in enumerate(tqdm(rows, total=total), start=1):
        if lang == "en":
            translated_questions.append(question)
        else:
            translated = Translate([question], lang)
            translated_questions.append(translated[0] if translated and translated[0] else question)

        if count % 100 == 0:
            print(f"Translated {count}/{total} questions...")

    df_translated = df.copy()
    df_translated["question"] = translated_questions
    return df_translated

# Translate both splits and save them as parquet files.
# Disabled by the `if False:` guard, so none of this runs; change the guard to
# True to regenerate validationEN.parquet and trainingEN.parquet.
if False:
    val = TranslateDF(val)
    val.to_parquet("validationEN.parquet", index=False)
    train = TranslateDF(train)
    train.to_parquet("trainingEN.parquet", index=False)

## Word2Vec

In [None]:
# Load the translated train/validation splits from their parquet files.
# This rebinds `train` and `val`.
train = pd.read_parquet("trainingEN.parquet")
val = pd.read_parquet("validationEN.parquet")

# Pretrained word2vec vectors, fetched through gensim's downloader API.
w2v = api.load("word2vec-google-news-300")

In [None]:
# Module-level WordNet lemmatizer, used by sentence2vec.
lemmatizer = WordNetLemmatizer()

def Tokenize(sentence):
    """Split a sentence into lowercase word tokens.

    The sentence is lowercased, every run of word characters becomes one
    token, and the tokens are returned as a NumPy array.
    """
    lowered = sentence.lower()
    tokens = re.findall(r"\w+", lowered)
    return np.array(tokens)

def word2vec(word):
    """Look up the embedding of ``word`` in the loaded ``w2v`` vectors.

    Returns the stored vector when the word is in the vocabulary, otherwise
    ``None``.
    """
    if word not in w2v.key_to_index:
        return None
    return w2v[word]

def sentence2vec(sentence):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is tokenized, each token is lemmatized and looked up with
    ``word2vec``; tokens without a vector are ignored.

    Args:
        sentence: Text to embed.

    Returns:
        A 1-D array of length ``w2v.vector_size``. When no token has a vector
        the result is all zeros, created with the same dtype as the stored
        vectors so both branches return a consistent dtype.
    """
    vectors = []
    for token in Tokenize(sentence):
        vec = word2vec(lemmatizer.lemmatize(token))
        if vec is not None:
            vectors.append(vec)

    if not vectors:
        # Match the dtype of the real embeddings instead of NumPy's float64 default.
        return np.zeros(w2v.vector_size, dtype=w2v.vectors.dtype)

    return np.mean(vectors, axis=0)

# Quick manual check: embed one sample sentence.
s = sentence2vec("This is a test sentence.")

In [None]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM followed by dropout and a linear classification head."""

    def __init__(self, embedding_dim=300, hidden_dim=128, num_layers=1, num_classes=2):
        """Build the layers.

        Args:
            embedding_dim: Size of each input vector in the sequence.
            hidden_dim: Hidden size of the LSTM, per direction.
            num_layers: Number of stacked LSTM layers.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # The head sees both directions concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(2 * hidden_dim, num_classes)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        ``x`` is batch-first: (batch, sequence, embedding_dim).
        """
        _, (hidden, _) = self.lstm(x)
        # The last two entries of the final hidden state belong to the top
        # layer: forward direction first, then backward.
        forward_last = hidden[-2]
        backward_last = hidden[-1]
        merged = torch.cat([forward_last, backward_last], dim=1)
        return self.fc(self.dropout(merged))

In [None]:

def _build_dataset(df):
    """Turn a dataframe into a TensorDataset of (embedding pair, label) examples."""
    context_vecs = np.stack([sentence2vec(c) for c in df["context"]])
    question_vecs = np.stack([sentence2vec(q) for q in df["question"]])

    # (N, 2, dim): a two-step sequence per example, context first, then question.
    # Cast to float64 once here because the model runs in double precision.
    features = np.stack([context_vecs, question_vecs], axis=1).astype(np.float64)
    labels = torch.tensor([1 if answerable else 0 for answerable in df["answerable"]])
    return TensorDataset(torch.from_numpy(features), labels)


def TrainAndValidate(train, val, epochs=50, batch_size=64, lr=1e-3, device="cpu"):
    """Train a BiLSTMClassifier on ``train`` and print its accuracy on ``val``.

    Every example is a length-2 sequence made of the sentence embedding of the
    context followed by the embedding of the question; the label is 1 when the
    row is answerable and 0 otherwise.

    Args:
        train: DataFrame with "question", "context" and "answerable" columns.
        val: DataFrame with the same columns, used only for evaluation.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both training and evaluation.
        lr: Adam learning rate.
        device: Device to train and evaluate on.

    Returns:
        None. The mean loss per epoch and the final accuracy are printed.

    Raises:
        ValueError: If ``train`` has no rows.
    """
    if len(train) == 0:
        raise ValueError("TrainAndValidate needs at least one training example")

    # Embed both splits up front so bad data fails before training starts.
    train_ds = _build_dataset(train)
    test_ds = _build_dataset(val) if len(val) > 0 else None

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    # Model creation
    device = torch.device(device)
    model = BiLSTMClassifier().to(device)
    model = model.double()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for xb, yb in tqdm(train_dl):
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dl):.4f}")

    # Evaluation
    if test_ds is None:
        # Nothing to score; avoid dividing by zero below.
        print("Test Accuracy: n/a (no validation examples)")
        return

    test_dl = DataLoader(test_ds, batch_size=batch_size)
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in test_dl:
            # Evaluation batches must be on the same device as the model.
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb).argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    print(f"Test Accuracy: {correct / total:.2%}")


# Split the training data by the `lang` column.
train_ar = train[train['lang'] == 'ar']
train_ko = train[train['lang'] == 'ko']
train_te = train[train['lang'] == 'te']

# Same split for the validation data.
val_ar = val[val['lang'] == 'ar']
val_ko = val[val['lang'] == 'ko']
val_te = val[val['lang'] == 'te']

# Train and evaluate a separate model for each language.
TrainAndValidate(train_ar, val_ar)
TrainAndValidate(train_ko, val_ko)
TrainAndValidate(train_te, val_te)
