# Downtask 2: Wikipedia Section-title Prediction

### 1. Marathi

In [1]:
!pip install indic-nlp-library
!pip install pandas pyarrow

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl.metadata (1.9 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=1.2.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-7.3.7-py3-none-any.whl.metadata (6.0 kB)
Collecting sphinxcontrib-applehelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_applehelp-1.0.8-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-devhelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_devhelp-1.0.6-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-jsmath (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting sphinxcontrib-h

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")

CUDA Available: True


In [3]:
class ELMoLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(ELMoLanguageModel, self).__init__()
        self.forward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)

        self.forward_pred = nn.Linear(hidden_dim, vocab_size)
        self.backward_pred = nn.Linear(hidden_dim, vocab_size)
        self.gamma = nn.Parameter(torch.ones(3))  
        self.freeze_parameters()
        
    def freeze_parameters(self):
        for name, param in self.named_parameters():
            if 'gamma' not in name:
                param.requires_grad = False

    def forward(self, x):
        
        forward_out1, _ = self.forward_lstm1(x)
        forward_out2, _ = self.forward_lstm2(forward_out1)

        # Backward LM
        reversed_embeddings = torch.flip(x, [1])
        backward_out1, _ = self.backward_lstm1(reversed_embeddings)
        backward_out2, _ = self.backward_lstm2(backward_out1)

        backward_out1 = torch.flip(backward_out1, [1])
        backward_out2 = torch.flip(backward_out2, [1])

        forward_predictions = self.forward_pred(forward_out2[:, -1, :])
        backward_predictions = self.backward_pred(backward_out2[:, 0, :])

        combined_embeddings = self.gamma[0] * x + self.gamma[1] * torch.cat((forward_out1, backward_out1), dim=-1) + self.gamma[2] * torch.cat((forward_out2, backward_out2), dim=-1)

        return forward_predictions, backward_predictions, combined_embeddings


In [4]:
import json
import torch

def load_model_and_mappings(model_path, mappings_path):
    with open(mappings_path, 'r', encoding='utf-8') as f:
        mappings = json.load(f)

    token_to_index = mappings['token_to_index']
    vocab_size = len(token_to_index) + 1 
#     model = BiLM(hidden_dim=128, num_layers=2, vocab_size=vocab_size)
    model = ELMoLanguageModel(vocab_size, 300, 150).to(device)
    
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()    
    
    return model, token_to_index


model_path = '/kaggle/input/final-elmo-model/bilm_marathi_model (1).pth'
mappings_path = '/kaggle/input/final-elmo-model/marathi_mappings.json'

model, token_to_index = load_model_and_mappings(model_path, mappings_path)

In [5]:
import os
import re
from indicnlp.tokenize import indic_tokenize

def preprocess_text(text, language='mr'):
    """
    Apply preprocessing steps to the given text.
    """
    text = remove_non_textual_elements(text)
    text = normalize_quotation_marks(text)
    text = ensure_utf8_encoding(text)
    sentences = tokenize_sentences(text)
    sentences_SOS = ["<SOS> "+sentence+" <EOS>" for sentence in sentences]
    tokenized_sentences = [tokenize_words_indicnlp(sentence, language) for sentence in sentences_SOS]
    return ' '.join([' '.join(sentence) for sentence in tokenized_sentences])

def remove_non_textual_elements(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_quotation_marks(text):
    text = text.replace('“', '"').replace('”', '"')
    text = text.replace("‘", "'").replace("’", "'")
    return text

def ensure_utf8_encoding(text):
    return text.encode('utf-8', errors='ignore').decode('utf-8')

def tokenize_sentences(text):
    sentences = re.split(r'[।\n\.]+', text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

def tokenize_words_indicnlp(sentence, language='mr'):
    return indic_tokenize.trivial_tokenize(sentence, lang=language)


In [6]:
import pandas as pd
import fasttext
import fasttext.util

ft_model = fasttext.load_model('/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin')

def load_dataset(parquet_path):
    """Load dataset from a Parquet file."""
    return pd.read_parquet(parquet_path)

train_path = '/kaggle/input/wstp-mr/train-00000-of-00001.parquet'
test_path = '/kaggle/input/wstp-mr/test-00000-of-00001.parquet'
val_path = '/kaggle/input/wstp-mr/validation-00000-of-00001.parquet'

train_df = load_dataset(train_path)
test_df = load_dataset(test_path)
val_df = load_dataset(val_path)

print("Train Dataset:", train_df.head())
print("Test Dataset:", test_df.head())
print("Validation Dataset:", val_df.head())



Train Dataset:                                          sectionText correctTitle  \
0  मालिकेतील आधल्या खेळांप्रमाणे "सिव्हलिजेशन ५" ...       titleB   
1  गावात एटीएम उपलब्ध आहे. \nगावात व्यापारी बँक उ...       titleB   
2  मराठवाड्यातील किंबहुना महाराष्ट्रातील दिवाळी श...       titleA   
3  डॉ. बाबासाहेब आंबेडकरांनी समाजवादाचा पुरस्कार ...       titleB   
4  चीनमध्ये अनेकांचा असा समज आहे की वाघांचे काही ...       titleB   

                       titleA                titleB  \
0              सामाजिक नीत्या                  शहरे   
1             शैक्षणिक सुविधा    बाजार व पतव्यवस्था   
2  महाराष्ट्रातील लोकसंस्कृती              जैन धर्म   
3               चीनबाबत विचार  समाजवादासंबंधी विचार   
4           आहार व शिकारपद्धत             चिनी औषधे   

                      titleC         titleD  \
0                      खेळणे     घटक व लढाई   
1            संपर्क व दळणवळण         आरोग्य   
2       दीपावलीची विविध नावे         भाऊबीज   
3  मूलभूत हक्कांसंबंधी विचार  धार्मिक विचार   
4   

In [7]:
def preprocess_dataset(df, text_column='text'):
    df[text_column] = df[text_column].apply(lambda x: preprocess_text(x))
    return df

train_df_preprocessed = preprocess_dataset(train_df, 'sectionText')
test_df_preprocessed = preprocess_dataset(test_df, 'sectionText')
val_df_preprocessed = preprocess_dataset(val_df, 'sectionText')

print(train_df_preprocessed.head())

                                         sectionText correctTitle  \
0  < SOS > मालिकेतील आधल्या खेळांप्रमाणे " सिव्हल...       titleB   
1  < SOS > गावात एटीएम उपलब्ध आहे < EOS > < SOS >...       titleB   
2  < SOS > मराठवाड्यातील किंबहुना महाराष्ट्रातील ...       titleA   
3  < SOS > डॉ < EOS > < SOS > बाबासाहेब आंबेडकरां...       titleB   
4  < SOS > चीनमध्ये अनेकांचा असा समज आहे की वाघां...       titleB   

                       titleA                titleB  \
0              सामाजिक नीत्या                  शहरे   
1             शैक्षणिक सुविधा    बाजार व पतव्यवस्था   
2  महाराष्ट्रातील लोकसंस्कृती              जैन धर्म   
3               चीनबाबत विचार  समाजवादासंबंधी विचार   
4           आहार व शिकारपद्धत             चिनी औषधे   

                      titleC         titleD  \
0                      खेळणे     घटक व लढाई   
1            संपर्क व दळणवळण         आरोग्य   
2       दीपावलीची विविध नावे         भाऊबीज   
3  मूलभूत हक्कांसंबंधी विचार  धार्मिक विचार   
4            वाघ-मा

### 1.1 Baseline

In [8]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
import wandb

class TitleSelectionDataset(Dataset):
    def __init__(self, df, ft_model, tokenizer, lang='mr'):
        self.df = df
        self.ft_model = ft_model
        self.tokenizer = tokenizer
        self.lang = lang

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        section_text = self.text_to_embedding(row['sectionText'])
        titles = [
            self.text_to_embedding(row['titleA']),
            self.text_to_embedding(row['titleB']),
            self.text_to_embedding(row['titleC']),
            self.text_to_embedding(row['titleD'])
        ]

        title_keys = ['titleA', 'titleB', 'titleC', 'titleD']
        correct_title_key = row['correctTitle']  
        label = title_keys.index(correct_title_key) 

        return section_text, titles, label

    def text_to_embedding(self, text):
        tokens = self.tokenizer(text, lang=self.lang)
        embeddings = [self.ft_model.get_word_vector(token) for token in tokens]
        embeddings_array = np.array(embeddings) 
        return torch.tensor(embeddings_array, dtype=torch.float) 

def collate_fn(batch):
    section_texts, titles, labels = zip(*batch)
    section_texts_padded = pad_sequence(section_texts, batch_first=True, padding_value=0.0)
    titles_padded = [pad_sequence([t[i] for t in titles], batch_first=True, padding_value=0.0) for i in range(4)]
    labels = torch.tensor(labels, dtype=torch.long)
    return section_texts_padded, titles_padded, labels

class TitleClassifier(nn.Module):
    def __init__(self, elmo_model, hidden_dim):
        super(TitleClassifier, self).__init__()
        self.elmo_model = elmo_model
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2 * 2, hidden_dim),  
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, section_text, titles):
        _, _, section_embeddings = self.elmo_model(section_text)
        section_embeddings = section_embeddings.mean(dim=1)

        scores = []
        for title in titles:
            _, _, title_embeddings = self.elmo_model(title)
            title_embeddings = title_embeddings.mean(dim=1)
            combined_embeddings = torch.cat((section_embeddings, title_embeddings), dim=1)
            score = self.classifier(combined_embeddings)  
            scores.append(score.squeeze())
        
        scores = torch.stack(scores, dim=1).squeeze(-1) 
        return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hidden_dim = 150
vocab_size = len(token_to_index)  
elmo_model = ELMoLanguageModel(vocab_size, 300, hidden_dim).to(device) 
classifier = TitleClassifier(elmo_model, hidden_dim).to(device) 

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001) 

train_loader = DataLoader(TitleSelectionDataset(train_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(TitleSelectionDataset(test_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(TitleSelectionDataset(val_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)


wandb.init(project='ELMO_FOR_INDIAN_LANGUAGES_GROUP-30', name="Downtask2-Train_elmo_Marathi")

for epoch in range(5):
    classifier.train()
    total_loss = 0
    for section_texts, candidate_titles, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/10"):
        section_texts = section_texts.to(device)
        candidate_titles = [title.to(device) for title in candidate_titles]
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = classifier(section_texts, candidate_titles)
#         print(outputs.shape)
#         print(labels.shape)
        loss = criterion(outputs, labels)  
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    wandb.log({"train_loss": total_loss / len(train_loader)})
    print(f"Epoch {epoch+1}: Average Loss: {total_loss / len(train_loader)}")

def extract_features_elmo(dataloader, classifier, device):
    classifier.eval()  
    all_embeddings = []
    all_labels = []

    with torch.no_grad():
        for section_texts, candidate_titles, labels in tqdm(dataloader):
            section_texts = section_texts.to(device)
            candidate_titles = [title.to(device) for title in candidate_titles]
            outputs = classifier(section_texts, candidate_titles)
            all_embeddings.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_embeddings), np.array(all_labels)

# =============================================================================================

train_embeddings, train_labels = extract_features_elmo(train_loader, classifier, device)
predicted_labels_train = np.argmax(train_embeddings, axis=1)

print(classification_report(train_labels, predicted_labels_train))
cm = confusion_matrix(train_labels, predicted_labels_train)
plt.figure(figsize=(10, 7)) 
sns.set(font_scale=1.4)  
sns.heatmap(cm, annot=True, annot_kws={"size": 16}, fmt='g', cmap=plt.cm.Blues) 

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# =============================================================================================
test_embeddings, test_labels = extract_features(test_loader, classifier, device)
predicted_labels = np.argmax(test_embeddings, axis=1)

print(classification_report(test_labels, predicted_labels))
cm = confusion_matrix(test_labels, predicted_labels)
plt.figure(figsize=(10, 7)) 
sns.set(font_scale=1.4)  
sns.heatmap(cm, annot=True, annot_kws={"size": 16}, fmt='g', cmap=plt.cm.Blues)  

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

test_embeddings, test_labels = extract_features(test_loader, classifier_baseline, device)
predicted_labels = np.argmax(test_embeddings, axis=1)

print(classification_report(test_labels, predicted_labels))
cm = confusion_matrix(test_labels, predicted_labels)
plt.figure(figsize=(10, 7)) 
sns.set(font_scale=1.4)  
sns.heatmap(cm, annot=True, annot_kws={"size": 16}, fmt='g', cmap=plt.cm.Blues)  

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch 1/10: 100%|██████████| 164/164 [01:50<00:00,  1.48it/s]


Epoch 1: Average Loss: 1.3447618622605393


Epoch 2/10: 100%|██████████| 164/164 [01:51<00:00,  1.47it/s]


Epoch 2: Average Loss: 1.3080241251282576


Epoch 3/10: 100%|██████████| 164/164 [01:49<00:00,  1.49it/s]


Epoch 3: Average Loss: 1.2907281895963156


Epoch 4/10: 100%|██████████| 164/164 [01:51<00:00,  1.47it/s]


Epoch 4: Average Loss: 1.2787548296335267


Epoch 5/10: 100%|██████████| 164/164 [01:51<00:00,  1.47it/s]


Epoch 5: Average Loss: 1.2640570961847537


100%|██████████| 164/164 [01:50<00:00,  1.48it/s]


              precision    recall  f1-score   support

           0       0.38      0.38      0.38      2623
           1       0.37      0.37      0.37      2658
           2       0.38      0.37      0.37      2560
           3       0.37      0.37      0.37      2605

    accuracy                           0.37     10446
   macro avg       0.37      0.37      0.37     10446
weighted avg       0.37      0.37      0.37     10446



NameError: name 'confusion_matrix' is not defined

### 1.2 ELMO

In [None]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import wandb

class TitleSelectionDataset(Dataset):
    def __init__(self, df, ft_model, tokenizer, lang='mr'):
        self.df = df
        self.ft_model = ft_model
        self.tokenizer = tokenizer
        self.lang = lang

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        section_text = self.text_to_embedding(row['sectionText'])
        titles = [
            self.text_to_embedding(row['titleA']),
            self.text_to_embedding(row['titleB']),
            self.text_to_embedding(row['titleC']),
            self.text_to_embedding(row['titleD'])
        ]

        title_keys = ['titleA', 'titleB', 'titleC', 'titleD']
        correct_title_key = row['correctTitle']  
        label = title_keys.index(correct_title_key) 

        return section_text, titles, label

    def text_to_embedding(self, text):
        tokens = self.tokenizer(text, lang=self.lang)
        embeddings = [self.ft_model.get_word_vector(token) for token in tokens]
        embeddings_array = np.array(embeddings) 
        return torch.tensor(embeddings_array, dtype=torch.float)

def collate_fn(batch):
    section_texts, titles, labels = zip(*batch)
    section_texts_padded = pad_sequence(section_texts, batch_first=True, padding_value=0.0)
    titles_padded = [pad_sequence([t[i] for t in titles], batch_first=True, padding_value=0.0) for i in range(4)]
    labels = torch.tensor(labels, dtype=torch.long)
    return section_texts_padded, titles_padded, labels

class TitleClassifier(nn.Module):
    def __init__(self, input_dim):
        super(TitleClassifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim * 2, input_dim),  
            nn.ReLU(),
            nn.Linear(input_dim, 1)
        )

    def forward(self, section_text, titles):
        section_embeddings = section_text.mean(dim=1) 

        scores = []
        for title in titles:
            title_embeddings = title.mean(dim=1)  
            combined_embeddings = torch.cat((section_embeddings, title_embeddings), dim=1)
            score = self.classifier(combined_embeddings)  
            scores.append(score.squeeze())

        scores = torch.stack(scores, dim=1).squeeze(-1)
        return scores

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = 300  
classifier_baseline = TitleClassifier(input_dim).to(device) 

criterion = nn.CrossEntropyLoss()
optimizer_baseline = torch.optim.Adam(classifier_baseline.parameters(), lr=0.001)

train_loader = DataLoader(TitleSelectionDataset(train_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(TitleSelectionDataset(test_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(TitleSelectionDataset(val_df_preprocessed, ft_model, indic_tokenize.trivial_tokenize), batch_size=64, shuffle=True, collate_fn=collate_fn)

wandb.init(project='ELMO_FOR_INDIAN_LANGUAGES_GROUP-30', name="Downtask2-Train_baseline_Marathi")

for epoch in range(5):
    classifier_baseline.train()
    total_loss = 0
    for section_texts, candidate_titles, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/10"):
        section_texts = section_texts.to(device)
        candidate_titles = [title.to(device) for title in candidate_titles]
        labels = labels.to(device)
        
        optimizer_baseline.zero_grad()
        outputs = classifier_baseline(section_texts, candidate_titles)
        loss = criterion(outputs, labels)  
        loss.backward()
        optimizer_baseline.step()
        
        total_loss += loss.item()
    wandb.log({"train_loss": total_loss / len(train_loader)})
    print(f"Epoch {epoch+1}: Average Loss: {total_loss / len(train_loader)}")

def extract_features(dataloader, classifier, device):
    classifier.eval()  
    all_embeddings = []
    all_labels = []

    with torch.no_grad():
        for section_texts, candidate_titles, labels in tqdm(dataloader):
            section_texts = section_texts.to(device)
            candidate_titles = [title.to(device) for title in candidate_titles]
            outputs = classifier(section_texts, candidate_titles)
            all_embeddings.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_embeddings), np.array(all_labels)

train_embeddings, train_labels = extract_features(train_loader, classifier_baseline, device)
predicted_labels_train = np.argmax(train_embeddings, axis=1)

print(classification_report(train_labels, predicted_labels_train))
cm = confusion_matrix(train_labels, predicted_labels_train)
plt.figure(figsize=(10, 7)) 
sns.set(font_scale=1.4)  
sns.heatmap(cm, annot=True, annot_kws={"size": 16}, fmt='g', cmap=plt.cm.Blues) 

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()