In [None]:
# Install dependencies
!pip install -r requirements.txt

In [None]:
# Verify installation
import torch
import transformers
import pandas as pd
import sklearn
import numpy as np
import tensorflow as tf

# Report the version of every library imported above, in import order.
version_report = (
    ("Torch version:", torch),
    ("Transformers version:", transformers),
    ("Pandas version:", pd),
    ("Scikit-learn version:", sklearn),
    ("Numpy version:", np),
    ("TensorFlow version:", tf),
)
for heading, library in version_report:
    print(heading, library.__version__)

In [None]:
# Setting up paths and cloning repositories
import os
from pathlib import Path
import shutil

# Set up paths
base_dir = Path("/home/jovyan/transfer_learning")
nepali_data_dir = base_dir / "data/nepali-data"
hindi_data_dir = base_dir / "data/hindi-data"

# Clone repositories
!git clone https://github.com/oya163/nepali-ner.git --quiet && \
 git clone https://github.com/cfiltnlp/HiNER.git --quiet

# Create directories
nepali_data_dir.mkdir(parents=True, exist_ok=True)
hindi_data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Collect every (source, destination) pair first, then copy them in order:
# the single Nepali corpus file, followed by the three Hindi splits.
copy_jobs = [
    (base_dir / "nepali-ner/data/ebiquity_v2/stemmed/total.bio", nepali_data_dir / "nepali.txt"),
]
for set_type in ("train", "test", "validation"):
    copy_jobs.append(
        (base_dir / f"HiNER/data/collapsed/{set_type}.conll", hindi_data_dir / f"{set_type}.txt")
    )

for source_path, target_path in copy_jobs:
    shutil.copy(source_path, target_path)

In [None]:
# Renaming labels in the Nepali dataset to make it compatible with the Hindi dataset.
def replace_labels_in_file(input_path, output_path, label_mapping):
    """Rewrite a word/label file with its labels renamed.

    Each input line is split on whitespace. For a line with at least two
    tokens, the first token is kept as the word and the second token is
    replaced by ``label_mapping[token]`` when present (otherwise written
    unchanged); any further tokens are dropped. Word and label are written
    separated by a single space.

    Blank lines are written through as blank lines. They are the sentence
    separators that ``parse_sentences`` relies on, so they must not be
    dropped. A line holding a single token has no label column and is also
    written as an empty line.

    Args:
        input_path: File to read (``str`` or ``pathlib.Path``), UTF-8 encoded.
        output_path: File to write (``str`` or ``pathlib.Path``); overwritten
            if it already exists.
        label_mapping: Dict mapping an original label to its replacement.
    """
    updated_lines = []
    with open(input_path, 'r', encoding='utf-8') as source:
        for line in source:
            parts = line.split()
            if len(parts) > 1:
                new_label = label_mapping.get(parts[1], parts[1])
                updated_lines.append(' '.join([parts[0], new_label]))
            else:
                # Blank (or label-less) line: keep an empty line so the
                # sentence boundary survives in the output.
                updated_lines.append('')

    with open(output_path, 'w', encoding='utf-8') as target:
        target.write('\n'.join(updated_lines) + '\n')

# Short tags found in the Nepali file, paired with the long-form tags they
# are renamed to for compatibility with the Hindi dataset.
label_mapping = dict([
    ('B-LOC', 'B-LOCATION'),
    ('B-ORG', 'B-ORGANIZATION'),
    ('B-PER', 'B-PERSON'),
    ('I-LOC', 'I-LOCATION'),
    ('I-ORG', 'I-ORGANIZATION'),
    ('I-PER', 'I-PERSON'),
])

# Read the copied Nepali corpus and write a relabelled copy next to it.
nepali_source_file = nepali_data_dir / "nepali.txt"
nepali_relabelled_file = nepali_data_dir / "nepali_label_matched.txt"
replace_labels_in_file(nepali_source_file, nepali_relabelled_file, label_mapping)

In [None]:
from pathlib import Path
from collections import defaultdict
import random

# Seed the module-level random generator so the shuffles performed while
# splitting the data start from a fixed state.
random.seed(42)

# Relabelled Nepali NER file written by the label-renaming step
file_path = nepali_data_dir / "nepali_label_matched.txt"

# Load every line of the file (line endings included) into a list
with open(file_path, 'r', encoding='utf-8') as file:
    lines = list(file)

# Function to parse sentences and their labels from the file
def parse_sentences(lines):
    """Group word/tag lines into sentences.

    A line that is empty after stripping whitespace ends the sentence being
    collected; consecutive blank lines do not produce empty sentences. Every
    other line must split on whitespace into exactly two tokens, otherwise
    the tuple unpacking raises ``ValueError``.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
    """
    parsed = []
    pending = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if stripped:
            word, tag = stripped.split()
            pending.append((word, tag))
            continue
        # Blank line: flush the sentence collected so far, if there is one.
        if pending:
            parsed.append(pending)
            pending = []
    # The input may end without a trailing blank line.
    if pending:
        parsed.append(pending)
    return parsed

# Group the raw lines into sentences; each entry of `sentences` is a list of
# (word, tag) tuples as returned by parse_sentences.
sentences = parse_sentences(lines)

# Function to distribute sentences into train, test, and validation sets based on label distribution
def distribute_sentences(sentences, train_ratio=0.7, test_ratio=0.2, valid_ratio=0.1):
    """Split sentences into train, test and validation lists, stratified by label.

    Every sentence is assigned to exactly ONE stratum, so it ends up in
    exactly one of the returned lists (no duplicates and no overlap between
    train, test and validation). The stratum is the least frequent non-'O'
    label the sentence contains, where frequency is the number of sentences
    containing that label and ties are broken by label string order.
    Sentences without any non-'O' tag form the 'O' stratum.

    Each stratum is shuffled with the module-level ``random`` generator and
    sliced into train / test / validation parts; the three result lists are
    then shuffled again. Strata are processed in order of first appearance in
    ``sentences``, so for a given seed and input the result is repeatable.

    Args:
        sentences: List of sentences, each a list of ``(word, tag)`` pairs.
            The list itself is not modified.
        train_ratio: Fraction of each stratum placed in train (rounded down).
        test_ratio: Fraction of each stratum placed in test (rounded down).
        valid_ratio: Kept for interface compatibility; it is not used in the
            computation because validation receives whatever remains of each
            stratum after the train and test slices.

    Returns:
        A ``(train, test, valid)`` tuple of sentence lists.
    """
    # Pass 1: find the label set of every sentence and count, per label, how
    # many sentences contain it.
    sentence_labels = []
    label_counts = defaultdict(int)
    for sentence in sentences:
        # Pure 'O' sentences are categorised under 'O'.
        labels = {tag for _, tag in sentence if tag != 'O'} or {'O'}
        sentence_labels.append(labels)
        for label in labels:
            label_counts[label] += 1

    # Pass 2: put each sentence into a single stratum, keyed by its rarest
    # label. The (count, label) key makes the choice independent of set
    # iteration order.
    strata = defaultdict(list)
    for sentence, labels in zip(sentences, sentence_labels):
        rarest = min(labels, key=lambda label: (label_counts[label], label))
        strata[rarest].append(sentence)

    # Split every stratum into train, test and validation slices.
    train, test, valid = [], [], []
    for sents in strata.values():
        random.shuffle(sents)
        n_total = len(sents)
        n_train = int(n_total * train_ratio)
        n_test = int(n_total * test_ratio)

        train.extend(sents[:n_train])
        test.extend(sents[n_train:n_train + n_test])
        valid.extend(sents[n_train + n_test:])

    # Shuffle so sentences of one stratum are not stored contiguously.
    random.shuffle(train)
    random.shuffle(test)
    random.shuffle(valid)

    return train, test, valid

# Split the parsed sentences into train / test / validation sets using the
# function's default ratios (train_ratio=0.7, test_ratio=0.2, valid_ratio=0.1).
train_set, test_set, validation_set = distribute_sentences(sentences)

# Function to write datasets to files with tab-separated values
def write_to_file(sentences, filename):
    """Write sentences to a UTF-8 text file, one token per line.

    Each ``(word, label)`` pair becomes a ``word<TAB>label`` line, and every
    sentence is followed by one empty line. An existing file is overwritten.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, label)`` pairs.
        filename: Path of the file to write.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for sentence in sentences:
            # Tab (not space) separates the word from its label.
            out.writelines(f"{word}\t{label}\n" for word, label in sentence)
            # Blank line terminates the sentence.
            out.write("\n")

# Persist each split next to the Nepali source data, in train / test /
# validation order.
split_outputs = (
    (train_set, 'train.txt'),
    (test_set, 'test.txt'),
    (validation_set, 'validation.txt'),
)
for split_sentences, split_filename in split_outputs:
    write_to_file(split_sentences, nepali_data_dir / split_filename)

In [None]:
######Experiments##################

In [None]:
# Monolingual NER on Hindi and Nepali datasets:
# trainer.py is invoked once per (dataset, model) combination.
model_names = [
    'google/muril-base-cased', 
    'distilbert/distilbert-base-multilingual-cased', 
    'google-bert/bert-base-multilingual-cased', 
    'google/rembert'
]

datasets = ['hindi', 'nepali']

# Explicit dataset -> directory lookup. This replaces resolving the variable
# by name through locals(), which only works while this code runs at the top
# level of the notebook.
data_dirs = {'hindi': hindi_data_dir, 'nepali': nepali_data_dir}

for DATA in datasets:
    data_dir = data_dirs[DATA]
    for MODEL_NAME in model_names:
        # Extracting the first word after '/' to create folder to store model and results
        # (e.g. 'google/muril-base-cased' -> 'muril')
        MODEL_TYPE = MODEL_NAME.split('/')[1].split('-')[0]

        output_dir = f"{base_dir}/output/{DATA}/{MODEL_TYPE}/output"
        logging_dir = f"{base_dir}/output/{DATA}/{MODEL_TYPE}/logs"
        model_dir = f"{base_dir}/output/{DATA}/{MODEL_TYPE}/model"

        # Create directories if they don't exist
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(logging_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        # Run the training script; its standard output is redirected to
        # output.txt inside output_dir. The trailing backslashes join the
        # lines, so the shell receives a single command line.
        exit_status = os.system(f'''
        python trainer.py \
        --train "{data_dir}/train.txt" \
        --validation "{data_dir}/validation.txt" \
        --test "{data_dir}/test.txt" \
        --model_name "{MODEL_NAME}" \
        --output_dir "{output_dir}" \
        --logging_dir "{logging_dir}" \
        --save_pretrained "{model_dir}" \
        > "{output_dir}/output.txt"
        ''')

        # os.system does not raise on failure; surface a non-zero status so a
        # failed run is not mistaken for a finished one. The loop continues
        # with the remaining combinations.
        if exit_status != 0:
            print(f"WARNING: trainer.py returned status {exit_status} "
                  f"for dataset '{DATA}' and model '{MODEL_NAME}' "
                  f"(stdout in {output_dir}/output.txt)")

In [None]:
# Cross-lingual NER on Hindi and Nepali datasets:
# trainer.py is started from the model saved for the OTHER language and
# trained on TRAIN_DATA, once per (dataset, model) combination.
model_names = [
    'google/muril-base-cased', 
    'distilbert/distilbert-base-multilingual-cased', 
    'google-bert/bert-base-multilingual-cased', 
    'google/rembert'
]

datasets = ['hindi', 'nepali']

# Explicit dataset -> directory lookup. This replaces resolving the variable
# by name through locals(), which only works while this code runs at the top
# level of the notebook.
data_dirs = {'hindi': hindi_data_dir, 'nepali': nepali_data_dir}

for TRAIN_DATA in datasets:
    # PRETRAINED_MODEL names the language whose saved model is loaded;
    # DATASET_TYPE ("<pretrained>_<train>") names the output folder.
    if TRAIN_DATA == 'hindi':
        PRETRAINED_MODEL = 'nepali'
        DATASET_TYPE = 'nepali_hindi'
    elif TRAIN_DATA == 'nepali':
        PRETRAINED_MODEL = 'hindi'
        DATASET_TYPE = 'hindi_nepali'

    train_data_dir = data_dirs[TRAIN_DATA]

    for MODEL_NAME in model_names:
        # First word after '/' in the model name, e.g. 'muril'
        MODEL_TYPE = MODEL_NAME.split('/')[1].split('-')[0]

        # Model directory written by the monolingual run of the other
        # language, plus the output locations for this run
        local_model_path = f"{base_dir}/output/{PRETRAINED_MODEL}/{MODEL_TYPE}/model"
        output_dir = f"{base_dir}/output/{DATASET_TYPE}/{MODEL_TYPE}/output"
        logging_dir = f"{base_dir}/output/{DATASET_TYPE}/{MODEL_TYPE}/logs"
        model_dir = f"{base_dir}/output/{DATASET_TYPE}/{MODEL_TYPE}/model"

        # Create directories if they don't exist
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(logging_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        # Run the training script; its standard output is redirected to
        # output.txt inside output_dir. The trailing backslashes join the
        # lines, so the shell receives a single command line.
        exit_status = os.system(f'''
        python trainer.py \
        --use_local_model True \
        --local_model_path "{local_model_path}" \
        --model_name "{MODEL_NAME}" \
        --output_dir "{output_dir}" \
        --logging_dir "{logging_dir}" \
        --save_pretrained "{model_dir}" \
        --train "{train_data_dir}/train.txt" \
        --validation "{train_data_dir}/validation.txt" \
        --test "{train_data_dir}/test.txt" > "{output_dir}/output.txt"
        ''')

        # os.system does not raise on failure; surface a non-zero status so a
        # failed run is not mistaken for a finished one. The loop continues
        # with the remaining combinations.
        if exit_status != 0:
            print(f"WARNING: trainer.py returned status {exit_status} "
                  f"for '{DATASET_TYPE}' and model '{MODEL_NAME}' "
                  f"(stdout in {output_dir}/output.txt)")