# 1. Setup

In [None]:
import pandas as pd
import re
import os

from tqdm.auto import tqdm
tqdm.pandas()

# Project root; every CSV read or written by this notebook lives under <cwd>/Data.
cwd = "~/Desktop/chaii"

# Load the training set and replace the `language` column with `src`, which
# keeps only the first two characters of each language value.
data = (
    pd.read_csv(os.path.join(cwd, 'Data/train.csv'))
    .assign(src=lambda frame: frame['language'].str[:2])
    .drop(columns=['language'])
)
# For a quick smoke test, truncate here: data = data.head()
print(data.shape)
data.head()

In [None]:
import nltk
# Fetch the tokenizer and POS-tagger resources; the noun-filtering helpers
# further down call nltk.word_tokenize and nltk.pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## 1.1. Drive

## 1.2 GCP

## 1.3. Indic NLP

In [None]:
# Install the Indic NLP library and clone its resources and source repositories
# into the current working directory.
!pip install indic-nlp-library
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git

In [None]:
import os
import sys
from indicnlp import common

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"indic_nlp_resources"

# Add library to Python path. os.path.join uses the platform's separator; the
# previously hard-coded backslash only formed a valid directory path on Windows.
sys.path.append(os.path.join(INDIC_NLP_LIB_HOME, 'src'))

# Point the library at the resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
from indicnlp.tokenize import sentence_tokenize

## 1.4. Indic Trans - Translate

In [None]:
# clone the repo for running evaluation
!git clone https://github.com/AI4Bharat/indicTrans.git
%cd indicTrans
# clone requirements repositories (inside the indicTrans checkout)
!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!git clone https://github.com/rsennrich/subword-nmt.git
# go back to the directory the notebook started in
%cd ..

In [None]:
# Install the necessary libraries
!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
! pip install mosestokenizer subword-nmt
# Install fairseq from source (editable install of whatever the default branch is;
# the pinned commit below is disabled)
!git clone https://github.com/pytorch/fairseq.git
%cd fairseq
# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d
!pip install --editable ./

%cd ..

In [None]:
# download the indictrans model archives and unpack them into the current directory

# downloading the indic-en model
!wget https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip
!unzip indic-en.zip

# downloading the en-indic model
!wget https://storage.googleapis.com/samanantar-public/V0.2/models/en-indic.zip
!unzip en-indic.zip

# # downloading the indic-indic model (disabled; not used by this notebook)
#!wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip
#!unzip m2m.zip

In [None]:
# Switch into the indicTrans checkout; the model-loading cell below uses
# expdir paths written relative to it ('../indic-en', '../en-indic').
%cd indicTrans

In [None]:
from indicTrans.inference.engine import Model

# One model per direction. The expdir values are relative paths, so they
# presumably depend on the working directory set by the preceding `%cd` cell.
indic2en_model = Model(expdir='../indic-en')
en2indic_model = Model(expdir='../en-indic')

In [None]:
def translate_text_indic(text, src="en", dst="ta"):
    """Translate ``text`` from ``src`` to ``dst`` with the IndicTrans models.

    Args:
        text: String to translate.
        src: Source language code. ``'en'`` selects ``en2indic_model``; any
            other value selects ``indic2en_model``.
        dst: Target language code, forwarded to the model.

    Returns:
        ``text`` itself when ``text.isnumeric()`` is true, otherwise the value
        returned by the selected model's ``translate_paragraph``.
    """
    # Purely numeric strings are passed through without touching a model.
    if text.isnumeric():
        return text

    chosen_model = en2indic_model if src == 'en' else indic2en_model
    return chosen_model.translate_paragraph(text, src, dst)

In [None]:
# Smoke test: translate a few Tamil sentences to English in one batch.
ta_sents = ['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது',
            "இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.",
            'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.']


indic2en_model.batch_translate(ta_sents, 'ta', 'en')

In [None]:
# Same check through the paragraph-level entry point that translate_text_indic uses.
indic2en_model.translate_paragraph(ta_sents[0], 'ta', 'en')

## 1.5 Indic Trans - Transliterate

In [None]:
# Fetch the libindic transliteration package (provides `indictrans.Transliterator`).
!git clone https://github.com/libindic/indic-trans.git

In [None]:
# Install the transliteration package from the cloned checkout.
%cd indic-trans
!pip install -r requirements.txt
!python setup.py install

In [None]:
# Leave the indic-trans checkout again.
%cd ..

In [None]:
from indictrans import Transliterator
# Smoke test for transliteration with source='eng' and target='tam'.
trn = Transliterator(source='eng', target='tam', build_lookup=True)
# NOTE(review): the names are misleading - `hin` holds the English input text
# and `eng` receives the output of the eng -> tam transliterator.
hin = """Sheikh Hamdan bin Zayed Al Nahyan, Ruler's Representative in Al Dhafra Region, attended the opening of the International Atomic Energy Agency's (IAEA) ConvEx-3 'Barakah UAE', which is aimed to test the global emergency response system for a severe accident simulated at the Barakah nuclear power plant.

During the exercise, 75 member states and 12 international organisations are participating to evaluate their emergency response actions in order to identify good practices and highlight areas needing improvement."""
eng = trn.transform(hin)
print(eng)

# 2. Translate

## 2.1 To English

In [None]:
# Select the translation backend that `translate_text` refers to in the row
# helpers below.
translator = 'indic'
if translator == 'gcp':
    # NOTE(review): translate_text_gcp is not defined anywhere in the visible
    # notebook (section 1.2 is empty), so this branch raises NameError as-is.
    translate_text = translate_text_gcp
elif translator == 'indic':
    translate_text = translate_text_indic
else:
    raise ValueError(f"Unknown translator {translator!r}; expected 'gcp' or 'indic'")

    
def clean_text(text):
    """Reduce ``text`` to its noun tokens.

    The text is tokenized and POS-tagged with NLTK; tokens whose tag begins
    with ``'N'`` are kept and joined with single spaces.

    Returns:
        The space-joined nouns, or ``text`` unchanged when no token is tagged
        as a noun.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [token for token, pos in tagged_tokens if pos[0] == 'N']
    return " ".join(nouns) if nouns else text

In [None]:
def translate_row_en(row, target_lang):
    """Translate one QA row and re-anchor its answer in the translated context.

    The question and answer are translated directly. The context is split into
    sentences, each sentence is translated, and the translations are joined
    with single spaces. The sentence covering the source ``answer_start`` is
    searched (case-insensitively) for the translated answer.

    Args:
        row: Mapping/Series with keys ``question``, ``answer_text``,
            ``context``, ``answer_start`` and ``src``. It is modified in place.
        target_lang: Language code passed to ``translate_text`` as ``dst``.

    Returns:
        The same ``row`` with translated ``question``, ``answer_text`` and
        ``context``, plus:

        * ``exact_match`` = 1 and ``answer_start`` set to the answer's offset
          when the translated answer occurs in the selected sentence;
        * ``exact_match`` = 0 when only the ``clean_text`` version of the answer
          occurs there (``answer_text_cleaned`` holds that version);
        * ``exact_match`` = -1 when neither occurs (``answer_start`` then points
          at the start of the selected sentence) or when no sentence covers the
          source ``answer_start`` (``answer_start`` is left untouched).
    """
    src_lang = row['src']
    source_context = row['context']
    source_answer_start = row['answer_start']

    row['question'] = translate_text(text=row['question'], src=src_lang, dst=target_lang)
    row['answer_text'] = translate_text(text=row['answer_text'], src=src_lang, dst=target_lang)

    # split into sentences, translate and identify the sentence with the answer
    sentences = sentence_tokenize.sentence_split(source_context, src_lang, delim_pat='auto')
    source_cursor = 0  # offset in source_context just past the last sentence seen
    translated_context = ""
    start_found = False
    for sentence in sentences:
        result = translate_text(text=sentence, src=src_lang, dst=target_lang)

        if not start_found:
            # Locate the sentence in the source text rather than summing
            # sentence lengths: any separator characters the splitter drops
            # would make a plain sum fall behind the real offsets.
            located_at = source_context.find(sentence, source_cursor)
            if located_at != -1:
                source_cursor = located_at + len(sentence)
            else:
                # Sentence text not found verbatim; fall back to length counting.
                source_cursor += len(sentence)

            if source_answer_start < source_cursor:  # this sentence has the answer_text
                start_in_sent = result.lower().find(row['answer_text'].lower())
                if start_in_sent != -1:  # exact match of the translated answer
                    row['exact_match'] = 1
                    row['answer_text_cleaned'] = row['answer_text']
                    row['answer_start'] = len(translated_context) + start_in_sent
                else:  # retry with the noun-only answer
                    row['answer_text_cleaned'] = clean_text(row['answer_text'])
                    start_in_sent_cleaned = result.lower().find(row['answer_text_cleaned'].lower())
                    if start_in_sent_cleaned != -1:
                        row['exact_match'] = 0
                        row['answer_start'] = len(translated_context) + start_in_sent_cleaned
                    else:
                        row['exact_match'] = -1
                        row['answer_start'] = len(translated_context)
                start_found = True

        translated_context = translated_context + result + " "

    if not start_found:
        # No sentence covered the answer offset, so answer_start still refers
        # to the untranslated context. Mark the row as unmatched.
        row['exact_match'] = -1

    row['context'] = translated_context.strip()

    return row

In [None]:
# Build the English-translated copy of the training data. Every row starts with
# exact_match = 1 and answer_text_cleaned = answer_text; translate_row_en then
# rewrites the row and downgrades exact_match where the answer is not found.
trans_aug = {}
target_lang = 'en'
trans_aug[target_lang] = (
    data.copy()
    .assign(
        exact_match=1,
        answer_text_cleaned=lambda frame: frame['answer_text'].copy(),
    )
    .progress_apply(lambda row: translate_row_en(row, target_lang), axis=1)
    .assign(src=target_lang)
)

In [None]:
# Distribution of match outcomes: 1 = exact, 0 = cleaned answer matched, -1 = no match.
trans_aug['en']['exact_match'].value_counts()

In [None]:
# Drop rows whose answer could not be located in the translated context.
# `keep_mask` avoids shadowing the builtin `filter`, and `.copy()` gives an
# independent frame so the column assignments in the following cells do not
# operate on a slice of the unfiltered frame.
keep_mask = trans_aug['en']['exact_match'] != -1
trans_aug['en'] = trans_aug['en'][keep_mask].copy()

In [None]:
# answer_text_final: the translated answer where exact_match == 1, otherwise
# the cleaned answer.
trans_aug['en']['answer_text_final'] = trans_aug['en']['answer_text'].where(trans_aug['en']['exact_match'] == 1, trans_aug['en']['answer_text_cleaned'])

In [None]:
# Preview the translated rows.
trans_aug['en'].head()

In [None]:
# Save the frame before the helper columns are dropped in the next cell.
trans_aug['en'].to_csv(os.path.join(cwd, f"Data/raw_train_en.csv"), index=False)

In [None]:
# Collapse to the export schema: answer_text takes the final answer, the helper
# columns are dropped, and `src` is renamed back to `language` before writing.
trans_aug['en']['answer_text'] = trans_aug['en']['answer_text_final'].copy()
trans_aug['en'] = trans_aug['en'].drop(columns=['answer_text_cleaned', 'answer_text_final'])
trans_aug['en'] = trans_aug['en'].rename(columns={'src': 'language'})
trans_aug['en'].to_csv(os.path.join(cwd, f"Data/train_en.csv"), index=False)

In [None]:
# Display the exported English frame.
trans_aug['en']

## 2.2 To other languages

In [None]:
# Undo the `src` -> `language` rename done for the CSV export, because
# translate_row reads row['src'].
trans_aug['en'] = trans_aug['en'].rename(columns={'language': 'src'})

In [None]:
def translate_row(row, target_lang):
    """Translate one QA row into ``target_lang`` and re-anchor its answer.

    The question and answer are translated directly; the context is translated
    sentence by sentence and re-joined with single spaces. The sentence that
    covers the source ``answer_start`` is searched (case-insensitively) for
    the translated answer.

    Args:
        row: Mapping/Series with keys ``question``, ``answer_text``,
            ``context``, ``answer_start`` and ``src``. It is modified in place.
        target_lang: Language code passed to ``translate_text`` as ``dst``.

    Returns:
        The same ``row`` with translated text fields and ``exact_match`` set to
        1 (translated answer found), 0 (only the ``clean_text`` version found,
        stored in ``answer_text_cleaned``) or -1 (not found, or no sentence
        covers the source ``answer_start``).

    NOTE(review): ``clean_text`` relies on NLTK tokenizing/tagging; confirm it
    is meaningful for the non-English target text it receives here.
    """
    origin_lang = row['src']
    original_context = row['context']
    original_start = row['answer_start']

    row['question'] = translate_text(text=row['question'], src=origin_lang, dst=target_lang)
    row['answer_text'] = translate_text(text=row['answer_text'], src=origin_lang, dst=target_lang)

    # split into sentences, translate and identify the sentence with the answer
    sentences = sentence_tokenize.sentence_split(original_context, origin_lang, delim_pat='auto')
    consumed = 0  # offset in original_context just past the last sentence seen
    translated_context = ""
    start_found = False
    for sentence in sentences:
        result = translate_text(text=sentence, src=origin_lang, dst=target_lang)

        if not start_found:
            # Track the true end offset of each sentence in the source text.
            # Summing len(sentence) alone ignores any separator characters the
            # splitter removed and drifts behind the real offsets.
            hit = original_context.find(sentence, consumed)
            consumed = hit + len(sentence) if hit != -1 else consumed + len(sentence)

            if original_start < consumed:  # this sentence has the answer_text
                haystack = result.lower()
                start_in_sent = haystack.find(row['answer_text'].lower())
                if start_in_sent != -1:  # exact match of the translated answer
                    row['exact_match'] = 1
                    row['answer_text_cleaned'] = row['answer_text']
                    row['answer_start'] = len(translated_context) + start_in_sent
                else:  # retry with the noun-only answer
                    row['answer_text_cleaned'] = clean_text(row['answer_text'])
                    start_in_sent_cleaned = haystack.find(row['answer_text_cleaned'].lower())
                    if start_in_sent_cleaned != -1:
                        row['exact_match'] = 0
                        row['answer_start'] = len(translated_context) + start_in_sent_cleaned
                    else:
                        row['exact_match'] = -1
                        row['answer_start'] = len(translated_context)
                start_found = True

        translated_context = translated_context + result + " "

    if not start_found:
        # answer_start still refers to the untranslated context; flag the row
        # so the caller's `exact_match != -1` filter removes it.
        row['exact_match'] = -1

    row['context'] = translated_context.strip()

    return row

In [None]:
# Translate the English rows into each target language, record the match
# statistics, drop unmatched rows and write one CSV per language.
meta_data = {}
target_langs = ['ta', 'ml', 'te', 'hi', 'mr', 'bn']
for target_lang in tqdm(target_langs):
    # Seed from the English frame; translate_row downgrades exact_match per row.
    trans_aug[target_lang] = trans_aug['en'].copy()
    trans_aug[target_lang]['exact_match'] = 1
    trans_aug[target_lang]['answer_text_cleaned'] = trans_aug[target_lang]['answer_text'].copy()
    trans_aug[target_lang] = trans_aug[target_lang].progress_apply(lambda row: translate_row(row, target_lang), axis=1)
    trans_aug[target_lang]['src'] = target_lang

    meta_data[target_lang] = trans_aug[target_lang]['exact_match'].value_counts()
    # `keep_mask` avoids shadowing the builtin `filter`; `.copy()` detaches the
    # filtered rows so the column assignments below act on an independent frame.
    keep_mask = trans_aug[target_lang]['exact_match'] != -1
    trans_aug[target_lang] = trans_aug[target_lang][keep_mask].copy()

    # Final answer: the translated answer for exact matches, else the cleaned one.
    trans_aug[target_lang]['answer_text_final'] = trans_aug[target_lang]['answer_text'].where(trans_aug[target_lang]['exact_match'] == 1, trans_aug[target_lang]['answer_text_cleaned'])

    trans_aug[target_lang]['answer_text'] = trans_aug[target_lang]['answer_text_final'].copy()
    trans_aug[target_lang] = trans_aug[target_lang].drop(columns=['answer_text_cleaned', 'answer_text_final'])
    trans_aug[target_lang] = trans_aug[target_lang].rename(columns={'src': 'language'})
    trans_aug[target_lang].to_csv(os.path.join(cwd, f"Data/train_{target_lang}.csv"), index=False)


In [None]:
# Per-language counts of each exact_match value; outcomes absent for a
# language become 0.
meta_df = pd.DataFrame(meta_data).fillna(0)
meta_df

## 2.3 Format output

In [None]:
# Merge the original training data with every translated CSV.
# NOTE(review): translated rows carry the 2-character codes derived in the
# setup cell in `language`, while the original rows keep whatever train.csv
# stores - confirm that mixing both in one column is intended.
data_dir = os.path.join(cwd, 'Data')
final_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
final_df['is_original'] = True

langs = ['en', 'ta', 'ml', 'te', 'hi', 'mr', 'bn']
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
frames = [final_df]
for lang in langs:
    lang_df = pd.read_csv(os.path.join(data_dir, f'train_{lang}.csv'))
    lang_df = lang_df.drop(columns=['exact_match'])
    lang_df['is_original'] = False
    frames.append(lang_df)
final_df = pd.concat(frames, ignore_index=True)

final_df

In [None]:
# Row count per value of `language` in the merged frame.
final_df['language'].value_counts()

In [None]:
# Write the merged original + translated data set.
final_df.to_csv(os.path.join(cwd, f"Data/train_trans_augmented.csv"), index=False)

# 3. Transliterate

In [None]:
import pandas as pd
import re
import os

# Reload the original training data for the transliteration pass. This
# repeats the setup cell and rebinds `cwd` and `data`.
cwd = "~/Desktop/chaii"

data = pd.read_csv(os.path.join(cwd, 'Data/train.csv'))
#data = data.head()
# `src` keeps only the first two characters of the language value.
data['src'] = data['language'].str[:2]
data = data.drop(columns=['language'])
print(data.shape)
data.head()

In [None]:
# Two-letter language code -> three-letter code passed to Transliterator.
lang_code_map = dict(
    en='eng',
    ta='tam',
    ml='mal',
    te='tel',
    hi='hin',
    mr='mar',
    bn='ben',
)

In [None]:
def remove_non_nouns(text):
    """Keep only the tokens of ``text`` that NLTK tags as nouns.

    A token counts as a noun when its POS tag starts with ``'N'``.

    Returns:
        The kept tokens joined by single spaces; if no token qualifies, the
        original ``text`` is returned unchanged.
    """
    kept = []
    for token, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        if pos[0] == 'N':
            kept.append(token)

    if not kept:
        return text
    return " ".join(kept)

In [None]:
def transliterate_text(text, src="ta"):
    """Transliterate ``text`` with the transliterator registered for ``src``.

    The module-level dict ``tlits`` maps a source language code to a
    transliterator object (its target language is already fixed) or to
    ``None``.

    Returns:
        ``text`` unchanged when it is numeric or when ``tlits[src]`` is
        ``None``; otherwise the result of ``tlits[src].transform(text)``.
    """
    # Numeric strings are passed through before any lookup happens.
    if text.isnumeric():
        return text

    transliterator = tlits[src]
    if transliterator is None:
        return text
    return transliterator.transform(text)

In [None]:
def transliterate_row(row, target_lang):
    """Transliterate one QA row and re-anchor its answer in the new context.

    The question and answer are transliterated directly; the context is
    transliterated sentence by sentence and re-joined with single spaces. The
    sentence covering the source ``answer_start`` is searched
    (case-insensitively) for the transliterated answer.

    Args:
        row: Mapping/Series with keys ``question``, ``answer_text``,
            ``context``, ``answer_start`` and ``src``. It is modified in place.
        target_lang: Unused here; ``transliterate_text`` picks the
            transliterator by source language only. Kept for the callers that
            pass it.

    Returns:
        The same ``row`` with transliterated text fields and ``exact_match``
        set to 1 (answer found), 0 (only the ``remove_non_nouns`` version
        found, stored in ``answer_text_cleaned``) or -1 (not found, or no
        sentence covers the source ``answer_start``).
    """
    src_lang = row['src']
    source_context = row['context']
    source_answer_start = row['answer_start']

    row['question'] = transliterate_text(text=row['question'], src=src_lang)
    row['answer_text'] = transliterate_text(text=row['answer_text'], src=src_lang)

    # split into sentences, transliterate and identify the sentence with the answer
    sentences = sentence_tokenize.sentence_split(source_context, src_lang, delim_pat='auto')
    source_cursor = 0  # offset in source_context just past the last sentence seen
    transliterated_context = ""
    start_found = False
    for sentence in sentences:
        result = transliterate_text(text=sentence, src=src_lang)

        if not start_found:
            # Use each sentence's real end offset in the source text. A plain
            # sum of sentence lengths ignores any separator characters the
            # splitter removed and falls behind the true offsets.
            located_at = source_context.find(sentence, source_cursor)
            if located_at != -1:
                source_cursor = located_at + len(sentence)
            else:
                source_cursor += len(sentence)

            if source_answer_start < source_cursor:  # this sentence has the answer_text
                start_in_sent = result.lower().find(row['answer_text'].lower())
                if start_in_sent != -1:  # exact match of the transliterated answer
                    row['exact_match'] = 1
                    row['answer_text_cleaned'] = row['answer_text']
                    row['answer_start'] = len(transliterated_context) + start_in_sent
                else:  # retry with the noun-only answer
                    row['answer_text_cleaned'] = remove_non_nouns(row['answer_text'])
                    start_in_sent_cleaned = result.lower().find(row['answer_text_cleaned'].lower())
                    if start_in_sent_cleaned != -1:
                        row['exact_match'] = 0
                        row['answer_start'] = len(transliterated_context) + start_in_sent_cleaned
                    else:
                        row['exact_match'] = -1
                        row['answer_start'] = len(transliterated_context)
                start_found = True

        transliterated_context = transliterated_context + result + " "

    if not start_found:
        # answer_start still refers to the original context; flag the row so
        # the caller's `exact_match != -1` filter removes it.
        row['exact_match'] = -1

    row['context'] = transliterated_context.strip()

    return row

In [None]:
# Transliterate the original data into each target script, record the match
# statistics, drop unmatched rows and write one CSV per target language.
meta_data = {}
trans_aug = {}
source_langs = ['ta', 'hi']
target_langs = ['en', 'ta', 'ml', 'te', 'hi', 'mr', 'bn']
for target_lang in tqdm(target_langs):

    # transliterate_text reads this module-level dict: one transliterator per
    # source language, or None when source and target are the same.
    tlits = {}
    for source_lang in source_langs:
        if source_lang != target_lang:
            tlits[source_lang] = Transliterator(source=lang_code_map[source_lang], target=lang_code_map[target_lang], build_lookup=True)
        else:
            tlits[source_lang] = None

    trans_aug[target_lang] = data.copy()
    trans_aug[target_lang]['exact_match'] = 1
    trans_aug[target_lang]['answer_text_cleaned'] = trans_aug[target_lang]['answer_text'].copy()
    trans_aug[target_lang] = trans_aug[target_lang].progress_apply(lambda row: transliterate_row(row, target_lang), axis=1)
    trans_aug[target_lang]['tgt'] = target_lang

    meta_data[target_lang] = trans_aug[target_lang]['exact_match'].value_counts()
    # `keep_mask` avoids shadowing the builtin `filter`; `.copy()` detaches the
    # filtered rows so the column assignments below act on an independent frame.
    keep_mask = trans_aug[target_lang]['exact_match'] != -1
    trans_aug[target_lang] = trans_aug[target_lang][keep_mask].copy()

    # Final answer: the transliterated answer for exact matches, else the cleaned one.
    trans_aug[target_lang]['answer_text_final'] = trans_aug[target_lang]['answer_text'].where(trans_aug[target_lang]['exact_match'] == 1, trans_aug[target_lang]['answer_text_cleaned'])
    trans_aug[target_lang]['answer_text'] = trans_aug[target_lang]['answer_text_final'].copy()
    trans_aug[target_lang] = trans_aug[target_lang].drop(columns=['answer_text_cleaned', 'answer_text_final'])
    trans_aug[target_lang].to_csv(os.path.join(cwd, f"Data/train_{target_lang}_tlit.csv"), index=False)


In [None]:
# Per-target-language counts of each exact_match value; outcomes absent for a
# language become 0.
meta_df = pd.DataFrame(meta_data).fillna(0)
meta_df

In [None]:
# Merge the original training data with every transliterated CSV.
# NOTE(review): for the original rows `src`/`tgt` hold train.csv's `language`
# values as-is, whereas the transliterated rows hold 2-character codes -
# confirm that mixing both in one column is intended.
data_dir = os.path.join(cwd, 'Data')
final_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
final_df = final_df.rename(columns={'language':'src'})
final_df['tgt'] = final_df['src'].copy()
final_df['is_original'] = True

langs = ['en', 'ta', 'ml', 'te', 'hi', 'mr', 'bn']
# Collect the pieces and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
frames = [final_df]
for lang in langs:
    lang_df = pd.read_csv(os.path.join(data_dir, f'train_{lang}_tlit.csv'))
    lang_df = lang_df.drop(columns=['exact_match'])
    lang_df['is_original'] = False
    frames.append(lang_df)
final_df = pd.concat(frames, ignore_index=True)

final_df

In [None]:
# Row count per value of `tgt` in the merged frame.
final_df['tgt'].value_counts()

In [None]:
# Write the merged original + transliterated data set.
final_df.to_csv(os.path.join(cwd, f"Data/train_trans_augmented_tlit.csv"), index=False)