In [5]:
import glob
import re

# Generating parallel sentences

In [6]:
# Directory that is globbed for the per-language sentence files; the language
# name is taken from each file name's prefix before the first "_".
sentences_path = "word_aligner/All_sentence_files"
# Directory that is globbed for the pickled word-alignment files.
alignments_path = "alignments"

In [7]:
def read_file(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line of the file, each stripped of leading
        and trailing whitespace (including the newline). Blank lines are kept
        as empty strings; an empty file yields an empty list.
    """
    # The context manager closes the handle even if reading/decoding raises.
    with open(path, 'r', encoding='utf8') as f:
        return [line.strip() for line in f]

In [8]:
# Map each language name to the list of lines read from its sentence file.
sentences = {}
for sent_file in glob.glob(sentences_path + "/*"):
    # Last path component, e.g. "<lang>_...": the language is the prefix before the first "_".
    file_name = sent_file.rsplit("/", 1)[-1]
    lang_name = file_name.partition("_")[0]
    sentences[lang_name] = read_file(sent_file)

In [9]:
# Build one dict per sentence position, keyed by language name.
# zip() stops at the shortest list, so any extra trailing lines in a longer
# file are dropped.
lang_order = ('urdu', 'hindi', 'tamil', 'telugu')
parallel_senteces = [
    dict(zip(lang_order, row))
    for row in zip(*[sentences[name] for name in lang_order])
]

In [10]:
# Display the first parallel record as a quick sanity check.
parallel_senteces[0]

{'urdu': 'چور سمجھ کر نامعلوم شخص کو دیہاتیوں نے بری طرح زد و کوب کرنے پر زیر علاج رہتے ہوئے فوت ہو گیا ۔',
 'hindi': 'एक अज्ञात व्यक्ति को चोर समझकर ग्रामीणों ने जमकर पिटाई कर दी और इलाज के दौरान उसकी मौत हो गई.',
 'tamil': 'அடையாளம் தெரியாத நபர், அவரை திருடன் என்று தவறாக நினைத்து, கிராம மக்களால் கடுமையாக தாக்கப்பட்டு, சிகிச்சை பலனின்றி உயிரிழந்தார்.',
 'telugu': 'ఓ గుర్తుతెలియని వ్యక్తి దొంగగా భావించి గ్రామస్తులు తీవ్రంగా కొట్టి చికిత్స పొందుతూ మృతి చెందాడు.'}

# Labelling translated sentences

In [11]:
import json
import pickle

In [12]:
import json
# Load the Urdu annotations: the file is JSON Lines (one JSON object per line).
urdu_labels = []
with open("json_data/ur_all.json", "r", encoding="utf-8") as f:
    urdu_labels.extend(json.loads(line) for line in f)

# Index every annotated sample by its whitespace-normalised sentence string.
# Each entry starts with the 'urdu' annotation only.
main_dict = {}
for record in urdu_labels:
    sentence_key = re.sub(r'\s+', ' ', " ".join(record['words']))
    urdu_entry = {'words': record['words'], 'srl': record['srl']}
    main_dict[sentence_key] = {'urdu': urdu_entry}
    

In [21]:
def generate_new_labels(source_labels, target_sentence, alignments, NAH_index = 16):
    """Project per-token labels from a source sentence onto a target sentence.

    Args:
        source_labels: Sequence of labels, indexed by source token position.
        target_sentence: Target sentence as a string; it is tokenised with
            ``str.split()`` to determine the number of target labels.
        alignments: Iterable of ``(source_index, target_index)`` pairs.
        NAH_index: Label used as the default for every target token; source
            tokens carrying this label are never projected.

    Returns:
        A list with one label per whitespace-separated target token. Tokens
        that receive no projected label keep ``NAH_index``. When several pairs
        point at the same target token, the last projected label wins.

    Raises:
        IndexError: If an alignment index is out of range for either side.
    """
    projected = [NAH_index for _ in target_sentence.split()]
    for src_idx, tgt_idx in alignments:
        label = source_labels[src_idx]
        if label == NAH_index:
            # Default label carries no information; leave the target untouched.
            continue
        projected[tgt_idx] = label
    return projected
        

In [22]:
# Project the Urdu SRL labels onto each target language using the pickled
# word alignments, and store the result in main_dict[source_sentence][lang].
problems = []         # source sentences whose projection failed (None if the sample had no usable source sentence)
success = []          # source sentences projected successfully, one entry per (sentence, language)
problem_reasons = {}  # (lang, exception class name) -> number of failures, for diagnosis
for file in glob.glob(alignments_path+"/*"):
    # e.g. "alignments/urdu-tamil.pkl" -> "tamil"
    lang = file.split("/")[-1].split(".")[0].split("-")[-1]
    print(file, lang)
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load alignment files that you produced yourself or otherwise trust.
    with open(file, "rb") as f:
        alignments_pickle_load = pickle.load(f)

    for sample in alignments_pickle_load:
        # Reset per sample so that a failure before the assignment below is not
        # attributed to the previous sample's sentence (or hit an unbound name).
        source_sent = None
        try:
            source_sent = sample['Source Sentence']
            target_sent = sample['Target Sentence']
            alignments = sample['Mapping/Alignment']

            # NOTE: the lookup uses the exact source string, while main_dict keys
            # are whitespace-normalised; a sentence that differs only in
            # whitespace raises KeyError here and is recorded in `problems`.
            source_labels = main_dict[source_sent]['urdu']['srl']
            target_labels = generate_new_labels(source_labels, target_sent, alignments)

            main_dict[source_sent][lang] = {'words':target_sent.split(), 'srl':target_labels}
            success.append(source_sent)
        except Exception as e:
            # Best effort: keep going, but remember which sentence failed and why.
            problems.append(source_sent)
            reason = (lang, type(e).__name__)
            problem_reasons[reason] = problem_reasons.get(reason, 0) + 1

    

alignments/urdu-tamil.pkl tamil
alignments/urdu-telugu.pkl telugu
alignments/urdu-hindi.pkl hindi


In [27]:
import random

# Fixed seed so the train/val/test split below is the same on every
# Restart & Run All (main_dict preserves insertion order, so the input to the
# shuffle is deterministic for a given label file).
SPLIT_SEED = 42

final_sents = list(main_dict.keys())
random.seed(SPLIT_SEED)
# A single shuffle already yields a uniformly random permutation; repeating it adds nothing.
random.shuffle(final_sents)

In [32]:
# Split the shuffled sentences 70% / 15% / 15% into train / validation / test.
l = len(final_sents)
train_end, val_end = int(l*0.7), int(l*0.85)
train_sents = final_sents[:train_end]
val_sents = final_sents[train_end:val_end]
test_sents = final_sents[val_end:]

In [33]:
# Training set: one JSON object per line, covering every language entry that
# exists for each training sentence.
with open("fial_jsons/train_all.json", "w", encoding="utf-8") as f:
    for sent in train_sents:
        for entry in main_dict[sent].values():
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")


In [34]:
# Validation set: same JSON Lines layout as the training file, with every
# available language entry of each validation sentence.
with open("fial_jsons/val_all.json", "w", encoding="utf-8") as f:
    for sent in val_sents:
        per_language = main_dict[sent]
        for lang_name in per_language:
            serialized = json.dumps(per_language[lang_name], ensure_ascii=False)
            f.write(serialized + "\n")

In [35]:
# Test set: one JSON Lines file per language (fial_jsons/test_<lang>.json).
# Only the 'urdu' entry is guaranteed to exist for every sentence; a target
# language entry is missing whenever its alignment projection failed. Such
# sentences are skipped for that language instead of raising KeyError halfway
# through and leaving a truncated file. When sentences are skipped, the
# per-language files are no longer line-aligned with each other.
for lang in ('urdu', 'hindi', 'tamil', 'telugu'):
    skipped = 0
    with open("fial_jsons/test_" + lang + ".json", "w", encoding="utf-8") as f:
        for sent in test_sents:
            entry = main_dict[sent].get(lang)
            if entry is None:
                skipped += 1
                continue
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
    if skipped:
        # Only reported when something was actually left out.
        print("test_" + lang + ".json: skipped", skipped, "sentences without a", lang, "entry")