In [1]:
import json
import re
import os
import logging
import random
from scRNN import Sinhala_alpha, get_words
from scRNN.constants import NUM
from scRNN.error_word_generation import GenErrorWords, ErrorType
from preProcessing import read_sentences

In [6]:
# Root folder of all corpus files; the cleaning log is kept directly inside it.
dataset_path = "dataset"
log_file = "log.txt"

# Start every run with a fresh cleaning log, because clean_stage_02 appends to it.
_log_path = os.path.join(dataset_path, log_file)
if os.path.exists(_log_path):
    os.remove(_log_path)

# Destination folder for the cleaned training corpus.
output_path = os.path.join(dataset_path, "preprocessed")

# Input corpora.
new_corp = 'dataset/train/24_MAY/TrainedDataRemoved.txt'
text_book = 'dataset/train/data_from_text_books/text_book.txt'
data_good = "dataset/Other/eof_data_good.txt"
data_bad = "dataset/Other/eof_data_bad.txt"

In [9]:
# Load the raw training corpus; the cleaning stages below iterate over the result and
# treat every element as a sentence string.
new_corp_sentences = read_sentences(new_corp)

In [10]:
# clean stage 01
# Whitespace normalisation only: the English-word -> <unk> and number -> NUM
# substitutions planned for this stage are currently switched off.
def clean_stage_01(list_senteces):
    """Return the sentences with every run of whitespace collapsed to one space.

    Each sentence is split on whitespace and re-joined with single spaces. The
    tokens themselves (including '.', '<eos>' and '<unk>') pass through unchanged.
    """
    return [" ".join(sentence.split()) for sentence in list_senteces]

In [12]:
new_corp_sentences = clean_stage_01(new_corp_sentences)

In [14]:
# clean stage 02
# normalise Sinhala character sequences and replace unwanted symbols with a space
# every word that changes is appended to the cleaning log
def clean_stage_02(list_senteces):
    """Normalise the words of each sentence and log every word that changed.

    Each sentence is split on whitespace. The tokens '.', '<eos>' and '<unk>' are
    kept untouched; every other token goes through the substitutions below, in
    order. A token that changed is appended (before/after, plus both character
    lists) to the log at os.path.join(dataset_path, log_file); both names are
    module-level globals.

    Returns a new list holding one space-joined, cleaned string per input sentence.
    """
    rlist_senteces = []
    # Append mode: successive calls keep adding to the same log file.
    with open(os.path.join(dataset_path, log_file), mode='a', encoding='utf8') as f:
        for sentence in list_senteces:
            rword_lst = []
            word_lst = sentence.split()
            for word in word_lst:
                # Sentence markers and the full stop bypass all substitutions.
                if word in ['.', '<eos>', '<unk>']:
                    rword_lst.append(word)
                    continue
                word_prev = word
                # Remove a zero-width joiner (U+200D) that sits directly before '්'.
                word = re.sub('\u200d්','්', word) 
                # Each call below rewrites the sequence on the left as the text on the right.
                # NOTE(review): on some lines pattern and replacement render identically;
                # presumably they differ only in code-point composition. Confirm by
                # inspecting the code points before editing any of these literals.
                word = re.sub(r'අා', 'ආ', word)
                word = re.sub(r'අැ', 'ඇ', word)
                word = re.sub(r'ේ', 'ේ', word)
                word = re.sub(r'ෙෙ','ෛ', word)
                word = re.sub(r'ෝ', 'ෝ', word)
                word = re.sub(r'ෝ', 'ෝ', word)
                word = re.sub(r'ෘෘ', 'ෲ', word)
                # Every listed symbol, including a '.' embedded in a longer token, becomes
                # a space, so the cleaned token may itself contain spaces.
                word = re.sub(r'[<>˚ை>\\/”{}¼½⅓’‘”‚.‟÷–\]]',' ', word)
                if word_prev != word:
                    print("{} -> {}".format(word_prev, word), file = f)
                    print("{} -> {}".format(list(word_prev), list(word)), file = f)
                rword_lst.append(word)
            rlist_senteces.append(" ".join(rword_lst))
        # Separator line written once per call, after all sentences were processed.
        print("-"*40 + "\n", file =f)
    
    return rlist_senteces

In [15]:
# Apply stage-02 cleaning. The name is rebound to the cleaned list, so re-running this
# cell cleans already-cleaned data and appends to the cleaning log again.
new_corp_sentences = clean_stage_02(new_corp_sentences)

In [16]:
print("number of sentences {}".format(len(new_corp_sentences)))
corpus_senteces = new_corp_sentences #+ text_book_sentences
print("number of sentences {}".format(len(corpus_senteces)))
# Drop duplicate sentences but keep first-occurrence order. A plain set() would remove the
# same duplicates, yet its iteration order for strings changes between interpreter runs
# (hash randomisation), so the corpus file written from it would be ordered differently on
# every run. corpus_senteces is therefore a list here, not a set.
_seen_sentences = set()
_unique_sentences = []
for _sentence in corpus_senteces:
    if _sentence not in _seen_sentences:
        _seen_sentences.add(_sentence)
        _unique_sentences.append(_sentence)
corpus_senteces = _unique_sentences
print("number of sentences {}".format(len(corpus_senteces)))

number of sentences 96665
number of sentences 96665
number of sentences 62295


In [17]:
def to_file(file_to_write, sentence_list):
    """Write the sentences to a UTF-8 text file, one sentence per line.

    Args:
        file_to_write: Path of the file to create or overwrite.
        sentence_list: Iterable of sentence strings.

    Sentences are separated by a newline; none is written after the last sentence.
    """
    # write() emits the joined text in a single call. writelines() would iterate over the
    # string and write it one character at a time. Plain 'w' is enough: nothing is read back.
    with open(file_to_write, 'w', encoding='utf8') as f:
        f.write("\n".join(sentence_list))

In [20]:
# write to file
# Persist the de-duplicated corpus, then read the file back through get_words to report
# how many words it yields.
file_to_write = os.path.join(output_path, 'train_corpus_y.txt') 
to_file(file_to_write, corpus_senteces)
words_train = get_words(file_to_write)
print("number of words in training {}".format(len(words_train)))
# Only needed for the count above; words_train is loaded again when the vocabulary is built.
del words_train

number of words in training 35576


In [323]:
# data to be used and primary test set and dev set
PATH_DEV_GOOD_SOURCE = 'dataset/test_01/eof_data_test_good.txt'
PATH_DEV_GOOD_RESULT = 'dataset/test_01/eof_cdata_test_good.txt'

# Read the raw dev sentences, run both cleaning stages in order, and save the result.
data_dev = clean_stage_02(clean_stage_01(read_sentences(PATH_DEV_GOOD_SOURCE)))
to_file(PATH_DEV_GOOD_RESULT, data_dev)

In [324]:
# NOTE(review): both constants are named *_GOOD_* but point at the "bad" files - confirm
# the naming is intentional.
PATH_TEST_GOOD_SOURCE = 'dataset/test_01/eof_data_test_bad.txt'
PATH_TEST_GOOD_RESULT = 'dataset/test_01/eof_cdata_test_bad.txt'

# Same pipeline as for the dev data: read, clean stage 01, clean stage 02, save.
data_TEST = clean_stage_02(clean_stage_01(read_sentences(PATH_TEST_GOOD_SOURCE)))
to_file(PATH_TEST_GOOD_RESULT, data_TEST)

In [325]:
# data to be used and primary test set and dev set
PATH_TEST_GOOD_SOURCE = 'dataset/test_raw/'
PATH_TEST_GOOD_RESULT = 'dataset/test_00_good/'

# Clean every raw test document (file names containing 'eof_doc') and store it in the
# result folder as "c_" + the part of the file name after its first underscore.
for file in os.listdir(PATH_TEST_GOOD_SOURCE):
    if 'eof_doc' not in file:
        continue
    data_dev = clean_stage_02(clean_stage_01(read_sentences(PATH_TEST_GOOD_SOURCE + file)))
    file_to_write = PATH_TEST_GOOD_RESULT + "c_" + file.split("_", 1)[1]
    to_file(file_to_write, data_dev)

In [326]:
# Locations of the cleaned files produced by the cells above.
# Cleaned training corpus (written by the "write to file" cell).
PATH_TRAIN_GOOD = os.path.join(output_path, 'train_corpus_y.txt') 
# Folder holding the cleaned per-document test files.
PATH_TEST = 'dataset/test_00_good'
# Cleaned dev file.
PATH_DEV_GOOD = 'dataset/test_01/eof_cdata_test_good.txt'

In [327]:
# load vocab in train
words_train = get_words(PATH_TRAIN_GOOD)
print("number of words in training {}".format(len(words_train)))
# load vocab in dev
words_dev = get_words(PATH_DEV_GOOD)
print("number of words in dev {}".format(len(words_dev)))
# load vocab in test
words_test = []
# Sorted listing: os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(PATH_TEST)):
    words_test.extend(get_words(os.path.join(PATH_TEST, file)))
print("number of words in test {}".format(len(words_test)))
# Sort the merged word set so every run assigns the same id to the same word; iterating a
# bare set of strings yields a different order from one interpreter run to the next.
vocab = {k : i for i, k in enumerate(sorted(set(words_train + words_dev + words_test)))}
# Special tokens take the next free id. Ids run from 0 to len(vocab) - 1, so using
# len(vocab) - 1 here would give the token the id of an existing word.
if "<eos>" not in vocab:
    vocab["<eos>"] = len(vocab)
if "<unk>" not in vocab:
    vocab["<unk>"] = len(vocab)

print("total vocab size {}".format(len(vocab)))

number of words in training 29137
number of words in dev 8508
number of words in test 1816
total vocab size 33225


In [328]:
# Vocabulary overlap between the training words and the dev / test words.
_train_word_set = set(words_train)
print("number of words common in train & dev {}".format(len(_train_word_set & set(words_dev))))
print("number of words common in train & test {}".format(len(_train_word_set & set(words_test))))

number of words common in train & dev 4850
number of words common in train & test 993


In [329]:
# Save the vocabulary one word per line, longest words first (ties keep dict order).
with open('train_vocab.txt', mode='w', encoding='utf8') as f:
    longest_first = sorted(vocab, key=len, reverse=True)
    f.write("\n".join(longest_first))
    

In [3]:
# Load the vocabulary that noise_add hands to GenErrorWords from its JSON dump.
with open('train_c_corpus+text.json', mode= 'r', encoding='utf8') as f:
    vocab_from_file = json.load(f)
# Displayed as a quick check of the loaded container type.
type(vocab_from_file)

dict

In [6]:
def noise_add(file_in, file_out, dir_name):
    """Create a noisy copy of a whitespace-tokenised text file.

    Args:
        file_in: Name of the input file, relative to ``dir_name``.
        file_out: Name of the output file, relative to ``dir_name``.
        dir_name: Folder that holds the input and receives the output and the
            log file "log_<file_in>". An empty string means the working directory.

    Reads the module-level ``vocab_from_file``. Prints the output path and delegates
    the actual generation to ``GenErrorWords.add_noise_to_train``.
    """
    input_file_name = os.path.join(dir_name, file_in)
    out_file_name = os.path.join(dir_name, file_out)
    log_file_name = os.path.join(dir_name, "log_" + file_in)

    # logging.basicConfig() does nothing once the root logger has handlers, so a second
    # call of noise_add in the same kernel would keep logging into the first call's file.
    # Detach the existing root handlers first so each call gets its own log file.
    root_logger = logging.getLogger()
    for old_handler in list(root_logger.handlers):
        root_logger.removeHandler(old_handler)
        old_handler.close()
    logging.basicConfig(handlers=[logging.FileHandler(log_file_name, 'w', 'utf-8')],
                        level=logging.DEBUG)

    # Presumably [min, max] number of edits per error type - confirm against GenErrorWords.
    error_types = {  # replace maximum of two chars with
        ErrorType.Replace: [1, 2],
        ErrorType.Delete: [1, 1],
        ErrorType.Insert: [1, 1]
    }
    # Fixed seed so the generated noise is repeatable between runs.
    gne = GenErrorWords(random_seed= 1, #random.randint(0,99),
                        error_types=error_types,
                        vocab=vocab_from_file)
    # Read-only open: the input file is never modified here.
    with open(input_file_name, 'r', encoding='utf8') as f:
        file_data = f.read().strip().split()

    print(out_file_name)
    GenErrorWords.add_noise_to_train(
        filedata=file_data,
        Obj=gne,
        outfile_name=out_file_name
    )

In [7]:
# Create the noisy counterpart of output.txt; dir_name='' resolves both file names
# relative to the current working directory.
noise_add("output.txt", "output_bad.txt", '')
# noise_add("eof_cdata_test_good.txt", "eof_cdata_dev_bad.txt", 'dataset/test_01')
# noise_add("eof_cdata_test_good.txt", "eof_cdata_test_bad.txt", 'dataset/test_01')

output_bad.txt
logs written to output_bad_log.txt



In [332]:
# append good data and bad data
# with open("dataset/train_corpus_y.txt", 'a+', encoding='utf8') as f:
#         f.writelines("\n".join(data_good))
# with open("dataset/train_corpus_x.txt", 'a+', encoding='utf8') as f:
#         f.writelines("\n".join(data_bad))

In [333]:
def all_words(file_name):
    """Return every whitespace-separated token of a UTF-8 text file, in file order.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list of token strings; duplicates are kept.
    """
    # Read-only mode: 'r+' would demand write permission although nothing is written.
    with open(file_name, 'r', encoding='utf8') as f:
        return f.read().split()
# Flat token lists of the two training corpora.
# Presumably y is the clean target side and x the noisy input side - confirm.
y = all_words("dataset/train_corpus_y.txt")
x = all_words("dataset/train_corpus_x.txt")

In [341]:
def all_words(file_name):
    """Return every whitespace-separated token of a UTF-8 text file, in file order.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list of token strings; duplicates are kept.
    """
    # Read-only mode: 'r+' would demand write permission although nothing is written.
    with open(file_name, 'r', encoding='utf8') as f:
        return f.read().split()
y = all_words("train_corpus_y_00.txt")
# Start every vocabulary entry at zero so words absent from the corpus still get counted
# in the lowest bin.
word_freq = {k : 0 for k in vocab}
# NOTE(review): the vocabulary key used elsewhere is '<unk>'; confirm 'unk' is intended.
word_freq['unk'] = 0
for word in y:
    word_freq[word] = word_freq.get(word, 0) + 1
# Ascending by count, so the most frequent entries sit at the end.
freq = sorted(word_freq.items(), key=lambda item: item[1])
bins = 20
# Drop the two highest counts before binning.
# NOTE(review): presumably these are markers/punctuation that would dominate the scale - confirm.
freq.pop(-1)
freq.pop(-1)
max_val = freq[-1][1]
min_val = 0
# Bin width must be at least 1: with max_val < bins the integer width would be 0, which
# makes both range() and the floor division below fail.
bin_width = max(1, int((max_val - min_val) / bins))
binsep = [i for i in range(min_val, max_val, bin_width)]
bindict = {k : 0 for k in range(bins)}
for i in freq:
    # Truncating the width can push the largest counts past the last bin; clamp them into
    # it rather than falling back to bin_num - 1, which may itself be out of range.
    bin_num = min(i[1] // bin_width, bins - 1)
    bindict[bin_num] += 1

In [344]:
# Show the bin edges, the number of words per bin, and the 200 highest-count entries
# left in freq (freq is sorted ascending by count).
print(binsep)
print(bindict)
print(freq[-200:])

[0, 596, 1192, 1788, 2384, 2980, 3576, 4172, 4768, 5364, 5960, 6556, 7152, 7748, 8344, 8940, 9536, 10132, 10728, 11324, 11920]
{0: 33048, 1: 94, 2: 32, 3: 20, 4: 11, 5: 5, 6: 6, 7: 1, 8: 0, 9: 2, 10: 0, 11: 1, 12: 0, 13: 2, 14: 0, 15: 0, 16: 0, 17: 1, 18: 0, 19: 1}
[('අඩු', 543), ('එම්', 543), ('ආදායම', 548), ('සහිත', 551), ('එස්', 553), ('නීති', 558), ('එහි', 559), ('ගන්නා', 560), ('පෙර', 561), ('ක්\u200dරීඩා', 563), ('සම්බන්ධ', 566), ('මාරු', 566), ('කළමනාකරණය', 567), ('කර්මාන්ත', 567), ('වැඩ', 567), ('සමාගම', 569), ('ව්\u200dයාපෘතියේ', 576), ('විවිධ', 578), ('නම්', 579), ('අයවැය', 582), ('වැටුප්', 584), ('වෙන්', 586), ('ආර්ථික', 594), ('වාර්ෂික', 595), ('මුළු', 596), ('පාලන', 601), ('පිළියෙල', 603), ('යොමු', 605), ('කාර්යාල', 608), ('ගත', 609), ('කොට', 611), ('කාර්යාලය', 621), ('ඡන්ද', 622), ('සහකාර', 622), ('බදු', 624), ('අනෙකුත්', 627), ('ඉහළ', 627), ('ආරම්භ', 633), ('මාර්ග', 636), ('ගත්', 641), ('මිල', 643), ('වෙළඳ', 646), ('ආ', 650), ('කරමින්', 655), ('සංඛ්\u200dයාව', 670), ('මණ

In [5]:
# Check that the two corpora stay token-aligned, sentence by sentence.
# NOTE(review): y iterates the *bad* file and x the *good* file, while the commented-out
# append cell in this notebook puts good data into the *_y file and bad data into the
# *_x file - confirm this pairing is intended.
a = read_sentences("dataset/train/26_MAY/corpus_100_bad.txt")
b = read_sentences("dataset/train/26_MAY/corpus_100_good.txt")
new_x, new_y = [], []
for i, (y, x) in enumerate(zip(a, b)):
    if len(y.split()) == len(x.split()):
        new_x.append(x)
        new_y.append(y)
        continue
    # First misaligned pair: show its index and both token lists, then stop collecting.
    print(i)
    print(y.split())
    print(x.split())
    break

In [17]:
# Number of entries get_words returns for train_corpus_y_00.txt.
len(get_words("train_corpus_y_00.txt"))

29137

In [11]:
# json is already imported in the first cell; the repeat only matters when this cell is
# run on its own.
import json
# Load the corpus_100 JSON object into d.
with open('corpus_100.json', mode = 'r', encoding = 'utf8') as f:
    d = json.load(f)


In [12]:
# Number of keys in the loaded JSON object.
len(d)

41580

In [9]:
PATH_TEST = 'dataset/test_00_good'
print("===== CREATING VOCAB =====")
words_test = []
# Sorted listing: os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(PATH_TEST)):
    words_test.extend(get_words(os.path.join(PATH_TEST, file)))
# Sort the word set so every run assigns the same id to the same word.
vocab = {k : i for i, k in enumerate(sorted(set(words_test)))}
# Special tokens take the next free id. Ids run from 0 to len(vocab) - 1, so using
# len(vocab) - 1 would reuse an existing word's id and the inverse map below would lose
# that word.
if "<eos>" not in vocab:
    vocab["<eos>"] = len(vocab)
if "<unk>" not in vocab:
    vocab["<unk>"] = len(vocab)
# Inverse lookup: id -> word.
id2vocab = {value : key for key, value in vocab.items()}

===== CREATING VOCAB =====


In [29]:
# Word lists of three corpora; e and c are compared in the next cell.
# NOTE: d is rebound here - it previously held the object loaded from corpus_100.json.
d = get_words('dataset/train/text_booksout_good.txt')
e = get_words('dataset/train/24_MAY/corpus_10_good.txt')
c = get_words('dataset/Other/data_test_good.txt')

In [35]:
# Words that occur in both e and c.
train_dataset = set(e) & set(c)

In [36]:
# Size of the overlap computed above.
len(train_dataset)

4811

In [34]:
# Number of distinct words in c, for comparison with the overlap size.
len(set(c))

8572