In [1]:
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
# Directory the notebook reads its inputs from (word lists and the subtitle corpus).
raw_data_path = './raw_data/open_subtitles'
# Directory the generated question/answer files are written to.
prepared_data_path = './prepared_data/open_subtitles'
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(prepared_data_path, exist_ok=True)

In [3]:
# Both files are read as whitespace-separated word lists. Context managers
# close the file handles deterministically instead of leaving them to the GC.
# NOTE(review): the encoding is pinned to UTF-8 so that Polish diacritics do not
# depend on the platform's locale default -- confirm both files are UTF-8.
with open(os.path.join(raw_data_path, 'slowa.txt'), encoding='utf-8') as words_file:
    words = words_file.read().split()
with open(os.path.join(raw_data_path, 'wulgaryzmy.txt'), encoding='utf-8') as swearwords_file:
    swearwords = swearwords_file.read().split()

In [4]:
# Map each word whose unidecode transliteration differs from the word itself
# to that transliteration; words that are unchanged by unidecode are skipped.
correct_to_ignored = {}
for word in words:
    ascii_form = unidecode(word)
    if ascii_form != word:
        correct_to_ignored[word] = ascii_form

In [5]:
# Invert correct_to_ignored: transliterated form -> list of every original
# word that transliterates to it (several words may share one form).
ignored_to_corrected = {}
for original, transliterated in correct_to_ignored.items():
    ignored_to_corrected.setdefault(transliterated, []).append(original)

In [6]:
# Keep only unambiguous replacements: the transliterated form must map to
# exactly one original word and must not itself be a word from the list.
words_set = set(words)
ignored_to_corrected_filtered = {
    transliterated: candidates[0]
    for transliterated, candidates in ignored_to_corrected.items()
    if len(candidates) == 1 and transliterated not in words_set
}

In [7]:
def correct(sentence):
    """Return `sentence` with unambiguous transliterated tokens replaced.

    Every occurrence of '- ' is removed first, the result is tokenized with
    nltk's ``word_tokenize``, and each token that is a key of
    ``ignored_to_corrected_filtered`` is swapped for its mapped word. Tokens
    are compared exactly as tokenized (no case folding).

    Args:
        sentence: Text to process.

    Returns:
        The processed tokens joined with single spaces.
    """
    without_dashes = sentence.replace('- ', '')
    return ' '.join(
        # Fall back to the token itself when no replacement is known.
        ignored_to_corrected_filtered.get(token, token)
        for token in word_tokenize(without_dashes)
    )

In [8]:
def contains_swearword(words, vocabulary=None):
    """Report whether any of the given words is a known swearword.

    Args:
        words: A single word (``str``) or an iterable of words (list, tuple,
            set, generator, ...). A string is always treated as ONE word and
            is never split into tokens, so a caller holding a sentence must
            tokenize it first. Any other non-iterable value is likewise
            treated as a single candidate.
        vocabulary: Optional container of swearwords to test against.
            Defaults to the module-level ``swearwords`` list.

    Returns:
        True if at least one candidate is a member of the vocabulary
        (exact, case-sensitive comparison), otherwise False.
    """
    if vocabulary is None:
        vocabulary = swearwords
    # Wrap only scalar inputs. Non-list iterables of words are iterated
    # element by element instead of being compared as one opaque object,
    # which would silently return False.
    if isinstance(words, str) or not hasattr(words, '__iter__'):
        words = [words]
    return any(word in vocabulary for word in words)


In [9]:
# Load the optional list of special words (whitespace-separated). split_qa
# treats a line containing any of these words the same way as a line that
# ends with a question mark.
special_words_path = os.path.join(raw_data_path, 'special_words_pl.txt')
if os.path.isfile(special_words_path):
    # NOTE(review): encoding pinned to UTF-8 so the result does not depend on
    # the platform's locale default -- confirm the file is UTF-8.
    with open(special_words_path, 'r', encoding='utf-8') as special_words_file:
        special_words = special_words_file.read().split()
else:
    # Define the name even when the file is missing; otherwise later cells
    # that test membership in special_words fail with a NameError.
    special_words = []
    print("no special words file")

In [10]:
def split_qa(source_path, dest_questions_path, dest_answers_path, total=41998942):
    """Extract (question, answer) line pairs from a subtitle file.

    A line counts as a question when it ends with '?' or when any of its
    lower-cased tokens is in ``special_words``. The line that immediately
    follows is consumed as its answer. Both lines are passed through
    ``correct`` and written, one per line, to the two destination files.
    Pairs in which either side contains a swearword are skipped.

    Args:
        source_path: Path of the input text file, one utterance per line.
        dest_questions_path: Output file for questions (overwritten).
        dest_answers_path: Output file for answers (overwritten).
        total: Expected number of input lines, used only to size the
            progress bar. Defaults to the previously hard-coded value.

    NOTE(review): files are opened as UTF-8 instead of the locale default;
    confirm the corpus is UTF-8 encoded.
    """
    with open(source_path, 'r', encoding='utf-8') as source, \
            open(dest_questions_path, 'w', encoding='utf-8') as questions, \
            open(dest_answers_path, 'w', encoding='utf-8') as answers, \
            tqdm(total=total) as pbar:  # context manager closes the bar on exit or error
        for line in source:
            pbar.update(1)
            is_question = line.endswith('?\n') or any(
                word in special_words for word in word_tokenize(line.lower())
            )
            if not is_question:
                continue

            # A question on the very last line has no answer. Using a default
            # avoids StopIteration escaping from the loop body.
            next_line = next(source, None)
            if next_line is None:
                break
            pbar.update(1)

            question = correct(line)
            answer = correct(next_line)
            # correct() returns space-joined tokens, so split() recovers them.
            # contains_swearword treats a bare string as ONE word, so the
            # sentence must be tokenized here. Lower-casing mirrors the
            # special-word check above.
            if (contains_swearword(question.lower().split())
                    or contains_swearword(answer.lower().split())):
                # Skip only this pair; keep processing the rest of the corpus.
                continue
            questions.write(question + '\n')
            answers.write(answer + '\n')
                     

In [11]:
# Resolve the corpus location and the two output files, then run the
# question/answer extraction over the corpus.
src = os.path.join(raw_data_path, 'OpenSubtitles2018.en-pl.pl')
dest_q, dest_a = (
    os.path.join(prepared_data_path, file_name)
    for file_name in ('subtitles_questions_polish.txt', 'subtitles_answers_polish.txt')
)
split_qa(src, dest_q, dest_a)

HBox(children=(IntProgress(value=0, max=41998942), HTML(value='')))

KeyboardInterrupt: 