In [1]:
import re
import os
import json
import gzip
import requests
import random
import logging

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from copy import deepcopy
from time import sleep
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from collections.abc import Iterable

In [2]:
proxy_list_url = 'https://www.ip-adress.com/proxy-list'

In [3]:
# read_html returns a list with one DataFrame per table found on the page; keep the first one.
proxy_df = pd.read_html(proxy_list_url, skiprows=0)[0]

In [4]:
proxy_df.head()

Unnamed: 0,Proxy,Type,Location,Last Checked
0,68.188.59.198:80,highly-anonymous,"🇺🇸 Saint Charles, Missouri, United States",2022-01-27 16:20
1,54.202.153.246:80,transparent,"🇺🇸 Boardman, Oregon, United States",2022-01-27 13:32
2,150.242.182.98:80,anonymous,🇲🇾 Malaysia,2022-01-27 11:43
3,113.28.90.67:9480,highly-anonymous,"🇭🇰 Kowloon, Kowloon City, Hong Kong",2022-01-26 14:03
4,113.28.90.67:80,highly-anonymous,"🇭🇰 Kowloon, Kowloon City, Hong Kong",2022-01-26 13:59


In [5]:
# Pull the "Proxy" column out as a NumPy array of "host:port" strings.
proxy_list = proxy_df['Proxy'].values
proxy_list[:3]

array(['68.188.59.198:80', '54.202.153.246:80', '150.242.182.98:80'],
      dtype=object)

### utils

In [6]:
def get_url_content(url, proxy=None, timeout=None):
    """Download ``url`` and return the raw response body.

    When ``proxy`` is non-empty, the proxies are tried in random order; a
    proxy whose request raises is dropped and another one is tried. Once
    every proxy has failed (or when none were given) the request is sent
    directly, without a proxy.

    Parameters
    ----------
    url : str
        Address to fetch.
    proxy : sized iterable of str, optional
        ``host:port`` proxy servers (list, tuple, NumPy array, ...).
        The caller's object is never modified.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. ``None`` (the default) keeps the
        previous behaviour of waiting without a limit.

    Returns
    -------
    bytes
        ``.content`` of the first response that is obtained.

    Raises
    ------
    Exception
        Whatever ``requests.get`` raises for the final, direct request.
    """
    headers = {'User-Agent': 'Chrome'}
    # Work on a private plain list: truth-testing a NumPy array is ambiguous,
    # and the caller's sequence must stay untouched.
    candidates = [] if proxy is None else list(proxy)

    while candidates:
        # Draw a random proxy and remove it so it is never retried.
        server = candidates.pop(np.random.randint(len(candidates)))
        # NOTE(review): only the "http" scheme is mapped here, so https:// URLs
        # are presumably not routed through the proxy -- confirm whether an
        # "https" entry is wanted.
        proxies = {"http": f"http://{server}"}
        try:
            return requests.get(url, proxies=proxies, headers=headers, timeout=timeout).content
        except Exception as e:
            logging.warning('Failed to use %s proxy (%s). Trying to use another one.....', server, e)

    # No proxy given, or all of them failed: fall back to a direct request.
    return requests.get(url, headers=headers, timeout=timeout).content

In [7]:
def fix_sup(phonemes):
    """Rewrite ``X<sup>Y</sup>`` markup as the plain-text token ``{X^Y}``.

    ``X`` and ``Y`` are single word characters (``\\w``). Any other text in
    ``phonemes`` is returned unchanged.
    """
    def to_braces(match):
        # groups() is the (base letter, superscript letter) pair.
        return "{%s^%s}" % match.groups()

    return re.sub(r'(\w)<sup>(\w)</sup>', to_braces, phonemes)


def extract_word_from_span(x):
    """Return the text that sits between a ``>`` and a closing ``</span>`` or ``</h2>``.

    ``x`` is converted with ``str`` first. Whitespace around each captured
    text is dropped; one list entry is produced per match.
    """
    text_before_closing_tag = re.compile(r'\>\s*(.*?)\s*\<\/(?:span|h2)\>')
    return text_before_closing_tag.findall(str(x))

    
def extract_tran_from_span(x):
    """Return the ``[...]`` contents that directly precede a closing ``</span>`` or ``</h2>``.

    ``x`` is converted with ``str`` first. The square brackets themselves
    and the whitespace around them are not part of the result.
    """
    bracketed_before_closing_tag = re.compile(r'\>\s*\[(.*?)\]\s*\<\/(?:span|h2)\>')
    return bracketed_before_closing_tag.findall(str(x))
    

def parse_word(word):
    """Scrape goroh.pp.ua for the forms of ``word`` and their transcriptions.

    Two pages are fetched (inflection and transcription). Their
    ``article-block`` divs are paired up in page order, and the forms are
    matched with the transcriptions positionally.

    Parameters
    ----------
    word : str
        Word to look up.

    Returns
    -------
    dict or str
        ``{form: transcription}`` on success (possibly empty). If any
        ``Exception`` is raised along the way, ``word`` itself is returned
        as a failure marker.
    """
    # Parts of speech that do not inflect: the article heading holds the only form.
    invariable_tags = ["прийменник", "дієприслівник", "частка", "сполучник", "вигук",
                       "прислівник", "займенник з прийменником"]
    # Parts of speech whose forms are laid out in a table.
    inflected_tags = ["іменник", "прикметник", "займенник", "числівник", "дієслово", "дієприкметник"]

    def exact(name, css_class):
        # Match only tags whose class attribute is exactly [css_class].
        return lambda tag: tag.name == name and tag.get('class') == [css_class]

    try:
        res = {}

        # The f-strings are already complete; no further .format() call is needed
        # (it would raise on a word that contains braces).
        goroh_words_url = f"https://goroh.pp.ua/Словозміна/{word}"
        goroh_trans_url = f"https://goroh.pp.ua/Транскрипція/{word}"

        html_word = get_url_content(goroh_words_url, proxy=proxy_list)
        html_tran = get_url_content(goroh_trans_url, proxy=proxy_list)

        # NOTE(review): no parser is named, so BeautifulSoup chooses one from
        # what is installed -- consider pinning it for reproducible results.
        soup_word = BeautifulSoup(html_word)
        soup_tran = BeautifulSoup(html_tran)

        words_articles = soup_word.find_all(exact('div', 'article-block'))
        trans_articles = soup_tran.find_all(exact('div', 'article-block'))

        for article_word, article_tran in zip(words_articles, trans_articles):
            tags = [tag.text.strip()
                    for lst in article_word.find_all("div", class_="taglist")
                    for tag in lst.find_all("a", class_="tag")]
            for tag in tags:
                if tag in invariable_tags:
                    # Such words do not change, so the heading is expected to hold a single form.
                    words = extract_word_from_span(article_word.find("h2"))
                    trans = [fix_sup(x) for x in extract_tran_from_span(article_tran.find("h2"))]
                    res.update(dict(zip(words, trans)))

                elif tag in inflected_tags:
                    table_word = article_word.find("table", {"class": "table"})
                    table_tran = article_tran.find("table", {"class": "table"})

                    if table_word and table_tran:
                        rows_words = table_word.find_all(exact('tr', 'row'))
                        rows_trans = table_tran.find_all(exact('tr', 'row'))

                        for row_word, row_tran in zip(rows_words, rows_trans):
                            cells_words = row_word.find_all(exact('td', 'cell'))
                            cells_trans = row_tran.find_all(exact('td', 'cell'))

                            for cell_word, cell_tran in zip(cells_words, cells_trans):
                                # A cell may hold several forms; walk its children one by one.
                                words = [form
                                         for el in cell_word.contents
                                         for form in extract_word_from_span(el)]
                                trans = [fix_sup(tran)
                                         for el in cell_tran.contents
                                         for tran in extract_tran_from_span(el)]
                                res.update(dict(zip(words, trans)))

        return res
    except Exception:
        # Best effort: report the word as failed instead of aborting the whole run.
        # KeyboardInterrupt / SystemExit are deliberately NOT caught, so the scrape
        # can still be stopped by the user.
        return word

In [8]:
def split_phonemes(phonemes):
    """Split a transcription string into a list of phoneme tokens.

    Rules, applied character by character:

    * ``{...}`` groups are collected and emitted as one token, braces included;
    * a combining acute accent (U+0301) or a colon is glued to the previous token;
    * a backtick or a right single quote (U+2019) is glued to the previous
      token as a plain apostrophe ``'``;
    * any other character outside braces becomes a token of its own.

    A group that is opened but never closed is not emitted.
    """
    stress_mark = '\u0301'
    length_mark = ':'
    softness_marks = ('`', '\u2019')

    tokens = []
    pending_group = ''
    inside_braces = False

    for ch in phonemes:
        if ch == '{':
            inside_braces = True
            pending_group += ch
        elif ch == '}':
            inside_braces = False
            tokens.append(pending_group + ch)
            pending_group = ''
        elif ch == stress_mark or ch == length_mark:
            # Modifier: belongs to the token emitted just before it.
            tokens[-1] += ch
        elif ch in softness_marks:
            # Both softness spellings are normalised to an apostrophe.
            tokens[-1] += "'"
        elif inside_braces:
            pending_group += ch
        else:
            tokens.append(ch)

    return tokens

#### small manual tests

In [9]:
parse_word("натщесерце")

{'натще́серце': 'нач:че́с{е^и}рц{е^и}'}

In [10]:
split_phonemes('нач:че́с{е^и}рц{е^и}')

['н', 'а', 'ч:', 'ч', 'е́', 'с', '{е^и}', 'р', 'ц', '{е^и}']

In [11]:
# parse_word("дзюрчання")

In [12]:
# parse_word("Аахен")

In [13]:
# parse_word("іл")

In [14]:
# parse_word("під'їхав")

In [15]:
# parse_word("йому")

In [16]:
# parse_word("п'ятдесят")

In [17]:
# parse_word("читає")

In [18]:
# parse_word("шпаклюватимемо")

### Ukrainian vocabulary

In [19]:
# https://github.com/brown-uk/dict_uk/releases

In [20]:
vocab = []

# Name the encoding explicitly so the platform default is never used for the
# Cyrillic dictionary (presumably UTF-8 -- confirm against the dict_uk release).
with open('../data/dict_corp_vis.txt', 'r', encoding='utf-8') as dict_file:
    for line in dict_file:
        # Lines that start with a space are skipped; only unindented lines are kept.
        if line.startswith(" "):
            continue
        fields = line.split()
        # A blank line has no fields; indexing it would raise IndexError.
        if fields:
            # The first whitespace-separated field is the word itself.
            vocab.append(fields[0])

# One-character entries are dropped from the working vocabulary.
filtered_vocab = [word for word in vocab if len(word) > 1]

print(f'Number of words in vocabulary: {len(vocab)}')
print(f'Number of words after filtering short words: {len(filtered_vocab)}')

Number of words in vocabulary: 416657
Number of words after filtering short words: 416634


In [21]:
filtered_vocab[1000:1010]

['авіамістечко',
 'авіамоделізм',
 'авіамоделіст',
 'авіамодель',
 'авіамодельний',
 'авіамоделювання',
 'авіамоніторинг',
 'авіамотор',
 'авіамоторист',
 'авіамоторний']

In [22]:
# parse_word("абдукторний")

Here we face a problem. Some Ukrainian words can be stressed in different places (e.g. `замок`, which means either "castle" or "lock" depending on the stress). On the one hand, we have to treat them somehow; on the other, their share of the total vocabulary is so small that it is easier to ignore them and assume that only one stress position is available for each such word.

In [23]:
# Letters treated as consonants (the soft sign "ь" is kept in this group).
consonants = list("бвгґджзйклмнпрстфхцчшщь")
# Letters treated as vowels.
vowels = list("аеєиіїоуюя")

In [24]:
def get_stress_pos(word):
    """Locate the stressed vowel of ``word``.

    The stress is read from a combining acute accent (U+0301) that sits one
    or two characters after the vowel it marks.

    Parameters
    ----------
    word : str
        Word that may contain a stress mark. Vowels are recognised through
        the module-level ``vowels`` list.

    Returns
    -------
    int or None
        * the index (within ``word`` as given) of the stressed vowel;
        * ``None`` when no usable mark was found but the word has exactly
          one vowel;
        * ``-1`` when the position cannot be determined.
    """
    vowel_count = 0
    for i, letter in enumerate(word):
        if letter in vowels:
            vowel_count += 1
        if ord(letter) == 769:  # U+0301 COMBINING ACUTE ACCENT
            # Look back one, then two characters -- but only at indexes that
            # really exist. A negative index would silently wrap around to
            # the end of the word and report a bogus position.
            for back in (1, 2):
                if i - back >= 0 and word[i - back] in vowels:
                    return i - back
    if vowel_count == 1:
        return None
    return -1

In [25]:
get_stress_pos("шпаклю́є")

5

### Parse the data

In [26]:
def json2gzip(data, path):
    """Serialise ``data`` as JSON and store it gzip-compressed at ``path``.

    ``".json.gz"`` is appended to ``path`` unless it already ends with it.
    The stored payload is the UTF-8 encoded JSON text followed by a newline.
    """
    target = path if path.endswith(".json.gz") else path + ".json.gz"
    payload = (json.dumps(data) + "\n").encode('utf-8')

    with gzip.open(target, 'wb') as fout:
        fout.write(payload)

def gzip2json(path):
    """Read the gzip file at ``path`` and return the JSON document it contains.

    The decompressed bytes are decoded as UTF-8 before being parsed.
    """
    with gzip.open(path, 'rb') as fin:
        raw = fin.read()
    return json.loads(raw.decode('utf-8'))

In [27]:
def gentle_parsing(vocab, use_tqdm=False):
    """Run ``parse_word`` over ``vocab`` and collect stress and phoneme data.

    Parameters
    ----------
    vocab : iterable of str
        Words to look up.
    use_tqdm : bool
        Wrap the iteration in a ``tqdm`` progress bar.

    Returns
    -------
    tuple
        ``(total_res, failed_stress, failed_words)`` where

        * ``total_res`` maps each form (stress mark removed) to
          ``{"stress_pos": ..., "phones": [...]}``;
        * ``failed_stress`` lists forms for which ``get_stress_pos`` gave ``-1``;
        * ``failed_words`` lists input words for which ``parse_word`` did not
          return a dict.
    """
    total_res, failed_stress, failed_words = {}, [], []

    iterator = tqdm(vocab) if use_tqdm else vocab

    for item in iterator:
        # Already collected as a form of an earlier word: no need to fetch it again.
        if item.lower() in total_res:
            continue

        parsed_words = parse_word(item)
        # Optional throttling, currently disabled:
        # sleep(np.min([np.random.random(), 0.5]))  # between 0 and 0.5

        if not isinstance(parsed_words, dict):
            failed_words.append(item)
            continue

        for word, tran in parsed_words.items():
            word_stress_pos = get_stress_pos(word.lower())
            if word_stress_pos == -1:
                failed_stress.append(word)
                continue
            # Store the form without its combining accent.
            plain_word = word.replace(chr(769), "")
            total_res[plain_word] = {"stress_pos": word_stress_pos, "phones": split_phonemes(tran)}

    return total_res, failed_stress, failed_words

In [28]:
alphabet = ['а', 'б', 'в', 'г', 'ґ', 'д', 'е', 'є', 'ж', 'з', 'и', 
            'і', 'ї', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 
            'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ю', 'я']

# Group the vocabulary by lower-cased first letter: {letter: [words...]}.
letter_batches = {}
for word in tqdm(filtered_vocab):
    letter = word[0].lower()
    # Append in place. Rebuilding the list with `get(...) + [word]` copies the
    # whole batch on every word, which is quadratic in the batch size.
    letter_batches.setdefault(letter, []).append(word)

100%|██████████| 416634/416634 [00:22<00:00, 18755.34it/s]


In [29]:
# Reuse the cached scrape when it exists; otherwise scrape every letter batch in parallel.
try:
    final_res = gzip2json('../data/phone_data.json.gz')
except FileNotFoundError:
    # Hand each worker its own shallow copy of a letter batch.
    tmp = [batch[:] for batch in letter_batches.values()]
    tmp_res = process_map(gentle_parsing, tmp, max_workers=12)

    # Every worker returns a 3-tuple; only its first element (the parsed dict) is merged and cached.
    final_res = {}
    for batch_result in tmp_res:
        final_res.update(batch_result[0])

    json2gzip(final_res, '../data/phone_data.json.gz')

In [30]:
# failed_words = [x for item in tmp_res for x in item[2] if item[2]]
# failed_words[:10]

In [31]:
# failed_words_res = gentle_parsing(failed_words, use_tqdm=True)

In [32]:
# failed_stress = [x for item in tmp_res for x in item[1] if item[1]]
# failed_stress[:10]

In [37]:
len(list(final_res.keys()))

2256088

In [38]:
len(set([item.lower() for item in final_res.keys()]))

2231982

In [40]:
# Lower-case every key and sort, so equal strings end up at consecutive indexes.
a = sorted(list([item.lower() for item in final_res.keys()]))
# A value that occurs more than once occupies both an even and an odd index, so it
# shows up in both slices; a value that occurs once shows up in only one of them.
dupes = list(set(a[::2]) & set(a[1::2]))

In [49]:
final_res.get("приходе")

{'stress_pos': 4, 'phones': ['п', 'р', '{и^е}', 'х', 'о́', 'д', '{е^и}']}

In [41]:
dupes

['обриві',
 'амурам',
 'мельником',
 'недашківському',
 'загірним',
 'борозні',
 'хуторові',
 'малкою',
 'головецька',
 'вирва',
 'толок',
 'трудолюбиве',
 'ліснику',
 'балкове',
 'маху',
 'чупринко',
 'борщівко',
 'молоткову',
 'горяному',
 'чемберленами',
 'сміливе',
 'ділового',
 'стояни',
 'греблями',
 'польовому',
 'свидницькі',
 'рогівським',
 'гримучим',
 'вільшаної',
 'степ',
 'вовківську',
 'комишу',
 'симбірську',
 'русинам',
 'аланках',
 'любарської',
 'глауберові',
 'черемушкам',
 'марко',
 'приходе',
 'кілкам',
 'спасителя',
 'сомів',
 'музику',
 'ясенок',
 'шнурках',
 'скорин',
 'поганину',
 'соловських',
 'копана',
 'верхньодніпровську',
 'блощице',
 'паливоду',
 'солодичем',
 'ікра',
 'батальному',
 'плоскій',
 'біленьку',
 'грунським',
 'полонська',
 'ситним',
 'жижкам',
 'стирка',
 'луки',
 'стадне',
 'чорноголової',
 'глинці',
 'даченського',
 'гордієве',
 'квасникові',
 'шкураті',
 'тернівки',
 'рабові',
 'терени',
 'герою',
 'сидорових',
 'землю',
 'вигоне',
 'зими