In [34]:
import re
import os
import json
import gzip
import requests
import random
import logging

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from copy import deepcopy
from time import sleep
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

In [2]:
# Web page that publishes a table of free proxy servers; it is scraped to build the proxy pool.
proxy_list_url = 'https://www.ip-adress.com/proxy-list'

In [3]:
# Download the proxy page, parse its HTML tables and keep the first one (requires network access).
proxy_df = pd.read_html(proxy_list_url, skiprows=0)[0]

In [4]:
# Preview the first rows of the scraped proxy table.
proxy_df.head()

Unnamed: 0,Proxy,Type,Country,Last Checked
0,128.199.151.21:80,highly-anonymous,Singapore,4 hours ago
1,113.253.239.102:80,transparent,Hong Kong,5 hours ago
2,113.253.239.102:8080,transparent,Hong Kong,5 hours ago
3,41.65.168.66:8080,highly-anonymous,Egypt,5 hours ago
4,113.253.239.102:3129,transparent,Hong Kong,5 hours ago


In [5]:
# NumPy array with the values of the `Proxy` column ("ip:port" strings in the preview above).
proxy_list = proxy_df['Proxy'].values

In [6]:
def get_url_content(url, proxy=[], timeout=30):
    """Download ``url`` and return the raw response body.

    When ``proxy`` is non-empty, randomly chosen proxies are tried one after
    another until a request succeeds. If every proxy fails (or none was
    given), the URL is requested directly.

    Args:
        url: Address to fetch.
        proxy: Sequence of proxy addresses that are interpolated into
            ``http://{address}``. The sequence is copied and never mutated,
            so the mutable default is harmless.
        timeout: Seconds to wait for each individual request before it is
            treated as failed. Without a timeout a dead server would block
            the caller forever.

    Returns:
        The ``content`` attribute (bytes) of the first successful response.

    Raises:
        Whatever ``requests.get`` raises for the final, direct request.
    """
    headers = {'User-Agent': 'Chrome'}
    # Work on a private copy so failed proxies can be dropped locally.
    candidates = list(proxy)

    while candidates:
        # Chosen outside the ``try`` so the name is always bound in the handler.
        random_proxy_server = np.random.choice(candidates)
        # NOTE(review): only the "http" scheme is mapped, so requests to
        # https:// URLs presumably bypass the proxy entirely - confirm intent.
        proxies = {"http": f"http://{random_proxy_server}"}
        try:
            return requests.get(url, proxies=proxies, headers=headers, timeout=timeout).content
        except Exception:
            logging.warning('Failed to use %s proxy. Trying to use another one.....', random_proxy_server)
            candidates.remove(random_proxy_server)

    # No proxy supplied, or every proxy failed: fall back to a direct request.
    return requests.get(url, headers=headers, timeout=timeout).content

In [7]:
goroh_url = "https://goroh.pp.ua/Транскрипція/{word}"

def parse_goroh(word):
    """Scrape the transcription of ``word`` from goroh.pp.ua.

    The page's ``div.taglist`` entries decide where the transcription is read from:

    * "іменник" / "прикметник" (noun / adjective): the second cell of the
      table row whose first cell is "називний" (nominative);
    * "прислівник" / "дієслово" (adverb / verb): the bracketed text in the
      ``div.page__sub-header`` element.

    Args:
        word: Word to look up; it is substituted into ``goroh_url``.

    Returns:
        The transcription string with ``<sup>`` markup rewritten by
        ``fix_sup``. An empty string is returned when no branch matches, and
        whatever was extracted so far (usually ``''``) when any step fails.
    """
    phonemes = ''

    try:
        html = get_url_content(goroh_url.format(word=word), proxy=proxy_list)
        # NOTE(review): no parser is named, so bs4 picks whichever one is installed.
        soup = BeautifulSoup(html)
        # ``.string`` is None for a child that wraps several nodes; treat it as
        # empty instead of aborting the whole parse with an AttributeError.
        tags = set(filter(None, [(item.string or '').strip()
                                 for item in soup.find_all("div", class_="taglist")[0]]))
        if set(["іменник", "прикметник"]).intersection(tags):
            table = soup.find_all("table", class_="table")[0].find_all("tr", class_="row")
            table_content = [[cell.contents[0] for cell in row.find_all("td")] for row in table]
            cell_content = list(filter(lambda x: x[0] == 'називний', table_content))[0][1]
            phonemes = re.findall(r'\>\[(.*)\]\<\/span\>', str(cell_content))[0]
        elif set(["прислівник", "дієслово"]).intersection(tags):
            tag_content = soup.find_all("div", class_="page__sub-header")[0]
            phonemes = re.findall(r'\s+\[(.*)]\s+', str(tag_content))[0]

        return fix_sup(phonemes)
    except Exception:
        # Best-effort scraping: a missing page or unexpected markup yields an
        # empty result. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit able to stop a long run.
        logging.debug("Could not parse transcription for %r", word, exc_info=True)
        return phonemes

    
def fix_sup(phonemes):
    """Rewrite HTML superscript pairs as brace notation.

    Every ``X<sup>Y</sup>`` sequence, where ``X`` and ``Y`` are single word
    characters, becomes ``{X^Y}``. Text without such markup is returned as is.

    Args:
        phonemes: Transcription string that may contain ``<sup>`` markup.

    Returns:
        The string with all superscript pairs converted.
    """
    def _as_complex_phone(match):
        # The two captured letters: base sound and its superscript shade.
        return "{%s^%s}" % match.groups()

    return re.sub(r'(\w)<sup>(\w)</sup>', _as_complex_phone, phonemes)

In [8]:
# Smoke test: scrape and display the transcription of a single word (requires network access).
tmp = parse_goroh("вживати")
tmp

'ўж{и^е}ва́т{и^е}'

In [9]:
def split_phonemes(phonemes):
    """Split a transcription string into a list of phones.

    Rules:

    * ``{...}`` groups are kept together as one phone;
    * the combining acute accent (U+0301, stress) and ``:`` (length) are
      appended to the phone they follow;
    * a backtick or U+2019 (softening) is appended as ``'``;
    * any other character is a phone of its own.

    A mark met inside a ``{...}`` group is attached to that group rather than
    to the previously emitted phone. A mark with no preceding phone is kept as
    a stand-alone element instead of raising ``IndexError``. An unterminated
    ``{...`` group is emitted as is instead of being dropped.

    Args:
        phonemes: Transcription string; may be empty.

    Returns:
        List of phone strings (empty for empty input).
    """
    stress_mark = '\u0301'         # combining acute accent, code point 769
    soft_marks = ('`', '\u2019')   # code points 96 and 8217
    long_mark = ':'                # code point 58

    res = []
    complex_phone = False
    phone_candidate = ''

    for letter in phonemes:
        if letter == '{':
            complex_phone = True
            phone_candidate += letter
        elif letter == '}':
            complex_phone = False
            res.append(phone_candidate + letter)
            phone_candidate = ''
        elif letter == stress_mark or letter == long_mark or letter in soft_marks:
            # Softening marks are normalised to a plain apostrophe.
            mark = "'" if letter in soft_marks else letter
            if complex_phone:
                phone_candidate += mark
            elif res:
                res[-1] += mark
            else:
                # Nothing to attach to: keep the mark rather than crash.
                res.append(mark)
        elif complex_phone:
            phone_candidate += letter
        else:
            res.append(letter)

    if phone_candidate:
        # Unterminated "{..." group: keep what was collected.
        res.append(phone_candidate)
    return res

In [10]:
vocab = []

# Collect the first token of every line that does not start with a space.
# presumably space-indented lines are word forms listed under a headword - TODO confirm.
# The encoding is explicit because the dictionary holds Cyrillic text and the
# platform default encoding is not UTF-8 everywhere.
with open('../data/dict_corp_vis.txt', 'r', encoding='utf-8') as dict_file:
    for line in dict_file:
        # Blank lines have no first token; skip them instead of raising IndexError.
        if line.startswith(" ") or not line.strip():
            continue
        vocab.append(line.split()[0])

# Drop words of one or two characters.
filtered_vocab = [word for word in vocab if len(word) > 2]

print(f'Number of words in vocabulary: {len(vocab)}')
print(f'Number of words after filtering short words: {len(filtered_vocab)}')

Number of words in vocabulary: 246391
Number of words after filtering short words: 246184


In [11]:
def gentle_parsing(word, delay=0.2):
    """Scrape and split the transcription of ``word``, pausing first.

    The pause throttles the request rate when many workers call this
    function in parallel.

    Args:
        word: Word to look up with ``parse_goroh``.
        delay: Seconds to sleep before the request (default 0.2).

    Returns:
        Tuple ``(word, phones)`` where ``phones`` is the list produced by
        ``split_phonemes`` (empty when nothing could be parsed).
    """
    sleep(delay)
    phonemes = split_phonemes(parse_goroh(word))
    return (word, phonemes)

In [12]:
# Number of words handed to one `process_map` call.
batch_size = 1024
# Full run: derive the batch count from the vocabulary size.
# num_batches = int(len(filtered_vocab) / batch_size)
# Reduced setting for a quick trial run.
# NOTE(review): the scraping loop iterates `range(num_batches+1)`, so 1 here means two batches.
num_batches = 1

In [13]:
# Accumulates (word, phones) tuples from every processed batch.
parsing_res = []

# Scrape the vocabulary batch by batch; each batch is spread over 12 worker processes.
for batch_num in tqdm(range(num_batches+1)):
    batch_start = batch_size * batch_num
    batch_words = filtered_vocab[batch_start:batch_start + batch_size]
    tmp_res = process_map(gentle_parsing, batch_words, max_workers=12)
    parsing_res.extend(tmp_res)
    # Pause between batches to go easy on the remote server.
    sleep(5)

  0%|          | 0/3 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, max=1024.0), HTML(value='')))




 33%|███▎      | 1/3 [00:43<01:27, 43.98s/it]

HBox(children=(FloatProgress(value=0.0, max=1024.0), HTML(value='')))




 67%|██████▋   | 2/3 [01:20<00:41, 41.82s/it]

HBox(children=(FloatProgress(value=0.0, max=1024.0), HTML(value='')))




100%|██████████| 3/3 [01:54<00:00, 38.10s/it]


In [48]:
# Keep only the words for which at least one phone was parsed.
phone_res = [(word, phones) for word, phones in parsing_res if len(phones) > 0]

In [49]:
# Turn the (word, phones) pairs into a list of records with "word" and "phones" keys.
phone_data = [{"word": word, "phones": phones} for word, phones in phone_res]

In [50]:
# Tabular view of the parsed records for inspection.
phone_df = pd.DataFrame(phone_data)

# Show ten random rows; no seed is set, so the rows differ between runs.
phone_df.sample(10)

Unnamed: 0,word,phones
1833,Акименко,"[а, к, и́, м, {е^и}, н, к, о]"
1918,акробатичність,"[а, к, р, о, б, а, т, и́, ч, н', і, с', т']"
381,авіабудівний,"[а, в', і, а, б, у, д', і, ў, н, и́, й]"
2258,Алеппо,"[а, л, е́, п:, о]"
1125,аграфія,"[а, г, р, а, ф', і́, й, а]"
1671,азіатський,"[а, з', і, а́, ц', к, и, й]"
186,абсолютизуючи,"[а, б, с, о, л', у, т, {и^е}, з, у́, й, у, ч, ..."
1574,аеротермометр,"[а, {е^и}, р, о, т, {е^и}, р, м, о́, м, {е^и},..."
516,Аврамівка,"[а, ў, р, а́, м', і, ў, к, а]"
2114,акцентовано,"[а, к, ц, {е^и}, н, т, у, в, а́, т, {и^е}]"


In [52]:
# Print one random row as a raw array so the whole phone list is visible.
print(phone_df.sample(1).values)

[['акомодуватися'
  list(['а', 'к', 'о', 'м', 'о', 'д', 'у', 'в', 'а́', 'т', '{и^е}', "с'"])]]


In [None]:
def json2gzip(data, path):
    """Serialise ``data`` as JSON and store it gzip-compressed.

    Args:
        data: Any object accepted by ``json.dumps``.
        path: Destination path (string). ``".json.gz"`` is appended when the
            path does not already end with it.

    The written payload is the JSON text followed by a newline, UTF-8 encoded.
    """
    suffix = ".json.gz"
    target = path if path.endswith(suffix) else path + suffix
    payload = (json.dumps(data) + "\n").encode('utf-8')

    with gzip.GzipFile(target, 'w') as archive:
        archive.write(payload)

def gzip2json(path):
    """Load a gzip-compressed, UTF-8 encoded JSON file.

    ``json2gzip`` appends ``".json.gz"`` to paths that lack it. To stay
    symmetric, a ``path`` that does not exist and lacks the suffix is retried
    with the suffix appended. Existing paths are always read as given.

    Args:
        path: Path (string) of the file to read.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If neither candidate path exists.
    """
    suffix = ".json.gz"
    if not os.path.exists(path) and not path.endswith(suffix):
        path += suffix

    with gzip.GzipFile(path, 'r') as fin:
        data = json.loads(fin.read().decode('utf-8'))
    return data

In [55]:
# Persist the parsed records as gzip-compressed JSON.
json2gzip(phone_data, '../data/phone_data.json.gz')

In [56]:
# Read the file back to check the round trip.
# NOTE(review): this rebinds `phone_data` to the copy loaded from disk.
phone_data = gzip2json('../data/phone_data.json.gz')

In [59]:
# Spot-check a single reloaded record.
phone_data[42]

{'word': 'абеткувати',
 'phones': ['а', 'б', '{е^и}', 'т', 'к', 'у', 'в', 'а́', 'т', '{и^е}']}