# URL

In [None]:
import urllib.request
# Download the raw corpus file and store it unchanged on disk.
url = 'http://cirtec.ranepa.ru/analysis/Word2Vec/citcon4bundles.txt'
# Use the response as a context manager so the HTTP connection is closed
# even if reading fails part-way through.
with urllib.request.urlopen(url) as response:
    data = response.read()      # a `bytes` object

with open('../initial_data/Word2Vec__citcon4bundles.txt', 'wb') as f:
    f.write(data)

# FILE

Загрузка файла, подготовка данных в виде:
1) массива слов
2) массива нормализованных слов

~10 минут

In [None]:
%%time
import string
import re
import pymorphy2

# Morphological analyzer; used further down to obtain the normal form of each token.
morph = pymorphy2.MorphAnalyzer()
# Matches every character that is NOT a Russian letter. The letters "ё"/"Ё"
# are listed explicitly because they lie outside the а-я / А-Я code-point
# ranges; without them words such as "ещё" would be cut into fragments.
regex = re.compile('[^а-яА-ЯёЁ]')

def get_average_len_words(sentence):
    """Return the mean length of the whitespace-separated tokens of `sentence`.

    A string that contains no tokens (empty or whitespace only) yields 0.
    """
    tokens = sentence.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

# Read the downloaded corpus and decode it as UTF-8 text.
with open('../initial_data/Word2Vec__citcon4bundles.txt', 'rb') as f:
    data = f.read()
text = data.decode('utf-8') # a `str`; this step can't be used if data is binary
del data

# Every kept line is split into five space-separated fields. The first field
# is used later as the grouping key and the fifth one is the sentence text
# (soft-hyphen line breaks removed, lower-cased). Lines whose average token
# length is not above 5 are dropped, exactly as before.
sentences = []
skipped_lines = 0
for line in text.split("\n"):
    if not line or get_average_len_words(line) <= 5:
        continue
    fields = line.split(" ", 4)
    if len(fields) < 5:
        # A line with fewer than five fields used to raise IndexError;
        # skip it and report the count instead of aborting the whole run.
        skipped_lines += 1
        continue
    sentences.append((fields[0], fields[1], fields[2], fields[3],
                      fields[4].replace("\xad ", "").lower()))
del text
print("len_sentences: {}".format(len(sentences)))
if skipped_lines:
    print("skipped_malformed_lines: {}".format(skipped_lines))

# Tokenize each sentence once and derive both outputs from the same tokens.
# Normal forms are memoized per distinct token so the morphological analyzer
# is not re-run for every repetition of a word.
normal_form_cache = {}
words = []
words_normal_form = []
for sentence in sentences:
    tokens = regex.sub(' ', sentence[4]).split()
    normalized = []
    for token in tokens:
        if token not in normal_form_cache:
            normal_form_cache[token] = morph.parse(token)[0].normal_form
        normalized.append(normal_form_cache[token])
    words.append((sentence[0], tokens))
    words_normal_form.append((sentence[0], normalized))

del normal_form_cache
del sentences

In [None]:
# Remove the stray single-letter tokens "й", "ч" and "р" from both the raw
# and the normalized token lists; the key of every pair is kept as is.
words = [
    (entry[0], [token for token in entry[1] if token not in ["й", "ч", "р"]])
    for entry in words
]

words_normal_form = [
    (entry[0], [token for token in entry[1] if token not in ["й", "ч", "р"]])
    for entry in words_normal_form
]

In [None]:
from gensim.models.phrases import Phrases

def get_n_gram(_words, n, delimiters=(b"@", b"#", b"$", b"%", b"^", b"&", b"*"), min_count=20, threshold=50):
    """Train a cascade of `n - 1` gensim `Phrases` models.

    The first model is trained on `_words`; every following model is trained
    on the corpus as transformed by the model before it. Model number `k`
    (0-based) joins tokens with `delimiters[k]`.

    :param _words: iterable of token lists.
    :param n: highest phrase order; must be at least 2.
    :param delimiters: one delimiter per cascade level.
    :param min_count: passed through to `Phrases`.
    :param threshold: passed through to `Phrases`.
    :return: list with the `n - 1` trained models, in training order.
    :raises ValueError: if `n < 2` or too few delimiters are supplied.
    """
    if n < 2:
        raise ValueError(" n < 2 ")
    if len(delimiters) < n - 1:
        raise ValueError(" len(delimiters) < n-1 ")

    models = []
    corpus = _words
    last_level = n - 2
    for level in range(n - 1):
        model = Phrases(corpus, min_count=min_count, delimiter=delimiters[level], threshold=threshold)
        models.append(model)
        # The final model's output is not needed for further training.
        if level != last_level:
            corpus = model[corpus]
    return models


def drop_all_delimiters(text, delimiters=("@", "#", "$", "%", "^", "&", "*")):
    """Return `text` with every occurrence of each delimiter replaced by a space."""
    cleaned = text
    for marker in delimiters:
        cleaned = cleaned.replace(marker, " ")
    return cleaned


def get_gram_vocab(grams, n, delimiters=("@", "#", "$", "%", "^", "&", "*")):
    """List the vocabulary entries of model `grams[n - 2]` that contain all of the first `n - 1` delimiters.

    Vocabulary keys are decoded from UTF-8 bytes. An entry is kept only if
    every one of the first `n - 1` delimiters occurs in it; those delimiters
    are then replaced by spaces.

    :return: list of `(phrase, count)` tuples sorted by count, descending.
    :raises ValueError: if `n < 2` or too few delimiters are supplied.
    """
    if n < 2:
        raise ValueError(" n < 2 ")
    if len(delimiters) < n - 1:
        raise ValueError(" len(delimiters) < n-1 ")

    active = delimiters[:n - 1]
    decoded = [(raw_key.decode('utf8'), count) for raw_key, count in dict(grams[n - 2].vocab).items()]

    entries = []
    for key, count in decoded:
        if sum([dlm in key for dlm in active]) == n - 1:
            entries.append((drop_all_delimiters(key, active), count))

    entries.sort(key=lambda kv: kv[1], reverse=True)
    return entries


def get_words_with_phrases(pairs, _list_grams):
    """Apply each model in `_list_grams`, in order, to the token lists of `pairs`.

    :param pairs: sequence of `(key, tokens)` pairs.
    :param _list_grams: objects that transform a list of token lists when indexed with it.
    :return: list of `(key, transformed_tokens)` tuples in the original order.
    """
    token_lists = [entry[1] for entry in pairs]
    for model in _list_grams:
        token_lists = model[token_lists]
    keys = [entry[0] for entry in pairs]
    return list(zip(keys, token_lists))
    

def get_phrases(pairs, n, delimiters=("@", "#", "$", "%", "^", "&", "*")):
    """Keep, for every pair, only the tokens that look like phrases of order `n`.

    A token is kept when it contains each of the first `n - 1` delimiters and
    does not contain the next delimiter (`delimiters[n - 1]`), which marks a
    longer phrase.

    :param pairs: sequence of `(key, tokens)` pairs.
    :param n: phrase order to select.
    :param delimiters: delimiters in cascade order.
    :return: list of `(key, filtered_tokens)` tuples in the original order.
    """
    _phrases = [pair[1] for pair in pairs]
    for ind in range(n - 1):
        _phrases = [[word for word in sent if delimiters[ind] in word] for sent in _phrases]

    # When `n - 1` equals the number of delimiters there is no next-level
    # delimiter to exclude; indexing it used to raise IndexError.
    if n - 1 < len(delimiters):
        _phrases = [[word for word in sent if delimiters[n - 1] not in word] for sent in _phrases]
    return list(zip([pair[0] for pair in pairs], _phrases))

Обучение фразеров

~2 минуты

In [None]:
%%time

# Highest phrase order; get_n_gram trains N - 1 cascaded models per corpus.
N = 6

# One cascade for the raw tokens and one for the normalized tokens.
grams = get_n_gram(
    _words=[pair[1] for pair in words],
    n=N,
)
grams_normal_form = get_n_gram(
    _words=[pair[1] for pair in words_normal_form],
    n=N,
)

In [None]:
%%time

# Run the token lists through every trained model of the matching cascade,
# keeping the key of each pair; raw and normalized tokens use their own models.
words_with_phrases = get_words_with_phrases(words, grams)
words_normal_form_with_phrases = get_words_with_phrases(words_normal_form, grams_normal_form)

In [None]:
# For every phrase order from 2 to N, keep only the tokens that get_phrases
# selects for that order (raw and normalized variants).
phrases_words = {
    order: get_phrases(words_with_phrases, order)
    for order in range(2, N + 1)
}
phrases_words_normal_form = {
    order: get_phrases(words_normal_form_with_phrases, order)
    for order in range(2, N + 1)
}

In [None]:
%%time
from collections import Counter

# Distinct keys (first element of every pair) present in the corpus.
all_pairs_names = {pair[0] for pair in words}

def get_dict_counters_by_bunshid(pairs):
    """Count tokens per key over all `(key, tokens)` pairs.

    Every key in the module-level `all_pairs_names` gets a `Counter` (empty
    when the key has no tokens); pairs whose key is not in `all_pairs_names`
    are ignored.

    The pairs are traversed once instead of once per key, which removes the
    keys x pairs rescanning of the previous implementation while producing
    the same counters.

    :param pairs: iterable of `(key, tokens)` pairs.
    :return: dict mapping each key to a `Counter` of its tokens.
    """
    _res = {name_pair: Counter() for name_pair in all_pairs_names}
    for pair in pairs:
        counter = _res.get(pair[0])
        if counter is not None:
            counter.update(pair[1])
    return _res

# Per phrase order, count how often each selected phrase occurs under every key.
dict_counters_words = {
    order: get_dict_counters_by_bunshid(order_pairs)
    for order, order_pairs in phrases_words.items()
}
dict_counters_words_normal_form = {
    order: get_dict_counters_by_bunshid(order_pairs)
    for order, order_pairs in phrases_words_normal_form.items()
}

# Create result dict

In [None]:
def get_res_dict(__dict_counters, __pair_name, __n, __grams):
    """Build the output records for one key and one phrase order.

    Phrases are taken from `__dict_counters[__n][__pair_name]` in
    `most_common()` order. Each record holds:
      "w" - the phrase with delimiters replaced by spaces,
      "n" - its count in that counter,
      "t" - the value stored for the phrase (UTF-8 encoded) in the vocab of
            model `__grams[__n - 2]`.

    :return: list of record dicts.
    """
    records = []
    for phrase, count in __dict_counters[__n][__pair_name].most_common():
        records.append({
            "w": drop_all_delimiters(phrase),
            "n": count,
            "t": __grams[__n - 2].vocab[phrase.encode("utf-8")],
        })
    return records

# Build the final structure: for every key, one list of records per phrase
# order and per token variant ("<order>orig" / "<order>norm").
# The orders follow N instead of being spelled out by hand, so the result
# stays in sync with the models that were actually trained; for N = 6 the
# keys and their order are exactly "2orig", "2norm", ..., "6orig", "6norm".
result_dict = {
    pair_name: {
        "{}{}".format(order, suffix): get_res_dict(counters, pair_name, order, models)
        for order in range(2, N + 1)
        for suffix, counters, models in (
            ("orig", dict_counters_words, grams),
            ("norm", dict_counters_words_normal_form, grams_normal_form),
        )
    } for pair_name in all_pairs_names
}

# Create json

In [None]:
import json

# Serialize the collected phrase statistics to the results file as JSON.
with open("../resulting_data/json_grams/2-3-4-5-6_grams_v3.json", "w") as out_file:
    json.dump(result_dict, out_file)