# URL

In [None]:
# import urllib.request
# url = 'http://cirtec.ranepa.ru/analysis/Word2Vec/citcon4bundles.txt'
# response = urllib.request.urlopen(url)
# data = response.read()      # a `bytes` object

# with open('../initial_data/Word2Vec__citcon4bundles.txt', 'wb') as f:
#     f.write(data)

# FILE

In [None]:
def get_average_len_words(sentence):
    """Return the mean length, in characters, of the words in *sentence*.

    Words are the pieces produced by splitting *sentence* on whitespace.
    When there are no words (empty or whitespace-only input) the integer
    ``0`` is returned; otherwise the result is a float.
    """
    tokens = sentence.split()
    if not tokens:
        return 0
    total_chars = 0
    for token in tokens:
        total_chars += len(token)
    return total_chars / len(tokens)

In [None]:
%%time
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
with open('../initial_data/Word2Vec__citcon4bundles.txt', 'rb') as f:
    data = f.read()
text = data.decode('utf-8')
del data  # the raw bytes are no longer needed once decoded
print("len_text: {}".format(len(text)))

# Keep only non-empty lines whose average word length exceeds 5, split each
# one into at most five space-separated fields, and normalise the fifth field:
# drop every "\xad " sequence (soft hyphen followed by a space) and lowercase.
sentences = [
    tuple(fields[:4]) + (fields[4].replace("\xad ", "").lower(),)
    for fields in (
        line.split(" ", 4)
        for line in text.split("\n")
        if line and get_average_len_words(line) > 5
    )
]
del text  # free the full decoded text; only the parsed tuples are kept
print("len_sentences: {}".format(len(sentences)))
print(sentences[0])

In [None]:
%%time
import string
import re
import pymorphy2

# One analyzer instance shared by the whole corpus.
morph = pymorphy2.MorphAnalyzer()

# Matches every character that is NOT a Cyrillic letter. "ё" and "Ё" have to
# be listed explicitly: their code points lie outside the contiguous а-я and
# А-Я ranges, so the bare ranges would treat them as separators and cut words
# such as "ещё" into pieces.
regex = re.compile(r'[^а-яА-ЯёЁ]')

# Surface form -> normal form. The same word occurs many times in the corpus,
# so each distinct word is analysed only once.
_normal_form_cache = {}


def _normal_form(word):
    """Return the normal form of the first pymorphy2 parse of *word* (memoised)."""
    try:
        return _normal_form_cache[word]
    except KeyError:
        normal = morph.parse(word)[0].normal_form
        _normal_form_cache[word] = normal
        return normal


# For every sentence keep its first field together with the list of normal
# forms of the Cyrillic-only tokens found in its fifth (text) field.
words = [
    (sentence[0], [_normal_form(token) for token in regex.sub(' ', sentence[4]).split()])
    for sentence in sentences
]

In [None]:
from gensim.models.phrases import Phrases

def get_n_gram(_words, n, delimiters=(b"@", b"#", b"$", b"%"), min_count=20, threshold=10):
    """Fit a chain of ``n - 1`` ``Phrases`` models and return them as a list.

    The first model is fitted on *_words*; each following model is fitted on
    the corpus obtained by applying the previous model to its own input, and
    uses the next entry of *delimiters* as its joining delimiter.

    Args:
        _words: corpus handed to the first ``Phrases`` model.
        n: target n-gram order; must be at least 2.
        delimiters: one delimiter per model, at least ``n - 1`` entries.
        min_count: forwarded to every ``Phrases`` model.
        threshold: forwarded to every ``Phrases`` model.

    Returns:
        List of the ``n - 1`` fitted models, in fitting order.

    Raises:
        ValueError: if ``n < 2`` or fewer than ``n - 1`` delimiters are given.
    """
    if n < 2:
        raise ValueError(" n < 2 ")
    levels = n - 1
    if len(delimiters) < levels:
        raise ValueError(" len(delimiters) < n-1 ")

    models = []
    corpus = _words
    last_level = levels - 1
    for level in range(levels):
        model = Phrases(corpus, min_count=min_count, delimiter=delimiters[level], threshold=threshold)
        models.append(model)
        # The last model's output is never used for further fitting, so it is not applied.
        if level < last_level:
            corpus = model[corpus]
    return models


def drop_all_delimiters(text, delimiters=("@", "#", "$", "%")):
    """Return *text* with every occurrence of each delimiter replaced by one space.

    Delimiters are processed in the given order; each may be longer than
    one character.
    """
    cleaned = text
    for mark in delimiters:
        cleaned = cleaned.replace(mark, " ")
    return cleaned


def get_gram_vocab(grams, n, delimiters=("@", "#", "$", "%")):
    """List the n-grams in the vocabulary of model ``grams[n - 2]`` with their counts.

    Vocabulary keys are decoded from UTF-8 bytes. A key is kept only when it
    contains every one of the first ``n - 1`` delimiters; in the result those
    delimiters are replaced by spaces.

    Args:
        grams: sequence of models exposing a ``vocab`` mapping of bytes keys to counts.
        n: n-gram order to extract; must be at least 2.
        delimiters: text delimiters, at least ``n - 1`` entries.

    Returns:
        List of ``(phrase, count)`` tuples sorted by count, highest first.

    Raises:
        ValueError: if ``n < 2`` or fewer than ``n - 1`` delimiters are given.
    """
    if n < 2:
        raise ValueError(" n < 2 ")
    joins = n - 1
    if len(delimiters) < joins:
        raise ValueError(" len(delimiters) < n-1 ")

    used = delimiters[:joins]
    decoded = [(raw.decode('utf8'), count) for raw, count in dict(grams[n - 2].vocab).items()]

    entries = []
    for key, count in decoded:
        # Keep only the keys that contain each of the delimiters in use.
        if all(mark in key for mark in used):
            entries.append((drop_all_delimiters(key, used), count))

    entries.sort(key=lambda pair: pair[1], reverse=True)
    return entries

In [None]:
%%time

# Fit the chained phrase models for n = 4 (three models) on the per-sentence
# lists of normal forms.
grams = get_n_gram(
    _words=[tokens for _, tokens in words],
    n=4,
    delimiters=(b"@", b"#", b"$", b"%"),
)

In [None]:
# Apply the fitted models in sequence: each one is applied to the output of
# the previous one, starting from the per-sentence lists of normal forms.
t2 = grams[0][[tokens for _, tokens in words]]
t3 = grams[1][t2]
t4 = grams[2][t3]

In [None]:
def get_phrases(_words, n, delimiters=("@", "#", "$", "%")):
    """Keep, per sentence, only the tokens that contain each of the first ``n - 1`` delimiters.

    Args:
        _words: iterable of sentences, each an iterable of string tokens.
        n: n-gram order; the first ``n - 1`` entries of *delimiters* are required.
        delimiters: delimiter strings looked for inside each token.

    Returns:
        A new list with one list of surviving tokens per input sentence,
        in the original order.
    """
    required = n - 1

    def has_all_delimiters(token):
        # Check delimiters in order and stop at the first one that is missing.
        for position in range(required):
            if delimiters[position] not in token:
                return False
        return True

    return [[token for token in sentence if has_all_delimiters(token)] for sentence in _words]

In [None]:
%%time
# Pair every sentence's first field with the 3-gram / 4-gram tokens found in it.
phrases_t3 = list(zip((key for key, _ in words), get_phrases(t3, 3)))
phrases_t4 = list(zip((key for key, _ in words), get_phrases(t4, 4)))

In [None]:
from collections import Counter

# Distinct first fields of all sentences. Sorted so the order (and therefore
# positional lookups such as unique[10] and the key order of anything built
# from this list) is the same on every run; iterating a bare set of strings
# depends on per-process hash randomisation.
unique = sorted({word[0] for word in words})

In [None]:
# Per key in `unique`, count how often each 3-gram token occurs across all
# sentences with that key. One pass over phrases_t3 replaces re-scanning the
# whole list once per key; counts and first-seen order are unchanged.
dict_res_3 = {un: Counter() for un in unique}

for key, found in phrases_t3:
    counter = dict_res_3.get(key)
    if counter is not None:  # entries whose key is not in `unique` are ignored
        counter.update(found)

In [None]:
# Per key in `unique`, count how often each 4-gram token occurs across all
# sentences with that key. One pass over phrases_t4 replaces re-scanning the
# whole list once per key; counts and first-seen order are unchanged.
dict_res_4 = {un: Counter() for un in unique}

for key, found in phrases_t4:
    counter = dict_res_4.get(key)
    if counter is not None:  # entries whose key is not in `unique` are ignored
        counter.update(found)

In [None]:
# Spot check: show the 4-gram counts of one sample key (the one at index 10
# of `unique`), ordered from most to least frequent.
dict_res_4[unique[10]].most_common()

# Create JSON

In [None]:
# For every key, list its n-gram tokens from most to least frequent, with the
# joining delimiters turned back into spaces (counts are dropped).
res_3 = {
    un: [drop_all_delimiters(phrase) for phrase, _ in dict_res_3[un].most_common()]
    for un in unique
}
res_4 = {
    un: [drop_all_delimiters(phrase) for phrase, _ in dict_res_4[un].most_common()]
    for un in unique
}

In [None]:
import json

In [None]:
# Write both result mappings as JSON; keys with an empty phrase list are
# stored as null instead of [].
with open("../resulting_data/json_grams/3_gram_normalize.json", "w") as f:
    json.dump({un: (found or None) for un, found in res_3.items()}, f)
with open("../resulting_data/json_grams/4_gram_normalize.json", "w") as f:
    json.dump({un: (found or None) for un, found in res_4.items()}, f)