## SynTagRus Rank

In [1]:
# Extract the word forms from a UD_Russian-SynTagRus file and count their frequencies.
def frequency_list_extract(filename):
    """Build a ``form -> count`` dictionary from a tab-separated file.

    The file is read as UTF-8. A line contributes to the counts only when it
    splits into exactly 10 tab-separated cells; the form is the second cell.
    Every other line is ignored.
    """
    counts = {}
    with open(filename, 'r', encoding='utf-8') as source:
        for raw_line in source:
            cells = raw_line.split('\t')
            # A line without any tab yields a single cell, so this one check
            # also rejects tab-free lines.
            if len(cells) != 10:
                continue
            word = cells[1]
            counts[word] = counts.get(word, 0) + 1
    return counts

In [2]:
# Build the frequency dictionary from the UD_Russian-SynTagRus training set.
vocab = frequency_list_extract('../../../../UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu')

In [3]:
# Frequency list as (count, form) pairs, sorted in descending order.
freq = sorted(((count, form) for form, count in vocab.items()), reverse=True)

In [4]:
# Show the first ten entries of the descending list (the most frequent forms).
freq[0:10]

[(70048, ','),
 (45571, '.'),
 (22727, 'в'),
 (21243, 'и'),
 (16177, '"'),
 (12898, '-'),
 (10531, 'на'),
 (10434, 'не'),
 (7464, 'что'),
 (6833, 'с')]

In [5]:
# Frequency list as (count, form) pairs, sorted in ascending order.
# https://www.geeksforgeeks.org/sort-in-python/
freq_ascend = [(count, form) for form, count in vocab.items()]
freq_ascend.sort()

In [6]:
# Show the first ten entries of the ascending list (the least frequent forms).
freq_ascend[0:10]

[(1, '+7-2=2'),
 (1, '+7°C'),
 (1, '+8-6'),
 (1, '-2,45'),
 (1, '0,01'),
 (1, '0,04'),
 (1, '0,06'),
 (1, '0,25'),
 (1, '0,26'),
 (1, '0,32')]

In [7]:
# Save the frequency dictionary to freq.txt, one "count<TAB>form" entry per line.
# The context manager guarantees the handle is closed even if a write fails,
# and plain 'w' is enough because the file is only written here, never read.
# NOTE: the platform default encoding is kept on purpose, because
# frequency_list_unpack() reads freq.txt back with the default encoding too.
with open('freq.txt', 'w') as fd:
    for w in vocab:
        fd.write('%d\t%s\n' % (vocab[w], w))

In [8]:
# Load a saved frequency list for the rank-counting algorithm.
def frequency_list_unpack(filename, encoding=None):
    """Read "count<TAB>form" lines and return them sorted in ascending order.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list of ``(count, form)`` tuples, with ``count`` converted to
        ``int``, sorted in ascending order.

    Raises:
        ValueError: If a non-empty line has no tab or its count is not an
            integer.
    """
    entries = []
    with open(filename, 'r', encoding=encoding) as handle:
        # Iterate the file instead of using str.splitlines(): splitlines()
        # also breaks on characters such as form feed or U+2028, which would
        # cut a form containing them in half.
        for line in handle:
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Split only on the first tab so the form is kept intact.
            count, form = line.split('\t', 1)
            entries.append((int(count), form))
    # The ranking algorithm only works correctly on a frequency list that is
    # sorted in ascending order.
    entries.sort()
    return entries

In [10]:
# Load the saved frequency list back from freq.txt (returned sorted in ascending order).
freq = frequency_list_unpack('freq.txt')

In [13]:
# Show the first ten entries of the reloaded list.
freq[0:10]

[(1, '+7-2=2'),
 (1, '+7°C'),
 (1, '+8-6'),
 (1, '-2,45'),
 (1, '0,01'),
 (1, '0,04'),
 (1, '0,06'),
 (1, '0,25'),
 (1, '0,26'),
 (1, '0,32')]

In [18]:
# Compute ranks for a frequency list.
def count_ranks(freq):
    """Assign a rank to every entry of an ascending frequency list.

    Entries with the same frequency share a rank; the rank grows by one each
    time a higher frequency is met, so rank 1 goes to the lowest frequency.

    Args:
        freq: Sequence of ``(frequency, form)`` entries. It is expected to be
            sorted by frequency in ascending order.

    Returns:
        A list of ``(rank, frequency, form)`` tuples in the input order, or an
        empty list when ``freq`` is empty.
    """
    ranks = []
    if not freq:
        # Nothing to rank; avoids indexing into an empty sequence below.
        return ranks
    rank = 1
    current = freq[0][0]  # frequency shared by the entries of the current rank
    for entry in freq:
        count, form = entry[0], entry[1]
        if count > current:
            rank += 1
            current = count
        ranks.append((rank, count, form))
    return ranks

In [19]:
# Assign a rank to every entry of the ascending frequency list.
ranks = count_ranks(freq)

In [20]:
# Show the first ten ranked entries as (rank, frequency, form).
ranks[0:10]

[(1, 1, '+7-2=2'),
 (1, 1, '+7°C'),
 (1, 1, '+8-6'),
 (1, 1, '-2,45'),
 (1, 1, '0,01'),
 (1, 1, '0,04'),
 (1, 1, '0,06'),
 (1, 1, '0,25'),
 (1, 1, '0,26'),
 (1, 1, '0,32')]

In [21]:
# Reversed view: the highest rank (441) belongs to the ',' symbol.
ranks[::-1]

[(441, 70048, ','),
 (440, 45571, '.'),
 (439, 22727, 'в'),
 (438, 21243, 'и'),
 (437, 16177, '"'),
 (436, 12898, '-'),
 (435, 10531, 'на'),
 (434, 10434, 'не'),
 (433, 7464, 'что'),
 (432, 6833, 'с'),
 (431, 4306, 'по'),
 (430, 3590, ':'),
 (429, 3490, 'к'),
 (428, 3439, 'как'),
 (427, 3258, 'а'),
 (426, 3224, 'В'),
 (425, 3058, 'из'),
 (424, 2979, 'это'),
 (423, 2691, 'для'),
 (422, 2554, ')'),
 (421, 2545, 'от'),
 (420, 2544, '('),
 (419, 2375, '?'),
 (418, 2354, 'о'),
 (417, 2275, 'за'),
 (416, 2236, 'его'),
 (415, 2201, 'он'),
 (414, 2094, 'все'),
 (413, 2093, 'то'),
 (412, 2036, 'же'),
 (411, 1877, 'но'),
 (410, 1794, 'у'),
 (409, 1760, 'И'),
 (408, 1728, 'Но'),
 (407, 1680, 'было'),
 (406, 1669, 'или'),
 (405, 1647, 'их'),
 (404, 1594, 'только'),
 (403, 1482, 'так'),
 (402, 1478, 'бы'),
 (401, 1465, 'до'),
 (400, 1429, 'А'),
 (399, 1423, 'уже'),
 (398, 1286, 'мы'),
 (397, 1271, 'я'),
 (396, 1265, 'они'),
 (395, 1254, 'еще'),
 (394, 1187, 'был'),
 (393, 1167, 'России'),
 (392, 11