In [38]:
from collections import Counter, defaultdict
import operator
import stringdist
import os
import re
from tqdm import tqdm

In [2]:
texts = []

for i in os.listdir('data'):
    for j in os.listdir(f'data/{i}'):
        file = os.listdir(f'data/{i}/{j}')[0]
        with open(f"data/{i}/{j}/{file}", encoding="utf-8") as f:
            texts.append(f.read().replace(u'\xa0', ' '))

In [3]:
cnt = Counter()

In [4]:
words = []
pattern = "[а-яА-Я]{3,}"

for text in texts:
    text_words = text.split()
    for word in text_words:
        corrects = re.findall(pattern, word)
        if len(corrects) == 1:
            words.append(corrects[0].lower())

In [5]:
for word in words:
    cnt[word] += 1

In [14]:
def find_nearest(word, n=10):
    result = []
    for cnt_word, count in cnt.items():
        dist = stringdist.rdlevenshtein(word, cnt_word)
        result.append((cnt_word, dist, count))
        
    result.sort(key=operator.itemgetter(1, 2))
    
    return result[:n]

In [16]:
%%time

find_nearest('призидент')

Wall time: 9.54 s


[('президент', 1, 1453),
 ('президет', 2, 1),
 ('резидент', 2, 9),
 ('президенте', 2, 24),
 ('президенты', 2, 37),
 ('президенту', 2, 109),
 ('президента', 2, 846),
 ('президентам', 3, 1),
 ('резиденту', 3, 1),
 ('президета', 3, 1)]

## Bigrams

In [18]:
bigram_index = defaultdict(list)

In [44]:
for word in set(words):
    bigrams = [word[i:i+2] for i in range(len(word)-1)]
    [bigram_index[bigram].append(word) for bigram in bigrams]

In [75]:
def find_nearest_by_index(word, n=10):
    result = []
    
    bigram_words = []
    bigrams = [word[i:i+2] for i in range(len(word)-1)]
    [bigram_words.extend(bigram_index[bigram]) for bigram in bigrams]
    
    c = Counter(bigram_words)
    bigram_set = {k for k,v in c.items() if v > 1 and len(k)-len(word) < 3 and len(k)-len(word) > -3}
    
    for bigram_word in bigram_set:
        dist = stringdist.rdlevenshtein(word, bigram_word)
        result.append((bigram_word, dist, cnt[bigram_word]))
        
    result.sort(key=operator.itemgetter(1, 2))
    
    return result[:n]

In [76]:
%%time

find_nearest_by_index('призидент')

Wall time: 1.86 s


[('президент', 1, 1453),
 ('президет', 2, 1),
 ('резидент', 2, 9),
 ('президенте', 2, 24),
 ('президенты', 2, 37),
 ('президенту', 2, 109),
 ('президента', 2, 846),
 ('резиденту', 3, 1),
 ('президентам', 3, 1),
 ('президета', 3, 1)]