In [1]:
import pandas as pd

In [2]:
words_path = '/media/alex/nvme-a/vosk-model-ru-0.10/graph/words.txt'

In [3]:
words = pd.read_csv(words_path, sep=" ")
words.columns = ['line', 'id']
words[words.line.str.contains('шолохово')]

Unnamed: 0,line,id


### Форматирование и очистка примеров

In [4]:
def word_data(path):
    
    with open(path, 'rt', encoding="utf8", errors='ignore') as file:
        words = file.read()
    words = words.lower()
    
    correct = '\nйцукенгшщзхъфывапролджэячсмитьбю '
    used = ''.join(list(set(words)))
    for u in used:
        if u not in correct:
            words = words.replace(u, ' ')
    while '  ' in words:
        words = words.replace('  ', ' ')
    words = words.replace(' ', '\n')
    
    with open(path.replace('.csv','_b.csv'), 'w') as file:
        file.write(words)
        
    words = pd.read_csv(path.replace('.csv','_b.csv'), header=None)
    words.columns = ['word']    
    words.drop_duplicates(inplace=True)
    words = words.dropna()
    
    return words

In [7]:
def get_missing(words_path):
    df = pd.DataFrame(columns = ['word'])
    for path in words_path:
        df = pd.concat([df, word_data(path)], axis = 0)
    df.drop_duplicates(inplace=True)
    df = df.dropna()
    df = pd.DataFrame(df[df.word.str.len()>3])
    df.to_csv('csv/all_words.csv', index = False)
    missing_words = df[df.word.isin(words.line)==False]
    missing_words.to_csv('csv/missing_words.csv', index = False)
    print('known words:', len(words))
    print('external words:', len(df))
    print('unknown words:', len(missing_words))    
    return missing_words

### Отсутсвующие слова

In [8]:
# Улицы
words_path = ['csv/streets_msk.csv', 'csv/streets_spb.csv', 'csv/streets_reg.csv']
missing_words = get_missing(words_path)

known words: 709156
external words: 5011
unknown words: 1710


In [62]:
# Неисправности
words_path = ['csv/neispr_msk.csv', 'csv/neispr_spb.csv', 'csv/neispr_reg.csv']
missing_words = get_missing(words_path)

слов в модели: 709156
слов в 1С: 1016
слов, отсутствующих в модели: 40


In [64]:
# Бренды
words_path = ['csv/brands_msk.csv', 'csv/brands_spb.csv', 'csv/brands_reg.csv']
missing_words = get_missing(words_path)

слов в модели: 709156
слов в 1С: 81
слов, отсутствующих в модели: 24


In [17]:
# Объединение
words_path = [
    'csv/streets_msk.csv',
    'csv/streets_spb.csv',
    'csv/streets_reg.csv',
    'csv/neispr_msk.csv',
    'csv/neispr_spb.csv',
    'csv/neispr_reg.csv',
    'csv/brands_msk.csv',
    'csv/brands_spb.csv',
    'csv/brands_reg.csv'
]
missing_words = get_missing(words_path)

known words: 709156
external words: 6076
unknown words: 1773


### Склонение

In [19]:
# https://pymorphy2.readthedocs.io/en/0.2/user/index.html
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [52]:
missing_words = pd.read_csv('csv/missing_words.csv')
lexemed_words = []
for ids, missing in missing_words.iterrows():
    original = morph.parse(missing.word)[0]
    for lex in original.lexeme:
        lexemed_words.append(lex.word)
lexemed = pd.DataFrame(sorted(set(lexemed_words)))
lexemed.columns = ['word']
lexemed.to_csv('csv/lexemed.csv', index = False)

In [56]:
# оставим только отсутствующие в модели слова
lexemed_missing = get_missing(['csv/lexemed.csv'])
lexemed_missing.to_csv('csv/lexemed_missing.csv', index = False)

known words: 709156
external words: 21116
unknown words: 19741
