# Data Preprocessing

### uploading files

In [612]:
# files uploading

from google.colab import files
# Bring the input files into the Colab session via google.colab's upload
# helper; later cells open words_A1.txt, words_A2.txt and words_B1.txt by name.
uploaded = files.upload()

In [599]:
def save_to_list(file_txt: str):
  """Read a UTF-8 text file and return its lines as a list.

  Line endings are not stripped from the returned strings.
  """
  with open(file_txt, 'r', encoding='utf-8') as source:
    lines = list(source)
  return lines

In [765]:
# Load the three vocabulary lists, one file per level.
words_A1, words_A2, words_B1 = map(
    save_to_list,
    ('words_A1.txt', 'words_A2.txt', 'words_B1.txt'),
)

In [766]:
# Preview the first ten raw entries.
words_A1[:10]

['а\n',
 'а́вгуст\n',
 'авто́бус\n',
 'а́втор\n',
 'а́дрес\n',
 'акти́вный\n',
 'англи́йский\n',
 'англича́нин\n',
 'а́нгло-русский\n',
 'апре́ль\n']

In [767]:
def delete_new_line(words: list):
  """Return a new list with str.strip() applied to every entry.

  This removes the trailing newline of each line, along with any other
  leading or trailing whitespace.
  """
  return [entry.strip() for entry in words]

In [768]:
# Strip the line endings from all three lists.
words_A1, words_A2, words_B1 = map(
    delete_new_line,
    (words_A1, words_A2, words_B1),
)

In [769]:
# Preview the same ten entries after stripping.
words_A1[:10]

['а',
 'а́вгуст',
 'авто́бус',
 'а́втор',
 'а́дрес',
 'акти́вный',
 'англи́йский',
 'англича́нин',
 'а́нгло-русский',
 'апре́ль']

### words preprocessing

*   replace stressed letters
*   remove words with punctuation marks
*   remove collocations
*   remove words with numbers, e.g. "мир 1": the number means that the word has several meanings, and for our task it is better to use unambiguous words
*   remove words with brackets (e.g. "извини(те)")

In [770]:
import re

In [771]:
# remove collocations + remove words with numbers

def remove_collocations(word: str):
  """Keep single-token entries and reject anything containing a space.

  Returns the word unchanged when it has no ' ' character, and None
  otherwise (multi-word collocations, or numbered entries like "мир 1").
  """
  return None if ' ' in word else word

In [772]:
# replace stressed letters
# stressed letter in each word is a unicode symbol "\u0301"

def replace_stressed_letters(word: str):
    """Remove stress marks from a word.

    The stress is encoded as the combining acute accent U+0301, so deleting
    that code point leaves the plain spelling. A literal str.replace is
    enough for a fixed single character; no regex is needed.

    Returns the word without any U+0301 characters.
    """
    return word.replace('\u0301', '')

In [773]:
# remove words with punctuation marks + remove words with brackets

def remove_punctuation(word: str):
  """Reject entries that contain anything but word characters, whitespace or '-'.

  Returns the word unchanged when every character is allowed, and None
  otherwise; this filters out punctuation and bracketed forms such as
  "извини(те)".
  """
  has_forbidden_char = re.search(r'[^\s\w\d-]', word) is not None
  return None if has_forbidden_char else word

In [774]:
def cleaning(words_list):
  """Normalize a word list and drop the entries unsuitable for the task.

  Each word first has its stress marks removed, then passes through
  remove_punctuation and remove_collocations. Both filters return None for
  a rejected word, and rejected words are skipped.

  Returns a new list with the surviving words in their original order.
  """
  cleaned_words = []
  for word in words_list:
    word = remove_punctuation(replace_stressed_letters(word))
    # isinstance rather than comparing type() objects; a rejected word
    # arrives here as None and must not reach the next filter.
    if not isinstance(word, str):
      continue
    word = remove_collocations(word)
    if isinstance(word, str):
      cleaned_words.append(word)
  return cleaned_words

In [903]:
# Clean every level's list and keep only the unique words.
words_A1_cleaned, words_A2_cleaned, words_B1_cleaned = (
    set(cleaning(level_words))
    for level_words in (words_A1, words_A2, words_B1)
)

In [904]:
# Size of the A1 list before and after cleaning and deduplication.
print(len(words_A1), len(words_A1_cleaned), sep='\n')

719
706


### morphological analysis

In [None]:
!pip install pymorphy2

In [779]:
import pymorphy2
# from pymystem3 import Mystem

In [784]:
# mystem = Mystem()
# Single pymorphy2 analyzer instance; get_pos calls its parse() for every word.
morph_analyzer = pymorphy2.MorphAnalyzer()

In [917]:
# One entry per cleaned B1 word; 'pos' and 'levels' start out empty and are
# filled in by the loop in a later cell.
lmentry_words_ru = {key: {'pos': [], 'levels': []} for key in words_B1_cleaned}

In [918]:
# get parts of speech

def get_pos(word: str):
  """Collect the part-of-speech tags of all parses of a word.

  Every parse returned by morph_analyzer.parse contributes its tag.POS;
  duplicates collapse because the result is a set.
  """
  # NOTE(review): tag.POS may be None for some parses; confirm whether None
  # should be kept in the result.
  return {analysis.tag.POS for analysis in morph_analyzer.parse(word)}

In [919]:
# get levels

def get_level(word: str):
  """Return the level labels for a word as a list that always ends with 'B1'.

  'A1' is put first when the word is in words_A1_cleaned; otherwise 'A2' is
  put first when the word is in words_A2_cleaned.
  """
  # NOTE(review): a word present in both the A1 and A2 sets is labelled
  # ['A1', 'B1'] with no 'A2'. Confirm this is intended.
  if word in words_A1_cleaned:
    lower_levels = ['A1']
  elif word in words_A2_cleaned:
    lower_levels = ['A2']
  else:
    lower_levels = []
  return lower_levels + ['B1']

In [920]:
# Fill in the parts of speech and the levels for every cleaned B1 word.
for word in words_B1_cleaned:
  pos, levels = get_pos(word), get_level(word)

  entry = lmentry_words_ru[word]
  entry['pos'].extend(pos)
  entry['levels'].extend(levels)

In [None]:
# Display the assembled dictionary.
lmentry_words_ru

### export to json

In [922]:
import json

In [925]:
# Write the dictionary as pretty-printed JSON. ensure_ascii=False emits the
# Cyrillic words as-is, so the file is opened explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open("lmentry_words_ru.json", "w", encoding="utf-8") as outfile:
    json.dump(lmentry_words_ru, outfile, indent=4, ensure_ascii=False)