In [11]:
import pandas as pd
import numpy as np
import re

## Load data:

In [12]:
dictionary = pd.read_csv('dictionary_2.csv')
complaints = pd.read_csv('./fmi_symptom_recognition/data/complaints.csv')
organs = pd.read_csv('./fmi_symptom_recognition/data/organs.csv')
symptoms = pd.read_csv('./fmi_symptom_recognition/data/symptoms.csv')
systems = pd.read_csv('./fmi_symptom_recognition/data/systems.csv')

dictionary = pd.concat([dictionary, complaints, organs, symptoms, systems], ignore_index=True)

df = pd.read_excel('unigrams.xlsx')

## Mark by single correct word

In [13]:
correct_single = df['Word'].str.lower().isin(dictionary['name'])
df['Correct auto'] = correct_single
print(len(df[df['Correct auto'] == True]))

11653


## Clean by joined correct words

In [14]:
dictionary_dict = dictionary['name'].to_dict()
dictionary_dict = {v: k for k, v in dictionary_dict.items()}

def check_split_words_in_dictionary(unigram):
    words = re.sub(r"[\.\-]", ' ', str(unigram)).split()
    for word in words:
        if word not in dictionary_dict:
            return False
    return True

split_words = df['Word'].apply(check_split_words_in_dictionary)
df['Correct auto'] += split_words
print(len(df[df['Correct auto'] == True]))

13456


## Mark incorrectly joined words

In [15]:
def check_joined_words_in_dictionary(unigram):
    unigram = str(unigram)
    # if not single word, it was already checked at previous step
    if not re.match(r"[а-яА-Яa-zA-Z]+", unigram):
        return False

    for i in range(1, len(unigram)):
        word1 = unigram[0:i]
        word2 = unigram[i:len(unigram)]
        if word1 in dictionary_dict and word2 in dictionary_dict:
            return word1 + ' ' + word2

    return None

joined_words = df['Word'].apply(check_joined_words_in_dictionary)
df = df.assign(split_suggestions=joined_words)
print(df)

       Unnamed: 0              Word  Count  Correct  Link to correct  \
0               0                 и  31679    False              NaN   
1               1                на  27252    False              NaN   
2               2               без  27204    False              NaN   
3               3               б.о  22399    False              NaN   
4               4                RR  22049    False              NaN   
...           ...               ...    ...      ...              ...   
43284       31484  XII.Десностранна      1    False              NaN   
43285       33836               xsl      1    False              NaN   
43286       36530                 z      1    False              NaN   
43287       22400                Za      1    False              NaN   
43288       40010          ZASTOJNI      1    False              NaN   

         BG to EN    EN to BG  Correct auto split_suggestions  
0             and           и          True              None  
1      

## Translated similarity

In [16]:
def get_similarity(row):
    if row['Correct auto']:
        return None
    
    original = str(row['Word']).lower()
    translated = str(row['EN to BG']).lower()

    i = 0
    same_count = 0
    while i < len(original) and i < len(translated):
        if original[i] == translated[i]:
            same_count += 1
            i += 1
        else:
            break
    
    bigger = max(len(original), len(translated))
    return same_count/bigger

similarity = df.apply(get_similarity, axis=1)
df = df.assign(translated_similarity=similarity)
df['Correct auto similarity'] = df['Correct auto'] + (df['translated_similarity'] >= 0.8)
print(len(df[df['Correct auto similarity'] == True]))

17308


## Mark certainly incorrect

In [17]:
from nltk.metrics.distance import jaccard_distance

dictionary_items_list = dictionary['name'].to_dict().values()
dictionary_bucket = {}
for item in dictionary_items_list:
    str_item = str(item)
    if len(str_item) < 3: continue
    first_letter = str_item[0]
    if first_letter not in dictionary_bucket:
        dictionary_bucket[first_letter] = {}
    second_letter = str_item[1]
    if second_letter not in dictionary_bucket[first_letter]:
        dictionary_bucket[first_letter][second_letter] = []
    dictionary_bucket[first_letter][second_letter].append(item)

def get_incorrect(row):
    unigram = str(row['Word'])
    if len(unigram) < 3 or row['Correct auto similarity'] or row['Correct']:
        return False

    first_letter = unigram[0].lower()
    second_letter = unigram[1].lower()
    if first_letter not in dictionary_bucket:
        return False
    if second_letter not in dictionary_bucket[first_letter]:
        return False
    
    for word in dictionary_bucket[first_letter][second_letter]:
        distance = jaccard_distance(set(unigram), set(word))
        if distance < 0.1:
            return True

    return False

incorrect = df.apply(get_incorrect, axis=1)

In [18]:
df['Incorrect auto'] = incorrect
print(len(df[df['Incorrect auto'] == True]))

3318


## Extract to XLS

In [19]:
df.to_excel('unigrams_new.xlsx')