In [26]:
import pandas as pd
import numpy as np
import re

## Load data:

In [92]:
# Base word list plus three additional vocabularies (organs, symptoms, systems).
# NOTE(review): later cells read dictionary['name'], so every CSV presumably
# exposes a 'name' column — confirm against the data files.
dictionary = pd.read_csv('dictionary.csv')
organs = pd.read_csv('./fmi_symptom_recognition/data/organs.csv')
symptoms = pd.read_csv('./fmi_symptom_recognition/data/symptoms.csv')
systems = pd.read_csv('./fmi_symptom_recognition/data/systems.csv')

# Stack the four frames into a single dictionary; ignore_index=True renumbers
# the rows so the combined frame has one continuous index.
dictionary = pd.concat([dictionary, organs, symptoms, systems], ignore_index=True)

# Unigram table to be cleaned; later cells read its 'Word' and 'EN to BG' columns.
df = pd.read_excel('unigrams.xlsx')

119678
122558


## Filter out single correct words

In [96]:
# Drop every unigram whose lower-cased form is already a dictionary entry;
# what remains are the candidates that still need checking.
lowercase_words = df['Word'].str.lower()
is_known_word = lowercase_words.isin(dictionary['name'])
df_clean_single = df.loc[~is_known_word]
print(df_clean_single)

# Save the merged dictionary (base list + organs + symptoms + systems).
dictionary.to_csv('dictionary_new.csv')

       Unnamed: 0              Word  Count  Correct  Link to correct  \
3               3               б.о  22399    False              NaN   
4               4                RR  22049    False              NaN   
6               6               Cor  16872    False              NaN   
10             10               РСД  12270    False              NaN   
12             12            тонове  11832    False              NaN   
...           ...               ...    ...      ...              ...   
43284       31484  XII.Десностранна      1    False              NaN   
43285       33836               xsl      1    False              NaN   
43286       36530                 z      1    False              NaN   
43287       22400                Za      1    False              NaN   
43288       40010          ZASTOJNI      1    False              NaN   

         BG to EN    EN to BG  
3             b.o         B.O  
4              Rr          RR  
6             COR         Кор  
10     

## Filter out words made of correct parts joined by "." or "-"

In [102]:
# Reverse lookup: dictionary word -> its row label in `dictionary`.
# The checks below only test membership; when a word occurs more than once
# the label of its last occurrence is the one that is kept.
dictionary_dict = {name: row_label for row_label, name in dictionary['name'].items()}

def check_split_words_in_dictionary(unigram, vocabulary=None):
    """Tell whether every part of a dot/hyphen-joined token is a known word.

    The token is converted to ``str``, every ``.`` and ``-`` is replaced by a
    space, and the result is split on whitespace. Each part is looked up as
    written and, failing that, lower-cased, so that this step agrees with the
    single-word filter, which lower-cases a word before comparing it with the
    dictionary.

    Parameters
    ----------
    unigram : object
        Token to check; it is converted with ``str()`` first.
    vocabulary : container, optional
        Anything supporting ``in`` for words. Defaults to the notebook-level
        ``dictionary_dict``.

    Returns
    -------
    bool
        ``True`` when all parts are in the vocabulary, ``False`` otherwise.
        A token that consists only of separators/whitespace has no parts and
        therefore yields ``True`` (same as before this change).
    """
    if vocabulary is None:
        vocabulary = dictionary_dict

    parts = re.sub(r"[\.\-]", ' ', str(unigram)).split()
    return all(
        part in vocabulary or part.lower() in vocabulary
        for part in parts
    )

# Boolean Series: True where every dot/hyphen-separated part of the word is in the dictionary.
split_words = df_clean_single['Word'].apply(check_split_words_in_dictionary)
# Keep only the rows that did NOT pass the check, i.e. words that still look incorrect.
df_clean_joined = df_clean_single[~split_words]
print(df_clean_joined)

       Unnamed: 0              Word  Count  Correct  Link to correct  \
3               3               б.о  22399    False              NaN   
4               4                RR  22049    False              NaN   
6               6               Cor  16872    False              NaN   
10             10               РСД  12270    False              NaN   
12             12            тонове  11832    False              NaN   
...           ...               ...    ...      ...              ...   
43284       31484  XII.Десностранна      1    False              NaN   
43285       33836               xsl      1    False              NaN   
43286       36530                 z      1    False              NaN   
43287       22400                Za      1    False              NaN   
43288       40010          ZASTOJNI      1    False              NaN   

         BG to EN    EN to BG  
3             b.o         B.O  
4              Rr          RR  
6             COR         Кор  
10     

## Mark incorrectly joined words

In [109]:
def check_joined_words_in_dictionary(unigram, vocabulary=None):
    """Suggest a two-word split for a token that looks like two glued words.

    Every split position is tried from left to right; the first one whose
    two halves are both known words wins. Each half is looked up as written
    and, failing that, lower-cased, which matches the single-word filter
    that lower-cases words before comparing them with the dictionary.

    Parameters
    ----------
    unigram : object
        Token to check; it is converted with ``str()`` first.
    vocabulary : container, optional
        Anything supporting ``in`` for words. Defaults to the notebook-level
        ``dictionary_dict``.

    Returns
    -------
    str or None
        ``"<first> <second>"`` with the original casing preserved, or
        ``None`` when the token is not purely alphabetic or no split works.
    """
    if vocabulary is None:
        vocabulary = dictionary_dict

    def is_known(word):
        return word in vocabulary or word.lower() in vocabulary

    unigram = str(unigram)
    # If it is not a single alphabetic word it was already handled by the
    # previous step. fullmatch is required here: re.match only anchors at the
    # start, so a token such as "XII.Десностранна" would slip through.
    if not re.fullmatch(r"[а-яА-Яa-zA-Z]+", unigram):
        # None (not False) so that "no suggestion" is always the same value.
        return None

    for i in range(1, len(unigram)):
        first, second = unigram[:i], unigram[i:]
        if is_known(first) and is_known(second):
            return first + ' ' + second

    return None

# One suggestion (or an empty marker) per remaining word.
joined_words = df_clean_joined['Word'].apply(check_joined_words_in_dictionary)
# assign() returns a new frame with the extra 'split_suggestions' column; df_clean_joined is left untouched.
# NOTE(review): the printed output also shows a 'Split suggestions' column, so the
# input spreadsheet apparently already carries one under a different name — confirm.
df_with_suggestions = df_clean_joined.assign(split_suggestions=joined_words)
print(df_with_suggestions)

       Unnamed: 0              Word  Count  Correct  Link to correct  \
3               3               б.о  22399    False              NaN   
4               4                RR  22049    False              NaN   
6               6               Cor  16872    False              NaN   
10             10               РСД  12270    False              NaN   
12             12            тонове  11832    False              NaN   
...           ...               ...    ...      ...              ...   
43284       31484  XII.Десностранна      1    False              NaN   
43285       33836               xsl      1    False              NaN   
43286       36530                 z      1    False              NaN   
43287       22400                Za      1    False              NaN   
43288       40010          ZASTOJNI      1    False              NaN   

         BG to EN    EN to BG Split suggestions split_suggestions  
3             b.o         B.O              None              None  

## Translated similarity

In [116]:
def get_similarity(row):
    """Score how much of a word survives in its 'EN to BG' counterpart.

    Both values are lower-cased and compared character by character from the
    start; the score is the length of the common prefix divided by the length
    of the longer string.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Word'`` and ``'EN to BG'`` (e.g. a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. ``0.0`` is returned when either value is
        missing or when both strings are empty.
    """
    word = row['Word']
    back_translation = row['EN to BG']

    # A missing cell would otherwise be stringified to "nan" and compared as
    # if it were a real word (two missing cells would even score 1.0).
    if pd.isna(word) or pd.isna(back_translation):
        return 0.0

    original = str(word).lower()
    translated = str(back_translation).lower()

    bigger = max(len(original), len(translated))
    if bigger == 0:
        # Nothing to compare; also avoids dividing by zero.
        return 0.0

    same_count = 0
    for left, right in zip(original, translated):
        if left != right:
            break
        same_count += 1

    return same_count / bigger

# axis=1 hands each row to get_similarity, producing one score per row.
similarity = df_with_suggestions.apply(get_similarity, axis=1)
# Attach the scores as a new 'translated_similarity' column on a new frame.
df_with_similarity = df_with_suggestions.assign(translated_similarity=similarity)
print(df_with_similarity['translated_similarity'])

3        0.00
4        1.00
6        0.00
10       0.00
12       1.00
         ... 
43284    0.25
43285    1.00
43286    1.00
43287    0.00
43288    1.00
Name: translated_similarity, Length: 36283, dtype: float64


## Export to XLSX

In [117]:
# Write the final table (suggestions + similarity scores) to a new workbook.
# NOTE(review): to_excel also writes the index as a leading column by default, which is
# presumably where the 'Unnamed: 0' column seen in the loaded data came from — confirm.
df_with_similarity.to_excel('unigrams_new.xlsx')