### 5. Get unique names from text: find checked names in the similarity dicts and select a unique representation

In [1]:
import pandas as pd
from uk_stemmer import UkStemmer
from nltk import stem

In [2]:
# NLTK Snowball stemmer configured for Russian.
stemmer_ru = stem.snowball.SnowballStemmer("russian") 
# Stemmer from the uk_stemmer package; applied below to rows whose language is 'uk'.
stemmer_ukr = UkStemmer()

In [3]:
# Location of the entities table; the same path is written back at the end of the notebook.
entities_path = '../data/entities_may.csv'
# The first CSV column is used as the DataFrame index.
entities = pd.read_csv(entities_path, index_col=[0])
# Display the columns that were loaded.
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
       'checked_with_conllu', 'checked_names_str'],
      dtype='object')

In [4]:
# Turn the '§'-delimited name strings into Python lists.
entities['filt_kw_names_list'] = entities['filt_kw_names_str'].str.split('§')
entities['checked_names_list'] = entities['checked_names_str'].str.split('§')

In [5]:
def split_name_sets_to_list(sets_str):
    """Parse a serialized collection of name sets into a list of lists.

    Sets are separated by '<@>' and the members of one set by '<+>'.
    Returns None when *sets_str* is a missing value (NaN or None).
    """
    if pd.isna(sets_str):
        return None
    serialized_sets = sets_str.split('<@>')
    return [one_set.split('<+>') for one_set in serialized_sets]

def get_names_by_num(num_sets, names):
    """Replace the index values in every set with the corresponding names.

    Each element of each set in *num_sets* is converted with int() and used
    as a position in *names*. Returns None when *num_sets* is not a list.
    """
    if not isinstance(num_sets, list):
        return None
    return [[names[int(idx)] for idx in index_set] for index_set in num_sets]

In [6]:
# Parse the serialized name sets ('<@>' between sets, '<+>' inside a set) and
# the '<@>'-delimited one-name-per-set strings into list-valued columns.
entities['name_sets_lists'] = entities['string_names_sets_str'].apply(split_name_sets_to_list)
entities['one_name_per_set'] = entities['one_name_per_set_str'].str.split('<@>')

In [7]:
def stem_names(names_list, stemmer):
    """Stem every name in *names_list*.

    Args:
        names_list: list of name strings; words are separated by whitespace.
        stemmer: callable that takes one word and returns its stem.

    Returns:
        A list with one stemmed name per input name, in the same order,
        or None when *names_list* is not a list.
    """
    if not isinstance(names_list, list):
        return None
    # Delegate to stem_name so both functions share a single stemming rule.
    return [stem_name(name, stemmer) for name in names_list]


def stem_name(name, stemmer):
    """Stem each whitespace-separated word of *name* and re-join with spaces.

    Args:
        name: name string, possibly consisting of several words.
        stemmer: callable that takes one word and returns its stem.

    Returns:
        The stemmed words joined by single spaces. A word that starts with
        an uppercase letter yields a capitalized stem.
    """
    stemmed_words = []
    for word in name.split():
        stemmed = stemmer(word)
        # Keep the leading capital of the original word; note that
        # str.capitalize() also lowercases the rest of the stem.
        if word[0].isupper():
            stemmed = stemmed.capitalize()
        stemmed_words.append(stemmed)
    return ' '.join(stemmed_words)

In [8]:
def find_names_in_dicts(name_ent_list, name_forms, one_name_list, stemmer):
    """Map names to the representative name of the set they belong to.

    Args:
        name_ent_list: list of names to look up.
        name_forms: list of name sets; each set is a collection of forms.
        one_name_list: representative names; one_name_list[i] is used for
            a name found in name_forms[i].
        stemmer: callable that stems one word; only used when
            *name_ent_list* holds exactly one name.

    Returns:
        None when *name_ent_list* is not a list. For a single name, a
        one-element list with that name stemmed via stem_name (no lookup is
        done). Otherwise the list of representatives for every
        (name, matching set) pair, possibly empty and possibly with
        duplicates.
    """
    if not isinstance(name_ent_list, list):
        return None
    if len(name_ent_list) == 1:
        return [stem_name(name_ent_list[0], stemmer)]

    res = []
    for name in name_ent_list:
        try:
            for i, forms in enumerate(name_forms):
                if name in forms:
                    res.append(one_name_list[i])
        except (TypeError, IndexError):
            # Best effort: name_forms / one_name_list may be missing (not
            # iterable / not indexable) or of mismatched length. Report the
            # row and carry on instead of hiding every exception, which a
            # bare ``except:`` would do (including KeyboardInterrupt).
            print(name_ent_list, name_forms, one_name_list)
    return res
#         stemmed_name = stem_str(name, stemmer)


def find_checked_names_in_dicts(name_ent_list, checked_name_ent_list, name_forms, one_name_list, stemmer):
    """Map checked names to the representative name of their set.

    Only names that also occur in *checked_name_ent_list* are considered.

    Args:
        name_ent_list: list of candidate names.
        checked_name_ent_list: list of names that passed the check.
        name_forms: list of name sets; each set is a collection of forms.
        one_name_list: representative names; one_name_list[i] is used for
            a name found in name_forms[i].
        stemmer: callable that stems one word; only used when
            *name_ent_list* holds exactly one name and that name is checked.

    Returns:
        None when either name list is not a list or when nothing was found.
        For a single checked name, a one-element list with that name stemmed
        via stem_name (no lookup is done). Otherwise the non-empty list of
        representatives for every (checked name, matching set) pair,
        possibly with duplicates.
    """
    if not (isinstance(checked_name_ent_list, list) and isinstance(name_ent_list, list)):
        return None
    if len(name_ent_list) == 1 and name_ent_list[0] in checked_name_ent_list:
        return [stem_name(name_ent_list[0], stemmer)]

    res = []
    for name in name_ent_list:
        if name not in checked_name_ent_list:
            continue
        try:
            for i, forms in enumerate(name_forms):
                if name in forms:
                    res.append(one_name_list[i])
        except (TypeError, IndexError):
            # Best effort: name_forms / one_name_list may be missing (not
            # iterable / not indexable) or of mismatched length. Report the
            # row and carry on instead of hiding every exception, which a
            # bare ``except:`` would do (including KeyboardInterrupt).
            print(name_ent_list, name_forms, one_name_list)

    # An empty result is reported as None rather than [].
    return res if res else None
#         stemmed_name = stem_str(name, stemmer)
        

In [9]:
# For every row, look up the checked keyword names in the row's name sets.
# Rows whose language is 'uk' use the Ukrainian stemmer; all other rows use
# the Russian one.
entities['kw_names_norm_form'] = entities.apply(
    lambda row: find_checked_names_in_dicts(
        row.filt_kw_names_list,
        row.checked_names_list,
        row.name_sets_lists,
        row.one_name_per_set,
        stemmer_ukr.stem_word if row.language == 'uk' else stemmer_ru.stem,
    ),
    axis=1,
)

In [10]:
# Drop repeated names while keeping first-seen order (dict keys preserve
# insertion order); anything that is not a list becomes None.
entities['kw_names_unique'] = entities['kw_names_norm_form'].apply(
    lambda names: None if not isinstance(names, list) else list(dict.fromkeys(names))
)

In [11]:
# Serialize the unique-name lists back into '§'-delimited strings.
entities['kw_names_unique_str'] = entities['kw_names_unique'].str.join('§')

In [12]:
# Display the columns again, now including the helper and result columns added above.
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
       'checked_with_conllu', 'checked_names_str', 'filt_kw_names_list',
       'checked_names_list', 'name_sets_lists', 'one_name_per_set',
       'kw_names_norm_form', 'kw_names_unique', 'kw_names_unique_str'],
      dtype='object')

In [13]:
# Write the original columns plus the new 'kw_names_unique_str' column back to
# entities_path (the file the table was loaded from); the intermediate
# list-valued helper columns are not exported.
entities.to_csv(
    entities_path,
    columns=[
        'link',
        'language',
        'all_ent_str',
        'name_and_kw',
        'kw_and_ent',
        'names_and_kw_str',
        'names_str',
        'names_sets_str',
        'one_name_per_set_str',
        'string_names_sets_str',
        'filt_kw_names_str',
        'checked_with_conllu',
        'checked_names_str',
        'kw_names_unique_str',
    ],
)