### 4. Filter out names that failed CoNLL-U checking

In [1]:
import pandas as pd
import re
from uk_stemmer import UkStemmer
from nltk import stem

In [2]:
# Stemmers used further down: NLTK's Snowball stemmer configured for
# "russian", and UkStemmer, which is applied to rows whose language is 'uk'.
stemmer_ru = stem.snowball.SnowballStemmer("russian") 
stemmer_ukr = UkStemmer()

In [3]:
# Load the CoNLL-U check results; the first CSV column becomes the index.
checked_filepath = 'checked_with_conllu_may.csv'
checked_all = pd.read_csv(checked_filepath, index_col=[0])
# checked_all = checked_all.set_index('link')
# Last expression of the cell: display the columns that were loaded.
checked_all.columns

Index(['checked_with_conllu'], dtype='object')

In [7]:
# Load the entities table; the first CSV column becomes the index.
# NOTE: the same path is written back to later in this notebook.
entities_filepath = '../data/entities_may.csv'
entities = pd.read_csv(entities_filepath, index_col=[0])
# Last expression of the cell: display the columns that were loaded.
entities.columns

In [9]:
# Attach the check results to the entities table. Assigning a Series to a
# column aligns on index labels, so rows absent from checked_all become NaN.
entities['checked_with_conllu'] = checked_all['checked_with_conllu']

In [13]:
def _parse_checked_flags(checked_str):
    """Turn a stringified list such as "['True', 'False']" into a list of str.

    Missing values (None/NaN) are returned as None.
    """
    if not pd.notna(checked_str):
        return None
    # Drop the surrounding double quotes / brackets, then the per-item quotes.
    inner = checked_str.strip('"[]')
    return [flag.strip("'") for flag in inner.split(', ')]


entities['checked_list'] = entities['checked_with_conllu'].apply(_parse_checked_flags)

In [14]:
def names_and_kw_str_to_list(names_and_kw_str):
    """Parse a serialized keyword/entity string into a list of PERS tuples.

    The input is split on ``<+>`` into entries and each entry on ``<#>`` into
    parts. Inside the third part every occurrence of
    ``(start, end)<§§>PERS<§§>name<§§>score`` produces one result tuple.

    Args:
        names_and_kw_str: The serialized string, or a missing value
            (None/NaN).

    Returns:
        A list of ``(int(parts[0]), parts[1], [start, end], name, score)``
        tuples, where ``start``/``end`` are ints and ``score`` is a float, or
        None when the input is missing.

    Raises:
        IndexError: If an entry has fewer than three ``<#>``-separated parts.
    """
    if not pd.notna(names_and_kw_str):
        return None

    # Raw string: the pattern contains backslash escapes (\(, \d, \., \,) that
    # are invalid in a normal string literal and trigger a SyntaxWarning on
    # recent Python versions. Compiled once, outside the loop.
    person_pattern = re.compile(r'(\(\d+\, \d+\))<§§>PERS<§§>(.*?)<§§>([\d\.]+)')

    res = []
    for ent in names_and_kw_str.split('<+>'):
        parts = ent.split('<#>')
        for span, name, score in person_pattern.findall(parts[2]):
            bounds = [int(i) for i in span.strip('()').split(', ')]
            res.append((int(parts[0]), parts[1], bounds, name, float(score)))
    return res

In [15]:
# Parse the serialized PERS entities, and split the '§'-separated filtered
# keyword names into lists.
entities['keyword_entities'] = entities['kw_and_ent'].apply(names_and_kw_str_to_list)
entities['filt_kw_names'] = entities['filt_kw_names_str'].str.split('§')

In [16]:
def conllu_filter(entitites_list, filt_names_list, checked_list):
    """Keep the entity names that are filtered keyword names AND passed the check.

    Entities and check flags are paired by position. An entity is kept when
    its name (element 3 of the entity tuple) is in ``filt_names_list`` and
    its flag is the string ``'True'``.

    Args:
        entitites_list: List of entity tuples (name at index 3), or a missing
            value.
        filt_names_list: List of names allowed through, or a missing value.
        checked_list: List of ``'True'``/``'False'`` strings, one per entity,
            or a missing value.

    Returns:
        A non-empty list of kept names, or None when any argument is not a
        list or when nothing is kept.
    """
    # checked_list is None for rows without check results; without this guard
    # indexing it would raise TypeError.
    if not (isinstance(entitites_list, list)
            and isinstance(filt_names_list, list)
            and isinstance(checked_list, list)):
        return None

    # zip() stops at the shorter list, so an entity without a matching flag is
    # treated as unchecked instead of raising IndexError.
    res = [
        entity[3]
        for entity, checked in zip(entitites_list, checked_list)
        if entity[3] in filt_names_list and checked == 'True'
    ]
    return res or None

In [17]:
def _row_checked_names(row):
    """Apply conllu_filter to the relevant columns of one row."""
    return conllu_filter(row['keyword_entities'], row['filt_kw_names'], row['checked_list'])


entities['checked_names_list'] = entities.apply(_row_checked_names, axis=1)

In [19]:
# Serialize the list of checked names into one '§'-separated string.
entities['checked_names_str'] = entities['checked_names_list'].str.join('§')

In [27]:
# Columns persisted to disk; the derived list-valued columns are left out.
columns_to_save = [
    'link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
    'names_and_kw_str', 'names_str', 'names_sets_str',
    'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
    'checked_with_conllu', 'checked_names_str',
]
# NOTE: this writes to entities_filepath, the same file the table was read from.
entities[columns_to_save].to_csv(entities_filepath)

In [20]:
def stem_name(name, stemmer):
    """Stem each whitespace-separated word of ``name``.

    A word whose first character is uppercase has its stem passed through
    ``str.capitalize()`` (first letter upper, the rest lower); other stems
    are kept exactly as the stemmer returned them.

    Args:
        name: The name to stem.
        stemmer: Callable taking one word and returning its stem.

    Returns:
        The stemmed words joined with single spaces.
    """
    stemmed_words = []
    # str.split() never yields empty strings, so word[0] is always valid.
    for word in name.split():
        stemmed = stemmer(word)
        if word[0].isupper():
            stemmed = stemmed.capitalize()
        stemmed_words.append(stemmed)
    return ' '.join(stemmed_words)


def stem_names(names_list, stemmer):
    """Apply :func:`stem_name` to every name in ``names_list``.

    Args:
        names_list: List of names, or a missing value.
        stemmer: Callable taking one word and returning its stem.

    Returns:
        A list of stemmed names, or None when ``names_list`` is not a list.
    """
    if not isinstance(names_list, list):
        return None
    return [stem_name(name, stemmer) for name in names_list]

In [21]:
def _stem_checked_names(row):
    """Stem a row's checked names with the stemmer matching its language."""
    stemmer = stemmer_ukr.stem_word if row['language'] == 'uk' else stemmer_ru.stem
    return stem_names(row['checked_names_list'], stemmer)


entities['checked_stem_names_list'] = entities.apply(_stem_checked_names, axis=1)



In [22]:
# De-duplicate the stemmed names per row. dict.fromkeys keeps first-seen
# order, so the result is reproducible between runs; list(set(...)) is not,
# because string hashing is randomized per process.
entities['checked_names_unique'] = entities['checked_stem_names_list'].apply(
    lambda x: list(dict.fromkeys(x)) if isinstance(x, list) else None)

#### Зайве (extra / no longer needed)

In [148]:
def get_norm_form_for_checked_names(checked_names_list, one_name_per_set, name_sets, filt_kw_names, stemmer):
    """Map every checked name to its normalized form(s).

    For each checked name:
      * if ``one_name_per_set`` is a list, append ``one_name_per_set[i]`` for
        every index ``i`` whose ``name_sets[i]`` contains the name;
      * otherwise, if ``filt_kw_names`` has exactly one element, append that
        element stemmed with ``stem_name``;
      * otherwise print the inputs and append nothing.

    Returns:
        The collected list, or None when ``checked_names_list`` is not a list.
    """
    if not isinstance(checked_names_list, list):
        return None

    has_representatives = isinstance(one_name_per_set, list)
    norm_forms = []
    for checked_name in checked_names_list:
        if has_representatives:
            norm_forms.extend(
                representative
                for idx, representative in enumerate(one_name_per_set)
                if checked_name in name_sets[idx]
            )
        elif len(filt_kw_names) == 1:
            norm_forms.append(stem_name(filt_kw_names[0], stemmer))
        else:
            print(checked_names_list, filt_kw_names)

    return norm_forms

In [150]:
def _row_norm_forms(row):
    """Compute normalized forms for one row with the stemmer for its language."""
    stemmer = stemmer_ukr.stem_word if row['language'] == 'uk' else stemmer_ru.stem
    return get_norm_form_for_checked_names(
        row['checked_names_list'],
        row['one_name_per_set'],
        row['name_sets'],
        row['filt_kw_names'],
        stemmer,
    )


news['checked_names_norm_form'] = news.apply(_row_norm_forms, axis=1)

In [153]:
# De-duplicate the normalized forms per row. dict.fromkeys keeps first-seen
# order, so the result is reproducible between runs; list(set(...)) is not,
# because string hashing is randomized per process.
news['checked_norm_forms_unique'] = news['checked_names_norm_form'].apply(
    lambda x: list(dict.fromkeys(x)) if isinstance(x, list) else None)