### 2. Filter name entities and join them by similarity

In [2]:
import pandas as pd
import re
from fuzzywuzzy import fuzz
from uk_stemmer import UkStemmer
from nltk import stem
import pymorphy2
from collections import Counter

In [3]:
# One stemmer per language; the cells below call them as `stemmer_ru.stem`
# and `stemmer_ukr.stem_word`.
stemmer_ru = stem.snowball.SnowballStemmer("russian") 
stemmer_ukr = UkStemmer()

In [4]:
# Load the Ukrainian and Russian name dictionaries (one entry per line) and
# combine them into a single regex alternation string.
names_uk_path = '../dicts/names_ukr.txt'
names_ru_path = '../dicts/names_ru.txt'
# NOTE(review): the dictionaries are assumed to be UTF-8 encoded - confirm.
# An explicit encoding keeps the Cyrillic text independent of the platform default.
with open(names_uk_path, encoding='utf-8') as f:
    names_uk = [name.strip() for name in f]

with open(names_ru_path, encoding='utf-8') as f:
    names_ru = [name.strip() for name in f]

# Blank lines are dropped: an empty alternative would match every string.
# Sorting longest-first (then alphabetically) makes the alternation order
# deterministic and lets the longest dictionary entry win at a given position.
names = sorted(
    {name for name in names_uk + names_ru if name},
    key=lambda name: (-len(name), name),
)
names_string = '|'.join(names)

In [5]:
# Alternation of all dictionary names, compiled once for reuse.
names_string_pat = re.compile(names_string)
# Whole string must be two tokens separated by a single space, each token
# starting with a capital letter from the class [А-ЯЇҐЄІ].
two_cap_pat = re.compile(r'^[А-ЯЇҐЄІ]\S* [А-ЯЇҐЄІ]\S*$')

In [7]:
# Load the entities table; the first CSV column becomes the DataFrame index.
filepath = '../data/entities_may.csv'
entities = pd.read_csv(filepath, index_col=[0])
# Notebook display of the available columns.
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str'],
      dtype='object')

#### Get PERS entities from all entities and filter them

In [8]:
def get_pers_ent_from_all(all_ents_str):
    """Extract PERS entities and their scores from a serialized entity string.

    Entities are recognised by the pattern ``<§§>PERS<§§>text<§§>score``.

    Args:
        all_ents_str: serialized entities, or a missing value (NaN/None).

    Returns:
        A list of ``(text, score)`` tuples with ``score`` as float, or None
        when the input is missing or contains no PERS entity.
    """
    if not pd.notna(all_ents_str):
        return None
    found = re.findall(r'<§§>PERS<§§>(.*?)<§§>([\d\.]+)', all_ents_str)
    parsed = [(text, float(score)) for text, score in found]
    return parsed or None


def extract_pers_ent_str(news_entities):
    """Collect PERS entities, each prefixed with its sentence number.

    Sentences are separated by ``++`` and entities inside a sentence by
    ``;;``. Every entity containing ``::PERS::`` is kept as
    ``<sentence index>§<entity>``.

    Args:
        news_entities: serialized entities, or a missing value (NaN/None).

    Returns:
        The kept entities joined with ``%%`` (an empty string when there are
        none), or None when the input is missing.
    """
    if not pd.notna(news_entities):
        return None
    tagged = [
        '§'.join([str(sent_no), chunk])
        for sent_no, sentence in enumerate(news_entities.split('++'))
        for chunk in sentence.split(';;')
        if '::PERS::' in chunk
    ]
    return '%%'.join(tagged)


def list_of_tuples_to_str(tuples):
    """Serialize a list of 4-item entity tuples into one string.

    The first and fourth items are converted with ``str``; the second and
    third are used as-is. Items are joined with ``§`` and tuples with ``±±``.

    Returns:
        The serialized string, or None when ``tuples`` is not a list.
    """
    if not isinstance(tuples, list):
        return None
    return '±±'.join(
        '§'.join([str(item[0]), item[1], item[2], str(item[3])])
        for item in tuples
    )


def get_kw_name_tuples(kw_names_str):
    """Parse a serialized keyword/name string into a list of tuples.

    Records are separated by ``@+@`` and the fields of a record by ``<+>``.
    The first field is parsed as int and the fourth as float.

    Returns:
        A list of ``(int, str, str, float)`` tuples, or None when the input
        is missing (NaN/None).
    """
    if not pd.notna(kw_names_str):
        return None

    def parse(record):
        fields = record.split('<+>')
        return int(fields[0]), fields[1], fields[2], float(fields[3])

    return [parse(record) for record in kw_names_str.split('@+@')]


def extract_pers_ent_tuple(news_entities):
    """Parse PERS entities into ``(sentence, prefix, name, score)`` tuples.

    Sentences are separated by ``++`` and entities by ``;;``. An entity is
    split at ``::PERS::``; the part after it is split at its last ``::`` into
    name and score. Names of length 1 and names that are entirely upper-case
    or entirely lower-case are skipped.

    Returns:
        A non-empty list of tuples, or None when the input is missing or
        nothing passes the filter.
    """
    if not pd.notna(news_entities):
        return None
    found = []
    for sent_no, sentence in enumerate(news_entities.split('++')):
        for chunk in sentence.split(';;'):
            if '::PERS::' not in chunk:
                continue
            prefix, tail = chunk.split('::PERS::')
            name, score = tail.rsplit('::', maxsplit=1)
            mixed_case = name != name.upper() and name != name.lower()
            if len(name) > 1 and mixed_case:
                found.append((sent_no, prefix, name, float(score)))
    return found if found else None


def filter_by_score(pers_tuples, ind1=2, ind2=3, iteration=1, min_score=0.5):
    """Drop low-score entities that do not look like a personal name.

    Args:
        pers_tuples: list of entity tuples; anything else yields None.
        ind1: position of the entity text inside each tuple.
        ind2: position of the score inside each tuple.
        iteration: 1 drops an entity when it contains no dictionary name
            (``names_string``), has no two-capital pattern and scores below
            ``min_score``; 2 ignores the dictionary check. Any other value
            drops nothing.
        min_score: score below which an entity may be dropped.

    Returns:
        The surviving tuples in their original order, or None when the input
        is not a list or nothing survives.
    """
    if not isinstance(pers_tuples, list):
        return None

    def should_drop(entry):
        text = entry[ind1]
        has_two_capitals = re.search(r'[А-ЯЇҐЄІ].* .*[А-ЯЇҐЄІ]', text)
        if iteration == 1:
            in_dictionary = re.search(names_string, text)
            return (not in_dictionary) and (not has_two_capitals) and entry[ind2] < min_score
        if iteration == 2:
            return (not has_two_capitals) and entry[ind2] < min_score
        return False

    kept = [entry for entry in pers_tuples if not should_drop(entry)]
    return kept if kept else None


def tuples_to_list(pers_tuples, ind=2):
    """Pick the item at position ``ind`` from every tuple.

    Returns:
        A list of the picked items, or None when ``pers_tuples`` is not a list.
    """
    if not isinstance(pers_tuples, list):
        return None
    picked = []
    for entry in pers_tuples:
        picked.append(entry[ind])
    return picked

#### Get ALL name entities from the entities string and filter them

In [11]:
# Parse (text, score) PERS tuples from the serialized entities, then run the
# first filtering pass; ind1=0 / ind2=1 because these tuples are (text, score).
entities['pers_tuples'] = entities.all_ent_str.apply(get_pers_ent_from_all)
entities['filt_pers_tuples'] = entities['pers_tuples'].apply(filter_by_score, ind1=0, ind2=1)

In [14]:
# Second filtering pass (iteration=2 skips the dictionary-name check), then
# keep only the entity texts.
entities['filt_pers_tuples'] = entities['filt_pers_tuples'].apply(filter_by_score, ind1=0, ind2=1, iteration=2)
entities['filt_pers_list'] = entities['filt_pers_tuples'].apply(tuples_to_list, ind=0)

#### Calculate similarity ratios for stemmed names in news

In [15]:
def stem_names(names_list, stemmer):
    """Stem every word of every name, keeping initial capitalisation.

    Args:
        names_list: list of names; anything else yields None.
        stemmer: callable taking a word and returning its stem.

    Returns:
        A list with one space-joined stemmed name per input name. A stem is
        passed through ``str.capitalize`` when the original word started with
        an upper-case letter.
    """
    if not isinstance(names_list, list):
        return None

    def stem_word(word):
        stemmed = stemmer(word)
        return stemmed.capitalize() if word[0].isupper() else stemmed

    return [
        ' '.join(stem_word(word) for word in name.split())
        for name in names_list
    ]

In [16]:
# Stem the filtered names with the Ukrainian stemmer for rows whose language
# is 'uk' and with the Russian stemmer for every other row.
entities['stem_pers_list'] = entities.apply(lambda row: \
                                            stem_names(row.filt_pers_list, stemmer_ukr.stem_word) if row.language=='uk'\
                                            else stem_names(row.filt_pers_list, stemmer_ru.stem), axis=1)

In [17]:
def get_similarity_ratio_from_list(per_list, ratio=fuzz.partial_ratio):
    """Score every unordered pair of names with ``ratio``.

    Args:
        per_list: list of names; anything else yields None.
        ratio: callable taking two strings and returning their similarity.

    Returns:
        A list of ``(i, j, score)`` tuples for all ``i < j``, ordered by ``i``
        then ``j``, or None when there is no pair to score.
    """
    if not isinstance(per_list, list):
        return None
    total = len(per_list)
    scores = [
        (left, right, ratio(per_list[left], per_list[right]))
        for left in range(total)
        for right in range(left + 1, total)
    ]
    return scores or None

In [19]:
%%time
# Pairwise similarity of the stemmed names using the default ratio (fuzz.partial_ratio).
entities['stem_partial_ratio'] = entities.stem_pers_list.apply(get_similarity_ratio_from_list)

CPU times: user 1min 33s, sys: 1.65 s, total: 1min 34s
Wall time: 1min 39s


In [20]:
%%time
# Same pairwise comparison, scored with fuzz.WRatio instead of the default.
entities['stem_wratio'] = entities.stem_pers_list.apply(lambda x: get_similarity_ratio_from_list(x, fuzz.WRatio))

CPU times: user 6min 29s, sys: 4.48 s, total: 6min 33s
Wall time: 6min 49s


#### Form similarity dicts

In [21]:
def _surname_prefix(full_name):
    """Return the first whitespace plus two word characters after it, or None."""
    found = re.search(r'\s\w{2}', full_name)
    return found.group() if found else None


def get_similarity_dicts(pers_list, wratios, partial_ratios, top_lim=90, bot_lim=70, mid_lim=80):
    """Map each name index to the later indices judged similar to it.

    Args:
        pers_list: names, addressed by the indices stored in the ratio tuples.
        wratios: list of ``(i, j, score)`` tuples.
        partial_ratios: list of ``(i, j, score)`` tuples in the same order as
            ``wratios``; anything that is not a list yields None.
        top_lim: a pair whose wratio reaches this value is accepted outright.
        bot_lim: a pair whose wratio is below this value is rejected.
        mid_lim: in the band between ``bot_lim`` and ``top_lim`` a pair is
            accepted when either its wratio or its partial ratio reaches this
            value.

    In the middle band, when both names match ``two_cap_pat`` and both start
    with a dictionary name, the pair is rejected if the dictionary names
    differ or if the two characters following the first whitespace differ.

    Returns:
        A dict ``{i: [j, ...]}`` with a key for every first index in
        ``wratios`` (the list may be empty), or None.
    """
    if not isinstance(partial_ratios, list):
        return None
    res = {}
    for pos, tp in enumerate(wratios):
        first, second, wratio = tp[0], tp[1], tp[2]
        # Every first index gets a key, even when nothing is similar to it.
        similar = res.setdefault(first, [])
        if wratio >= top_lim:
            similar.append(second)
            continue
        if wratio < bot_lim:
            continue
        str1 = pers_list[first]
        str2 = pers_list[second]
        if two_cap_pat.match(str1) and two_cap_pat.match(str2):
            uk_name1 = names_string_pat.match(str1)
            uk_name2 = names_string_pat.match(str2)
            if uk_name1 and uk_name2:
                # Match objects compare by identity, so the matched text is
                # compared instead; comparing the objects is always unequal.
                if uk_name1.group() != uk_name2.group():
                    continue
                # Same first name: the second words must at least start alike.
                # A second word shorter than two characters yields None here.
                if _surname_prefix(str1) != _surname_prefix(str2):
                    continue
        if (wratio >= mid_lim) or (partial_ratios[pos][2] >= mid_lim):
            similar.append(second)
    return res

In [22]:
# Build the per-row similarity dict from the filtered (unstemmed) names and
# the two ratio lists computed on the stemmed names.
entities['similarity_dicts'] = entities.apply(lambda row: \
                                              get_similarity_dicts(row.filt_pers_list,\
                                                                    row.stem_wratio, \
                                                                    row.stem_partial_ratio), axis=1)



In [23]:
def print_names_dicts_ratios(id):
    """Print the stemmed names, WRatio tuples and similarity dict of one row.

    Args:
        id: index label of the row in the global ``entities`` frame.
    """
    row = entities.loc[id]
    fields = (
        ('names:', 'stem_pers_list'),
        ('ratios:', 'stem_wratio'),
        ('dicts:', 'similarity_dicts'),
    )
    for label, column in fields:
        print(label, row[column])

#### Transform dicts to lists 

In [24]:
def get_ratio_by_ind(i, j, tuples):
    """Look up the ratio stored for the index pair ``(i, j)``.

    Args:
        i: first name index.
        j: second name index.
        tuples: iterable of ``(i, j, ratio)`` tuples.

    Returns:
        The ratio of the first tuple whose indices equal ``(i, j)``, or None
        when no such tuple exists.
    """
    return next(
        (entry[2] for entry in tuples if entry[0] == i and entry[1] == j),
        None,
    )

def dict_ratio_is_bigger(sim_dict, wratios, key, value, max_ratio):
    """Tell whether another key is more similar to ``value`` than ``max_ratio``.

    Only keys that are not listed as similar to ``key`` and that list
    ``value`` among their own similar indices are considered.

    Returns:
        True when such a key has a ratio with ``value`` above ``max_ratio``,
        otherwise False.
    """
    return any(
        get_ratio_by_ind(other, value, wratios) > max_ratio
        for other, other_similar in sim_dict.items()
        if other not in sim_dict[key] and value in other_similar
    )


def value_ratio_is_bigger(value, v_sim, max_ratio, wratios):
    """Tell whether ``value`` has a ratio above ``max_ratio`` with any of ``v_sim``.

    Args:
        value: name index used as the first index of the looked-up pairs.
        v_sim: indices to pair with ``value``.
        max_ratio: ratio that has to be exceeded.
        wratios: ``(i, j, ratio)`` tuples used for the lookups.
    """
    return any(
        get_ratio_by_ind(value, other, wratios) > max_ratio
        for other in v_sim
    )
        
    
def sim_dicts_to_lists(sim_dict, wratios):
    """Turn a similarity dict into a list of groups of name indices.

    Args:
        sim_dict: dict mapping a name index to the list of indices similar to
            it; a missing value yields None.
        wratios: ``(i, j, ratio)`` tuples used to compare competing matches.

    Returns:
        A list of index lists. Keys are processed in dict order and an index
        already placed in an earlier group is not placed again.
    """
    if pd.notna(sim_dict):
        res = [] 
        for key, value in sim_dict.items():
            same = []
            # Indices already placed in one of the groups built so far.
            res_list = [item for sublist in res for item in sublist]
            if key not in res_list:
                if len(value)==0:
                    # Nothing is similar to this index: it forms its own group.
                    res.append([key])
                else:
                    same.append(key)
                    for v in value:
                        if v not in res_list:
                            curr_ratio = get_ratio_by_ind(key, v, wratios)
                            # Leave v out when another key (one not similar to
                            # `key`) lists v with a higher ratio.
                            if not dict_ratio_is_bigger(sim_dict, wratios, key, v, curr_ratio):
                                v_sim = sim_dict.get(v, [])
                                if len(v_sim) > 0:
                                    if set(v_sim).issubset(set(value)):
                                        # Everything similar to v is also similar to key.
                                        same.append(v)
                                    else:
                                        # v is closer to one of its own matches
                                        # than to key: leave it for that group.
                                        if value_ratio_is_bigger(v, v_sim, curr_ratio, wratios):
                                            continue
                                        else:
                                            same.append(v)
                                else:
                                    same.append(v)
            if len(same)>0:
                res.append(same)
        # NOTE(review): dicts built by get_similarity_dicts have keys 0..n-2, so
        # len(sim_dict) is presumably the last name index, which never appears
        # as a key; it is added as its own group if still ungrouped - confirm.
        if len(sim_dict) not in [item for sublist in res for item in sublist]:
            res.append([len(sim_dict)])
                            
        return res    
    return None

In [25]:
%%time
# Group name indices per row from the similarity dict and the WRatio tuples.
entities['names_sets'] = entities.apply(lambda row: sim_dicts_to_lists(row.similarity_dicts, row.stem_wratio), axis=1)

CPU times: user 30.6 s, sys: 131 ms, total: 30.7 s
Wall time: 31.4 s


In [26]:
def names_sets_to_lists(names_sets):
    """Serialize groups of indices, e.g. ``[[0, 6], [2]]`` -> ``'0+6;2'``.

    Returns:
        Groups joined with ``;`` and the members of a group joined with
        ``+``, or None when ``names_sets`` is not a list.
    """
    if not isinstance(names_sets, list):
        return None
    groups = []
    for group in names_sets:
        groups.append('+'.join(str(member) for member in group))
    return ';'.join(groups)

In [32]:
# String form of the index groups, e.g. '0+6;1+4;2'.
entities['names_sets_str'] = entities['names_sets'].apply(names_sets_to_lists)

In [None]:
def fix_lists(entities, id, right_str='0+6;1+4;2;3;5;7'):
    """Manually override the name grouping of one row and rebuild derived columns.

    Args:
        entities: frame holding ``names_sets_str`` and ``filt_pers_list``;
            it is modified in place.
        id: index label of the row to correct.
        right_str: corrected grouping in the ``names_sets_str`` format
            (groups separated by ``;``, members by ``+``). The default is the
            grouping this function was originally hard-coded with.

    After the override, ``names_sets`` and ``string_names_sets`` are
    recomputed for every row.
    """
    entities.loc[id, 'names_sets_str'] = right_str
    # NOTE(review): split_name_sets_to_list is not defined in the visible part
    # of this file - it must exist before this function is called.
    entities['names_sets'] = entities.names_sets_str.apply(split_name_sets_to_list)
    entities['string_names_sets'] = entities.apply(lambda row: get_names_by_num(row.names_sets, row.filt_pers_list), axis=1)
    

In [36]:
def get_names_by_num(indices, names):
    """Replace every index in every group by the name at that position.

    Args:
        indices: list of index groups; anything else yields None.
        names: sequence the indices refer to.

    Returns:
        A list of name groups with the same shape as ``indices``.
    """
    if not isinstance(indices, list):
        return None
    return [[names[position] for position in group] for group in indices]

In [41]:
# Translate the index groups back into groups of the filtered (unstemmed) names.
entities['string_names_sets'] = entities.apply(lambda row: get_names_by_num(row.names_sets, row.filt_pers_list), axis=1)


#### Select one name that represents every list of similar names

In [42]:
def get_one_name_from_set(name_sets, stemmer):
    """Choose one representative per group of names and return it stemmed.

    The representative is the first name in the group that starts with a
    dictionary name (``names_string``) and has the two-capital pattern; when
    no name qualifies, the longest name of the group is used.

    Args:
        name_sets: list of name groups; anything else yields None.
        stemmer: callable taking a word and returning its stem.

    Returns:
        A list with one stemmed representative per group. A stem is passed
        through ``str.capitalize`` when the original word started with an
        upper-case letter.
    """
    if not isinstance(name_sets, list):
        return None

    def looks_like_full_name(candidate):
        starts_with_name = re.match(names_string, candidate)
        has_two_capitals = re.search(r'[А-ЯЇҐЄІ].* .*[А-ЯЇҐЄІ]', candidate)
        return starts_with_name and has_two_capitals

    def stem_word(word):
        stemmed = stemmer(word)
        return stemmed.capitalize() if word[0].isupper() else stemmed

    representatives = []
    for group in name_sets:
        longest = max(group, key=len)
        chosen = next(
            (candidate for candidate in group if looks_like_full_name(candidate)),
            longest,
        )
        representatives.append(' '.join(stem_word(word) for word in chosen.split()))
    return representatives

In [43]:
# Pick one stemmed representative per name group, using the Ukrainian stemmer
# for rows whose language is 'uk' and the Russian stemmer otherwise.
entities['one_name_per_set'] = entities.apply(lambda row: \
                                            get_one_name_from_set(row.string_names_sets, stemmer_ukr.stem_word) if row.language=='uk'\
                                            else get_one_name_from_set(row.string_names_sets, stemmer_ru.stem), axis=1)

In [44]:
def lists_to_str(name_sets):
    """Serialize groups of names into one string.

    Returns:
        Groups joined with ``<@>`` and the names of a group joined with
        ``<+>``, or None when ``name_sets`` is not a list.
    """
    if not isinstance(name_sets, list):
        return None
    return '<@>'.join('<+>'.join(group) for group in name_sets)
    
def list_to_str(name_sets):
    """Join a flat list of names with ``<@>``.

    Returns:
        The joined string, or None when ``name_sets`` is not a list.
    """
    if not isinstance(name_sets, list):
        return None
    separator = '<@>'
    return separator.join(name_sets)

def ratio_to_str(ratio_list):
    """Serialize ``(i, j, ratio)`` tuples, e.g. ``[(0, 1, 90)]`` -> ``'0+1+90'``.

    Returns:
        Tuples joined with ``;`` and their first three items joined with
        ``+``, or None when ``ratio_list`` is not a list.
    """
    if not isinstance(ratio_list, list):
        return None
    return ';'.join(
        '+'.join(str(entry[position]) for position in (0, 1, 2))
        for entry in ratio_list
    )

In [45]:
# String versions of the representatives and of the name groups for CSV export.
entities['one_name_per_set_str'] = entities.one_name_per_set.apply(list_to_str)
entities['string_names_sets_str'] = entities.string_names_sets.apply(lists_to_str)
# Disabled: string version of the WRatio tuples.
# entities['stem_wratio_str'] = entities.stem_wratio.apply(ratio_to_str)              

#### Get name entities that go with keywords and filter them

In [50]:
# Parse the keyword/name tuples and run the first filtering pass with the
# default tuple positions (text at index 2, score at index 3).
entities['kw_names_tuples'] = entities.names_and_kw_str.apply(get_kw_name_tuples)
entities['filt_kw_names_tuples'] = entities['kw_names_tuples'].apply(filter_by_score, iteration=1)

# Second pass (no dictionary-name check), then keep only the entity texts.
entities['filt_kw_names_tuples'] = entities['filt_kw_names_tuples'].apply(filter_by_score, iteration=2)
entities['filt_kw_names_list'] = entities['filt_kw_names_tuples'].apply(tuples_to_list)

In [52]:
# Join each row's list of filtered keyword names into one '§'-separated string.
entities['filt_kw_names_str'] = entities.filt_kw_names_list.str.join('§')

In [54]:
# Save the original columns plus the new string columns.
# NOTE(review): `filepath` is the same path the table was read from, so this
# overwrites the input file - confirm that is intended.
entities[['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str', 
          'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str']].to_csv(filepath)