### 6. Search for filtered names in the dictionary with all names and name forms

In [1]:
import pandas as pd
from uk_stemmer import UkStemmer
from nltk import stem
import re
from collections import Counter
import json

In [2]:
# NLTK Snowball stemmer configured for "russian", plus a UkStemmer instance
# (presumably for Ukrainian text -- confirm against the uk_stemmer package).
# Neither object is referenced by the other cells visible in this section.
stemmer_ru = stem.snowball.SnowballStemmer("russian") 
stemmer_ukr = UkStemmer()

### Searching for names

In [3]:
# Load the entities table; the first CSV column becomes the index.
# The same path is written back to later in this notebook.
entities_path = '../data/entities_may.csv'
entities = pd.read_csv(entities_path, index_col=[0])
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
       'checked_with_conllu', 'checked_names_str', 'kw_names_unique_str'],
      dtype='object')

In [102]:
# file with all names and name forms gathered during previous months
# (first CSV column becomes the index; `all_vars_str` holds the '§'-separated
# name variants that the variant search below matches against)
names_dict_path = 'joined_names_dict.csv'
names_df = pd.read_csv(names_dict_path, index_col=[0])
names_df.head()

Unnamed: 0_level_0,num,stem_name,all_vars_str,id
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Володимир Зеленськ,18633.0,Володимир Зеленськ,Володимир Зеленськ Перестановк§Волод Зеленск§*...,0.0
Макс Степан,14884.0,Макс Степан,Очільник Моз Макс Степан§Здоров Макс Степан§Ма...,1.0
Денис Шмигал,9970.0,Денис Шмигал,Глав Кабмін Денис Шмигал§Деніс Шмигал§Денис Шм...,2.0
Дональд Трамп,5823.0,Дональд Трамп,Дональл Трамп§Дональд Трамп Сша§Дональд Трампа...,3.0
Джо Байден,6450.0,Джо Байден,Дже Байден§: Джо Байден§Демократ Джозеф Байден...,4.0


In [6]:
# import ast
# entities['names_from_dict'] = entities.names_from_dict.apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)

In [105]:
# Turn the serialized name columns back into Python lists
# ('§' separates unique keyword names, '<@>' separates the per-set names).
entities['unique_kw_names'] = entities['kw_names_unique_str'].str.split('§')
entities['set_names'] = entities['one_name_per_set_str'].str.split('<@>')

In [6]:
def kw_name_str_to_list(names_and_kw_str):
    """Parse a serialized entity string into a list of lists.

    Entities are separated by '@+@'; the parts of one entity are separated
    by '<+>'. Missing input (None/NaN) yields None.
    """
    if not pd.notna(names_and_kw_str):
        return None
    return [entity.split('<+>') for entity in names_and_kw_str.split('@+@')]

In [8]:
def stem_names(names_list, stemmer):
    """Stem every whitespace-separated word of every name in `names_list`.

    `stemmer` is called as ``stemmer(word)`` and must return a string, so pass
    a callable (for instance a stemmer object's bound stemming method).
    When the original word starts with an uppercase letter, the stem is passed
    through ``str.capitalize`` (first letter upper, the rest lower).

    Returns a list with one space-joined stemmed name per input name, or None
    when `names_list` is not a list.
    """
    if not isinstance(names_list, list):
        return None

    def _stem_word(word):
        stemmed = stemmer(word)
        return stemmed.capitalize() if word[0].isupper() else stemmed

    return [' '.join(_stem_word(word) for word in name.split())
            for name in names_list]

In [109]:
# Result columns filled by the matching passes below:
# `names_from_dict` will hold a {position: dictionary name} dict per row,
# `found` marks rows that need no further lookup.
entities['names_from_dict'] = None
entities['found'] = False

In [113]:
# Dictionary rows with a positive `num`. `find_names_in_dict` reads this
# module-level name when it is called, so it must be bound before that pass.
top_names = names_df[(names_df.num>0)]
top_names.shape

(10874, 4)

In [114]:
def find_names_in_dict(stem_names, found_names, found_flag, known_names=None):
    """Match a row's names against the dictionary by exact string equality.

    Parameters
    ----------
    stem_names : list of str, or anything else
        Names of one row, addressed by position. A non-list value (for
        example NaN for a row without names) yields None.
    found_names : dict or None/NaN
        Matches already recorded for the row as {position: dictionary name}.
        Positions present here are kept as they are.
    found_flag : bool
        When truthy the row is skipped and None is returned.
    known_names : container of str, optional
        Names to match against. Defaults to the index of the module-level
        `top_names` frame, looked up at call time. Passing it explicitly
        makes the result independent of later re-bindings of `top_names`.

    Returns
    -------
    dict or None
        A new {position: name} dict holding the previous matches plus the new
        ones, or None when there is nothing to report.
    """
    if not isinstance(stem_names, list) or found_flag:
        return None
    if known_names is None:
        # Membership on a pandas Index is a hash lookup; the former
        # `in top_names.index.values` scanned the whole array for every name.
        known_names = top_names.index
    # Copy so the dict stored in the caller's frame is never mutated in place.
    res = dict(found_names) if pd.notna(found_names) else {}
    for i, name in enumerate(stem_names):
        if i not in res and name in known_names:
            res[i] = name
    return res or None

In [115]:
%%time
part = entities.apply(lambda row: find_names_in_dict(row.set_names, row.names_from_dict, row.found), axis=1)
entities['names_from_dict'].update(part)

CPU times: user 1min 55s, sys: 2.47 s, total: 1min 57s
Wall time: 2min 3s


In [117]:
# A row counts as found when a match exists for every one of its names.
entities['found'] = entities.apply(
    lambda row: bool(pd.notna(row.names_from_dict)
                     and len(row.names_from_dict) == len(row.set_names)),
    axis=1,
)

In [118]:
# Number of rows whose names were all matched by the exact-match pass.
entities.found.sum()

20932

In [119]:
# Module-level caches appended to by `find_names_in_variants`:
# `not_found` - names that matched no dictionary entry,
# `variants`  - names that matched more than one dictionary entry.
# Names in either list are skipped on later lookups.
not_found=[]
variants=[]

In [120]:
# Re-binds `top_names` to the rows that have a variants string.
# NOTE: `find_names_in_dict` reads the same global, so re-running the
# exact-match pass after this cell would match against this subset instead.
top_names = names_df[names_df.all_vars_str.notna()]
top_names.shape

(6780, 4)

In [121]:
def find_names_in_variants(found_names, names, found,
                           names_table=None, not_found_cache=None, variants_cache=None):
    """Resolve a row's still-unmatched names through the variants column.

    A name matches a dictionary row when it equals one whole '§'-separated
    item of that row's `all_vars_str`. Exactly one matching row resolves the
    name to that row's `stem_name`; several matching rows or none only record
    the name in the corresponding cache so it is not searched again.

    Parameters
    ----------
    found_names : dict or None/NaN
        Matches already recorded for the row as {position: dictionary name}.
    names : list of str, or anything else
        Names of one row, addressed by position. A non-list yields None.
    found : bool
        When truthy the row is skipped and None is returned.
    names_table : DataFrame, optional
        Frame with `all_vars_str` and `stem_name` columns. Defaults to the
        module-level `top_names`, looked up at call time.
    not_found_cache, variants_cache : list, optional
        Lists that collect names with no match / with several matches.
        Default to the module-level `not_found` and `variants`.

    Returns
    -------
    dict or None
        A new {position: name} dict with previous and new matches, or None
        when there is nothing to report.
    """
    if not isinstance(names, list) or found:
        return None

    # Copy so the dict stored in the caller's frame is never mutated in place.
    res = dict(found_names) if pd.notna(found_names) else {}
    pending = [(i, name) for i, name in enumerate(names) if i not in res]

    if pending:
        # Fall back to the notebook-level objects only when a lookup is needed.
        if names_table is None:
            names_table = top_names
        if not_found_cache is None:
            not_found_cache = not_found
        if variants_cache is None:
            variants_cache = variants

    for i, name in pending:
        if name in not_found_cache or name in variants_cache:
            continue
        nf = re.escape(name)
        # The name must span a whole item: bounded by '§' or the string ends.
        pat = rf'§{nf}§|^{nf}§|§{nf}$|^{nf}$'
        found_part = names_table[names_table.all_vars_str.str.contains(pat, na=False)]
        if len(found_part) == 1:
            res[i] = found_part['stem_name'].iloc[0]
        elif len(found_part) > 1:
            variants_cache.append(name)
        else:
            not_found_cache.append(name)

    return res or None

In [1]:
%%time
part = entities.apply(lambda row: find_names_in_variants(row.names_from_dict, row.set_names, row.found), axis=1)

entities['names_from_dict'].update(part)


In [69]:
# Rows not marked as found that do have keyword names.
entities[entities['found'].eq(False) & entities['unique_kw_names'].notna()].shape

(5949, 17)

In [71]:
# Accumulator that the caller passes to `count_missing`.
missing = []
def count_missing(stem_names, found_names, found_flag, missing):
    """Append to `missing` the names of one row that have no recorded match.

    Parameters
    ----------
    stem_names : list of str, or anything else
        Names of one row, addressed by position. A non-list is ignored.
    found_names : dict or None/NaN
        {position: dictionary name} matches recorded for the row.
    found_flag : bool
        When truthy the row is ignored.
    missing : list
        Extended in place with each unmatched name once per row.

    Returns
    -------
    None
    """
    if not isinstance(stem_names, list) or found_flag:
        return
    found = found_names if pd.notna(found_names) else {}
    unmatched = [name for i, name in enumerate(stem_names) if i not in found]
    # dict.fromkeys drops duplicates but keeps first-seen order. The former
    # list(set(...)) produced an order that changes between interpreter runs,
    # which made the tie order of the later frequency table irreproducible.
    missing.extend(dict.fromkeys(unmatched))
        

In [72]:
# Collect into `missing` the unmatched entries of `unique_kw_names` for every
# row not marked as found (the returned Series of None is only a by-product).
# NOTE(review): the keys of `names_from_dict` are positions in `set_names`
# (that is the list the matching passes iterate), while the positions checked
# here index `unique_kw_names` -- confirm the two lists are aligned.
entities.apply(lambda row: count_missing(row.unique_kw_names, row.names_from_dict, row.found, missing), axis=1)

id
2075213    None
2077435    None
2076296    None
2081964    None
2081965    None
           ... 
2072553    None
2072554    None
2072555    None
2072556    None
2072557    None
Length: 151397, dtype: object

In [73]:
# Frequency of each unmatched name, most frequent first; equal counts keep
# the order in which the names were first collected.
missing_counts = dict(Counter(missing).most_common())
missing_counts

{'Оос': 173,
 'Гарр': 110,
 'Ляшк': 94,
 'премєр': 73,
 'Даш': 62,
 'Вооз': 61,
 'Сар': 55,
 'Юл': 53,
 'Маск': 50,
 'Сбу': 43,
 'Жан': 43,
 'Олен': 43,
 'Байд': 42,
 'журналист': 41,
 'Ольг': 36,
 'Анн': 34,
 'чиновниц': 34,
 'Іран': 33,
 'Тар': 33,
 'Наст': 32,
 'Зірк': 32,
 'Мар': 31,
 'Лод': 31,
 'Серг': 30,
 'Р': 30,
 'Ірин': 28,
 'очевидц': 26,
 'Ганц': 25,
 'Хамас': 24,
 'Кус': 24,
 'Ал': 21,
 'Мкіп': 21,
 'Бен': 20,
 'Діан': 20,
 'Кив': 19,
 'Андр': 18,
 'Рааб': 18,
 'Офіс': 18,
 'Віл': 18,
 'Слав': 18,
 'Маш': 18,
 'Макс': 17,
 'Усик': 17,
 'Білл': 17,
 'Галявієв': 17,
 'Дзерб': 17,
 'епідеміолог': 17,
 'Ян': 16,
 'Авак': 16,
 'очевидец': 16,
 'А': 16,
 'Іванга': 15,
 'Кміс': 15,
 'Маас': 15,
 'Лир': 15,
 'Юр': 15,
 'Павл': 14,
 'Шрі': 14,
 'Над': 14,
 'Баст': 13,
 'Сант': 13,
 'Белт': 13,
 'Ганн': 13,
 'Саш': 13,
 'Ігор': 12,
 'Псак': 12,
 'Міш': 12,
 'Злат': 12,
 'чоловік': 12,
 'Ле': 12,
 'Олег': 12,
 'М і К . Ми з колег із правоохоронн орган': 11,
 'Ухан': 11,
 'Едвард Чо'

In [None]:
### adding missing names to names dict

# missing_names = [name for name in missing_counts.keys() if re.search(r'[А-ЯЇҐЄІ]\S* [А-ЯЇҐЄІ]\S*', name)]
# for name in missing_names:
#     names_df.loc[name, 'num'] = missing_counts[name]
#     names_df.loc[name, 'id'] = names_df.id.max()+1
# names_df.to_csv(names_dict_path)

In [84]:
# Per row: the matched dictionary names ordered by name position.
entities['names_found'] = entities['names_from_dict'].apply(
    lambda matches: [matches[pos] for pos in sorted(matches)] if pd.notna(matches) else None
)

In [86]:
# Serialize the matched names of each row as one '§'-separated string.
entities['found_names_str'] = entities['names_found'].apply(
    lambda matched: '§'.join(matched) if isinstance(matched, list) else None
)

In [87]:
# Number of rows that have at least one matched name.
entities.found_names_str.notna().sum()

71557

In [92]:
# Column overview before saving.
entities.columns

Index(['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
       'checked_with_conllu', 'checked_names_str', 'kw_names_unique_str',
       'unique_kw_names', 'names_from_dict', 'found', 'names_found',
       'found_names_str', 'dict_str'],
      dtype='object')

In [94]:
# Write the selected columns back to `entities_path`, i.e. over the file the
# table was loaded from.
# NOTE: 'dict_str' is created by a cell that appears further down in this
# notebook, so a top-to-bottom run reaches this line before the column exists.
entities[['link', 'language', 'all_ent_str', 'name_and_kw', 'kw_and_ent',
       'names_and_kw_str', 'names_str', 'names_sets_str',
       'one_name_per_set_str', 'string_names_sets_str', 'filt_kw_names_str',
       'checked_with_conllu', 'checked_names_str', 'kw_names_unique_str',
       'found', 'found_names_str', 'dict_str']].to_csv(entities_path)

In [96]:
# Load the news table; the first CSV column becomes the index.
news_filepath = '../data/may.csv'
news = pd.read_csv(news_filepath, index_col=[0])
news.columns

Index(['title', 'text', 'subtitle', 'link', 'domain', 'datetime', 'views',
       'created_at', 'category', 'language', 'domain_alias', 'mycategory'],
      dtype='object')

In [99]:
# Column assignment from another frame aligns on index labels.
# NOTE(review): assumes `news` and `entities` are indexed by the same ids;
# labels present only in `news` would get NaN -- confirm.
news['found_names_str'] = entities['found_names_str']

In [100]:
# Overwrites the file that `news` was loaded from, now with `found_names_str`.
news.to_csv(news_filepath)

In [88]:
def dict_to_str(names_from_dict):
    """Serialize a {position: name} mapping into a single string.

    Every item becomes 'position§name' and the items are joined with '<+>'
    in the mapping's iteration order. Missing input (None/NaN) yields None.
    """
    if not pd.notna(names_from_dict):
        return None
    pairs = ('§'.join((str(pos), name)) for pos, name in names_from_dict.items())
    return '<+>'.join(pairs)

In [89]:
# Rows that carry a match dictionary.
entities[entities.names_from_dict.notna()].shape

(71557, 19)

In [90]:
# Serialize each row's match dictionary with `dict_to_str`
# ('position§name' items joined by '<+>'); this is the 'dict_str' column
# that the CSV export of `entities` selects.
entities['dict_str'] = entities.names_from_dict.apply(dict_to_str)