### 7_Dealing_with_names_counts_dictionaries

##### After counting checked unique name mentions and saving this counts to dataframe, we join different name forms and save these name variations to use them for searching later

In [143]:
import pandas as pd
import re
import json
from collections import Counter

#### Functions for find name variations

In [115]:
names_dict_path = 'joined_names_dict.csv'
names_counts_df = pd.read_csv(names_dict_path, index_col=[0])
names_counts_df.columns

Index(['name', 'num', 'stem_name', 'all_vars_str', 'id'], dtype='object')

In [116]:
names_counts_df['all_vars'] = names_counts_df.all_vars_str.str.split('§')

In [117]:
def find_lang_variants(name, used_names, ending = '\w{,2}', min_len = (1,3)):
    
    name_var_split = name.split()
    if len(name_var_split)==2:
        if len(name_var_split[0])>min_len[0] and len(name_var_split[1])>min_len[1]:
            name_var = re.sub(r'Во?л(о|а)д', 'Во?л(о|а)д', name_var_split[0])
            name_var = re.sub(r'Паве?л', 'Паве?л', name_var)
            name_var = re.sub(r'Фед(і|о)р', 'Фед(і|о)р', name_var)
            name_var = re.sub(r'В\'?яч', 'В\'?яч', name_var)
            name_var = re.sub(r'Генн?ад', 'Генн?ад', name_var)
            name_var = re.sub(r'Кон?ст(я|а)нт', 'Кон?ст(я|а)нт', name_var)
            name_var = re.sub(r'(Е|О)лен', '(Е|О)лен', name_var)
            name_var = re.sub(r'(М|Н)ик', '(М|Н)ик', name_var)
            name_var = re.sub(r'(О|А)л(е|є|э)кс', '(О|А)л(е|є|э)кс', name_var)
            name_var = re.sub(r'(Х|К)рист', '(Х|К)рист', name_var)
            name_var = re.sub(r'Св(і|е)тл', 'Св(і|е)тл', name_var)
            name_var = re.sub(r'Е?(К|к)атер', 'Е?(К|к)атер', name_var)
            name_var = re.sub(r'Уль?я', 'Уль?я', name_var)

            if name_var != name_var_split[0]:
                upd_part = name_var_split[1]
            else:
                upd_part = name

            upd_part = re.sub(r'[ІИЇЙ]', '(І|И|Ї|Й)', upd_part)
            upd_part = re.sub(r'[ГХҐ]', '(Г|Х|Ґ)', upd_part)
            upd_part = re.sub(r'[хгґ]', '(х|г|ґ)', upd_part)
            upd_part = re.sub(r'[іиїйы]', '(і|и|ї|й|ы)', upd_part)
            upd_part = re.sub(r'[ЕЄЭ]', '(Є|Е|Э)', upd_part)
            upd_part = re.sub(r'[еєэ]', '(е|є|э)', upd_part)
            upd_part = re.sub(r'сь?к', 'сь?к', upd_part)
            upd_part = re.sub(r'ць?к', 'ць?к', upd_part)

            if name_var != name_var_split[0]:
                upd_part = name_var + '?\w{,1} ' + upd_part + ending
            else:
                new_split = upd_part.split()
                upd_part = new_split[0] + '?\w{,1} ' + new_split[1] + ending

            used_names.append(name)
           
            try:
                variants = list(names_counts_df[(~names_counts_df.name.isin(used_names))&(names_counts_df.name.str.contains(rf'^{upd_part}$'))].name.values)
            except Exception as e:
                print(e)
                print(name, ':', upd_part)
                return None

            if len(variants) > 0:
                used_names += variants
                return variants
    return None

In [118]:
used_names = []
lang_vars = names_counts_df[names_counts_df.num>3].name.apply(lambda x: find_lang_variants(x, used_names))

  return func(self, *args, **kwargs)


In [124]:
lang_vars[lang_vars.notna()]

2                                 [Денисш Шмигал]
6                              [Володимир Путіна]
13                                  [Алекс Даныл]
34                                 [Анґел Меркел]
36                                  [Сер Марченк]
                           ...                   
115460    [Метт Лебланк, Метт Леблан, Мет Леблан]
115462                          [Деніс Хмілевськ]
115463                             [Майкл Патрик]
115469                          [Вячеслав Маслюк]
115471                             [Артур Білоус]
Name: name, Length: 655, dtype: object

In [119]:
def find_contains_variants(name, used_names, min_len = (1,3)):
    name_var_split = name.split()
    if len(name_var_split)==2:
        if len(name_var_split[0])>min_len[0] and len(name_var_split[1])>min_len[1]:
            used_names.append(name)
            try:
                variants = list(names_counts_df[(~names_counts_df.name.isin(used_names))&(names_counts_df.name.str.contains(name))].name.values)
            except Exception as e:
                print(e)
                print(name)
                return None
            
            if len(variants) > 0:
                used_names += variants
                return variants
    return None

In [120]:
used_names2 = []
cont_vars = names_counts_df[names_counts_df.num>3].name.apply(lambda x: find_contains_variants(x, used_names2))

In [122]:
cont_vars[cont_vars.notna()]

0         [Офіс Володимир Зеленськ, йшло під час нарад п...
1                                         [Соз Макс Степан]
2                            [Кабінет міністр Денис Шмигал]
4         [Джо Байден / Фот : Flickr, Джо Байден 46-м пр...
6         [Володимир Путіна, Володимир Путін Віктор Медв...
                                ...                        
115388                                         [Паул Дибал]
115396                                        [Ар Зограбян]
115424                             [Колишн Козловськ Рамін]
115449                        [Фот з інстаграм Тесс Голлід]
115463                                  [Майкл Патрік Кінг]
Name: name, Length: 385, dtype: object

In [125]:
names_counts_df['vars1'] = None
names_counts_df['vars2'] = None

names_counts_df.vars1.update(lang_vars)
names_counts_df.vars2.update(cont_vars)

In [126]:
names_counts_df = names_counts_df.set_index('name', drop=False)

In [127]:
def add_same_names_counts(names_counts_df, variants):
    res_df = pd.DataFrame(columns=list(names_counts_df.columns)+['adj_names'])
    for i, r in names_counts_df[names_counts_df[variants].notna()].iterrows():
        names_list = [r.name] + r[variants]
        for sec_name in r[variants]:
            try:
                sec_vars = names_counts_df.loc[sec_name, variants]
            except:
                print(i, sec_name)
                continue   
            if isinstance(sec_vars, list):
                names_list += sec_vars

        res_df.loc[r.name] = names_counts_df[names_counts_df.name.isin(names_list)].sum()
        names_list.remove(r.name)
        res_df.loc[r.name, 'adj_names'] = '§'.join(names_list)
    return res_df

In [133]:
sum_df = add_same_names_counts(names_counts_df, 'vars2')
sum_df['adj_list'] = sum_df.adj_names.str.split('§')

Володимир Путін Володимир Путіна
Мірч Луческ Мірч Луческе
Борис Філат Борис Філатов
Ольг Михайл Ольг Михайлюк
Олександр Старух Олександр Старухін
Віктор Петр Віктор Петрук
Олександр Щерб Олександр Щербук
Олег Синєгуб Олег Синєгубов
Серг Іван Серг Іванов
Серг Черн Серг Чернег
Олександр Денис Олександр Денисов
Александр Мясник Александр Мясников
Олександр Борняк Олександр Борняков
Дмитр Раим Дмитр Раимов
Дан Богатыр Дан Богатырев
Михайл Булгак Михайл Булгаков
Майкл Макфол Майкл Макфолл
Богдан Цимбал Богдан Цимбалюк
Дмитр Шпен Дмитр Шпенов
Віктор Андрус Віктор Андрусів
Ан Колесник Ан Колесников
Олександр Сладк Олександр Сладков
Віктор Ткач Віктор Ткачук
Віктор Черн Віктор Черниш
Віктор Черн Віктор Черняк
Олександр Лукаш Олександр Лукашук
Андр Аксьон Андр Аксьонов
Борис Колеснік Борис Колесніков
Олександр Петр Олександр Петрун
Дмитр Іван Дмитр Іванц
Олен Борис Олен Борисенк
Андр Черн Андр Чернец
Кат Репях Кат Репяхов
Юр Сула Юр Сулаев
Серг Сорок Серг Сорокін
Серг Сорок Серг Сорокин
Тумс Аб

In [129]:
backup = names_counts_df.copy()

In [44]:
# names_counts_df = backup.copy()

In [195]:
names_counts_df.shape

(127495, 10)

In [135]:
variants = 'vars2'
deleted = []
for i, r in names_counts_df[names_counts_df[variants].notna()].iterrows():
    if r.name not in deleted:
        adj = sum_df.loc[r.name, 'adj_list']
        names_counts_df = names_counts_df[~names_counts_df.name.isin(adj)] 
        deleted += adj
        names_counts_df.loc[r.name, 'num'] = sum_df.loc[r.name, 'num']
        for adj_name in sum_df.loc[r.name, 'adj_list']:
            names_counts_df.loc[r.name, variants].append(adj_name)

In [137]:
def join_two_vars(var1, var2):
    if isinstance(var1, list) and isinstance(var2, list):
        return list(set(var1+var2))
    elif isinstance(var1, list):
        return list(set(var1))
    elif isinstance(var2, list):
        return list(set(var2))
    else:
        return None

In [138]:
names_counts_df['vars3'] = names_counts_df.apply(lambda row: join_two_vars(row.vars1, row.vars2), axis=1)
names_counts_df['all_vars_joined'] = names_counts_df.apply(lambda row: two_vars(row.join_two_vars, row.vars3), axis=1)

In [79]:
names_counts_df.columns

Index(['name', 'num', 'stem_name', 'all_vars_str', 'id', 'vars1', 'vars2',
       'all_vars', 'vars3', 'all_vars_joined'],
      dtype='object')

In [149]:
names_counts_df['all_vars'] = names_counts_df['all_vars_joined']
names_counts_df['all_vars_str'] = names_counts_df.all_vars.str.join('§')

In [202]:
names_counts_df.columns

Index(['name', 'num', 'stem_name', 'all_vars_str', 'id', 'all_vars', 'vars1',
       'vars2', 'vars3', 'all_vars_joined'],
      dtype='object')

In [203]:
names_counts_df[['name', 'num', 'stem_name', 'all_vars_str', 'id']].to_csv(names_dict_path)

In [None]:
s = pd.Series(index=names_counts_df.index)
s = s.apply(lambda x: [] if x!=x else [])

names_counts_df['all_vars'] = names_counts_df['vars1'].fillna(s) + names_counts_df['vars2'].fillna(s)

names_counts_df['all_vars'] = names_counts_df.all_vars_str.str.split('§')

names_counts_df['all_vars'] = names_counts_df['all_vars'].apply(lambda x: list(set(x)) if isinstance(x,list) else None)

In [197]:
flat_list = [item for sublist in list(names_counts_df[names_counts_df.all_vars.notna()].all_vars.values) for item in sublist]
len(flat_list)

13867

In [199]:
names_counts_df[names_counts_df.index.isin(flat_list)]

Unnamed: 0_level_0,name,num,stem_name,all_vars_str,id,all_vars,vars1,vars2,vars3,all_vars_joined
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [56]:
# names_counts_df = names_counts_df[~names_counts_df.index.isin(flat_list)]

In [201]:
names_counts_df = clean_up(names_counts_df)

#### Joining dicts

In [5]:
def check_presence(name):
    nf = re.escape(name)
    pat = rf'§{nf}§|^{nf}§|§{nf}$|^{nf}$'
    res = part2[part2.all_vars_str.str.contains(pat)]
    return len(res)

# q = winter_names_counts_df.stem_name.apply(check_presence)

In [200]:
def clean_up(joined_df):
    joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
    joined_df['all_vars'] = joined_df.all_vars_str.str.split('§')
    joined_df['all_vars'] = joined_df['all_vars'].apply(lambda x: list(set(x)) if isinstance(x,list) else None)
    joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
    return joined_df

#### Fix names in dicts

In [210]:
def fix_name(joined_df, to_del, to_expand, re_flag):
    if not re_flag:
        try:
            to_del_part = joined_df.loc[to_del]
        except KeyError:
            print(to_del, 'was already fixed')
            return joined_df
        try:
            joined_df.loc[to_expand, 'all_vars'].append(to_del)
            joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
        except AttributeError:
            joined_df.loc[to_expand, 'all_vars_str'] = to_del
            joined_df['all_vars'] = joined_df.all_vars_str.str.split('§')
            
        joined_df.loc[to_expand, 'num'] += to_del_part.num
        if isinstance(to_del_part.all_vars, list):
            for n in to_del_part.all_vars:
                joined_df.loc[to_expand, 'all_vars'].append(n)
            joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
        joined_df = joined_df[joined_df.index!=to_del]
                
    else:
        to_del_part = joined_df[(joined_df.index!=to_expand)&joined_df.index.str.contains(to_del)].copy()
        for i,r in to_del_part.iterrows():
            joined_df.loc[to_expand, 'all_vars'].append(i)
            joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
            joined_df.loc[to_expand, 'num'] += r.num
            if isinstance(r.all_vars, list):
                for n in r.all_vars:
                    joined_df.loc[to_expand, 'all_vars'].append(n)
                joined_df['all_vars_str'] = joined_df.all_vars.str.join('§')
        joined_df = joined_df[~joined_df.index.isin(to_del_part.index.values)]
        del to_del_part
    
    return joined_df

In [33]:
names_counts_df.loc['Королев Єлизавет']

num                          3.0
stem_name       Королев Єлизавет
all_vars_str                 NaN
id                       18527.0
all_vars                     NaN
Name: Королев Єлизавет, dtype: object

In [47]:
fix_dict = json.load(open('fix_dict.json'))

In [269]:
names_counts_df = fix_name(names_counts_df, ': Шмигал', 'Біньямін Нетаньях', False)

In [48]:
for k,v in fix_dict.items():
    print(names_counts_df.shape)
    names_counts_df = fix_name(names_counts_df, k, v, False)
    print(names_counts_df.shape)

(114920, 5)
Серг Комиссаренк was already fixed
(114920, 5)
(114920, 5)
Вооз Тедрос Гебреєсус was already fixed
(114920, 5)
(114920, 5)
Адан Гебрейесус was already fixed
(114920, 5)
(114920, 5)
Тедрос Адхан Гебреєсус was already fixed
(114920, 5)
(114920, 5)
Питер Бен was already fixed
(114920, 5)
(114920, 5)
Майкл Раян was already fixed
(114920, 5)
(114920, 5)
Маск Степан was already fixed
(114920, 5)
(114920, 5)
Адхан Гебреесус was already fixed
(114920, 5)
(114920, 5)
Серг Бесараб was already fixed
(114920, 5)
(114920, 5)
Владислав Молчан was already fixed
(114920, 5)
(114920, 5)
Професорк Більченк was already fixed
(114920, 5)
(114920, 5)
Драгоман Більченк was already fixed
(114920, 5)
(114920, 5)
звільнят Більченк was already fixed
(114920, 5)
(114920, 5)
Ощасливлен Більченк was already fixed
(114920, 5)
(114920, 5)
* Більченк was already fixed
(114920, 5)
(114920, 5)
Більченк Дробот was already fixed
(114920, 5)
(114920, 5)
Заяв Більченк was already fixed
(114920, 5)
(114920, 5)
В

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


(114917, 5)
(114917, 5)
(114916, 5)
(114916, 5)
Сбу Віктор Ягун was already fixed
(114916, 5)
(114916, 5)
(114915, 5)
(114915, 5)
(114914, 5)
(114914, 5)
(114913, 5)
(114913, 5)
Эдгар Ринкевич was already fixed
(114913, 5)
(114913, 5)
(114912, 5)
(114912, 5)
(114911, 5)
(114911, 5)
(114910, 5)
(114910, 5)
(114909, 5)
(114909, 5)
(114908, 5)
(114908, 5)
(114907, 5)
(114907, 5)
Микол Чаус was already fixed
(114907, 5)
(114907, 5)
Никола Чаус was already fixed
(114907, 5)
(114907, 5)
(114906, 5)
(114906, 5)
(114905, 5)
(114905, 5)
(114904, 5)
(114904, 5)
(114903, 5)
(114903, 5)
(114902, 5)
(114902, 5)
(114901, 5)
(114901, 5)
(114900, 5)
(114900, 5)
(114899, 5)
(114899, 5)
(114898, 5)
(114898, 5)
(114897, 5)
(114897, 5)
(114896, 5)
(114896, 5)
(114895, 5)
(114895, 5)
(114894, 5)
(114894, 5)
(114893, 5)
(114893, 5)
(114892, 5)
(114892, 5)
(114891, 5)
(114891, 5)
(114890, 5)
(114890, 5)
(114889, 5)
(114889, 5)
(114888, 5)
(114888, 5)
(114887, 5)
(114887, 5)
(114886, 5)
(114886, 5)
(114885, 5

In [270]:
names_counts_df = clean_up(names_counts_df)
names_counts_df[['num', 'stem_name', 'all_vars_str', 'id']].to_csv(names_dict_path)

In [None]:
# to_del = 'Максим Бужанск'
# to_expand = 'Макс Бужанськ'
# to_del_part = joined_df.loc[to_del]
# joined_df.loc[to_expand, 'all_vars'].append(to_del)
# joined_df.loc[to_expand, 'num'] += to_del_part.num
# if isinstance(to_del_part.all_vars, list):
#     for n in to_del_part.all_vars:
#         joined_df.loc[to_expand, 'all_vars'].append(n)
# joined_df = joined_df[joined_df.index!=to_del]