# Convert fore and last names to simplified names (fore names reduced to one word)

In [1]:
import collections
import string
import pandas

import pubmedpy.names

In [2]:
# Tables whose name columns are pooled below (paths are relative to the working directory).
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]

# Load every table into a DataFrame keyed by its path.
# keep_default_na=False with na_values=[''] means only empty fields become NaN,
# so name-like strings such as "NA" or "None" are kept as text.
path_to_df = {
    path: pandas.read_csv(path, sep='\t', keep_default_na=False, na_values=[''])
    for path in name_df_paths
}

In [3]:
def simplify_names(dfs, name_column):
    """
    Tabulate every distinct name found in `name_column` across `dfs`,
    alongside its simplified form.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that each contain a `name_column` column.
    name_column : str
        Column to pool. It also selects the simplifier, which is looked up
        as `pubmedpy.names.simplify_<name_column>`.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing name with columns `name_column`,
        `n_authors` (number of rows carrying that name) and
        `<name_column>_simple`. Rows are sorted by `name_column`; the index
        is not reset, so it still reflects the frequency-ordered position
        produced by `value_counts`.

    Raises
    ------
    AttributeError
        If `pubmedpy.names` has no `simplify_<name_column>` function.
    """
    # Stack the requested column from every table into one long Series.
    pooled = pandas.concat([frame[name_column] for frame in dfs])

    # Count occurrences per distinct name; missing values are excluded.
    counts = pooled.value_counts(dropna=True)
    table = counts.reset_index()
    # Overwrite the labels positionally: the names reset_index() assigns
    # differ between pandas versions.
    table.columns = [name_column, 'n_authors']
    assert not table[name_column].isna().any()

    simplify = getattr(pubmedpy.names, f'simplify_{name_column}')

    def to_simple(name):
        # NOTE(review): lower=True presumably lowercases the simplified
        # name — confirm against pubmedpy.names.
        return simplify(name, lower=True)

    simple_column = f'{name_column}_simple'
    table[simple_column] = table[name_column].map(to_simple)
    return table.sort_values(name_column)

In [4]:
# Pool fore names from every loaded table and pair each distinct name with its simplified form.
simple_fore_name_df = simplify_names(dfs=path_to_df.values(), name_column='fore_name')
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
36084,(Max) Zong-Ming,1,max
27531,", Yuxin",1,yuxin
24,A,374,
2230,A A,17,
18318,A Aldo,2,aldo
...,...,...,...
20397,Špela,2,špela
17183,Žiga,2,žiga
33545,Živa,1,živa
22748,Živadin,2,živadin


In [5]:
# Pool last names from every loaded table and pair each distinct name with its simplified form.
simple_last_name_df = simplify_names(dfs=path_to_df.values(), name_column='last_name')
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
33605,Michor,1,michor
34856,Rozowsky,1,rozowsky
36364,Slater,1,slater
33519,'t Hart,1,t hart
2366,'t Hoen,14,t hoen
...,...,...,...
23084,Žárský,2,žárský
36693,železný,1,železný
37756,žurauskienė,1,žurauskienė
18414,‘t Hoen,2,‘t hoen


In [6]:
# Reproducible spot check: 20 randomly drawn fore names (fixed seed), most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,fore_name,n_authors,fore_name_simple
3435,Marine,11,marine
3990,Shihab,10,shihab
7036,Guy N,6,guy
11226,Moliang,4,moliang
11590,Denisa,3,denisa
21742,Laxmikanth,2,laxmikanth
22300,Cátia,2,cátia
24808,Amy C,2,amy
15885,Reiko Matsuda,2,reiko
20073,Bo Kyung,2,bo


In [7]:
# Reproducible spot check: 20 randomly drawn last names (fixed seed), most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,last_name,n_authors,last_name_simple
1174,Lou,23,lou
1768,van Hijum,18,van hijum
4228,Zarringhalam,9,zarringhalam
8644,Löwer,5,löwer
10763,Parkkinen,4,parkkinen
10195,Harkin,4,harkin
14223,Pascal,3,pascal
15440,Colsch,3,colsch
13723,Krupp,3,krupp
19656,Hagenaars,2,hagenaars


In [8]:
# Most common last names: spelling variants that share a simplified form are
# merged by summing their author counts, then the three largest totals are shown.
(
    simple_last_name_df
    .groupby('last_name_simple')['n_authors']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .head(3)
)

Unnamed: 0,last_name_simple,n_authors
0,wang,3240
1,li,2715
2,zhang,2655


In [9]:
# Write both lookup tables as tab-separated files without the index column.
# NOTE(review): each table is written under two file names with identical
# content — presumably one pair is kept for backward compatibility; confirm
# before dropping either.
for table, out_path in [
    (simple_fore_name_df, 'data/names/fore-names.tsv.xz'),
    (simple_last_name_df, 'data/names/last-names.tsv.xz'),
    (simple_fore_name_df, 'data/names/simple-fore-names.tsv.xz'),
    (simple_last_name_df, 'data/names/simple-last-names.tsv.xz'),
]:
    table.to_csv(out_path, sep='\t', index=False)