# Convert fore names and last names to simplified forms (one word for fore names)

In [1]:
import collections
import string
import pandas

import pubmedpy.names
from pubmedpy.names import simplify_fore_name, simplify_last_name

In [2]:
# Author tables whose name columns are pooled by the cells below.
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]
# With keep_default_na=False only the empty string is parsed as missing, so
# literal values such as "NA" or "Null" remain strings rather than NaN.
path_to_df = {}
for path in name_df_paths:
    name_df = pandas.read_csv(
        path,
        sep='\t',
        keep_default_na=False,
        na_values=[''],
    )
    path_to_df[path] = name_df

## Full name table

Create a table of individuals with full names, for use by prediction methods that require a full name as input.

In [3]:
# Stack the fore/last name pairs of every source table, keeping each distinct pair once.
full_name_df = (
    pandas.concat(
        [source_df.loc[:, ["fore_name", "last_name"]] for source_df in path_to_df.values()]
    )
    .drop_duplicates()
)
full_name_df.head(2)

Unnamed: 0,fore_name,last_name
0,A H,Samad
1,W W,Cai


In [4]:
def clean_full_name(name: str) -> str:
    """
    Return `name` with every period turned into a space and each run of
    whitespace collapsed to a single space (leading and trailing
    whitespace is removed).
    """
    # str.split() without a separator trims both ends and splits on any run of
    # whitespace, so re-joining the pieces normalizes the spacing.
    tokens = name.replace(".", " ").split()
    return " ".join(tokens)

In [5]:
full_name_df = (
    full_name_df
    # Lower-cased simplified forms of each name part.
    .assign(
        fore_name_simple=lambda df: df.fore_name.map(
            lambda name: simplify_fore_name(name, lower=True)),
        last_name_simple=lambda df: df.last_name.map(
            lambda name: simplify_last_name(name, lower=True)),
    )
    # Keep only rows where both simplified names are present.
    .dropna(subset=['fore_name_simple', 'last_name_simple'])
    # Full name = stripped fore name + space + stripped last name, lightly cleaned.
    .assign(
        full_name=lambda df: (
            df.fore_name.str.strip() + " " + df.last_name.str.strip()
        ).map(clean_full_name),
    )
)
full_name_df.sample(n=10, random_state=0)

Unnamed: 0,fore_name,last_name,fore_name_simple,last_name_simple,full_name
179414,Gareth,Morgan,gareth,morgan,Gareth Morgan
168750,Michael,Cackovic,michael,cackovic,Michael Cackovic
441432,Georgia,Skreti,georgia,skreti,Georgia Skreti
882707,Ara A,Vaporciyan,ara,vaporciyan,Ara A Vaporciyan
457022,Oh Sung,Kwon,oh,kwon,Oh Sung Kwon
1008415,Felix,Feng,felix,feng,Felix Feng
1057795,Anush,Mukeria,anush,mukeria,Anush Mukeria
331074,Klaas M,Pos,klaas,pos,Klaas M Pos
572639,Mark,Sistrom,mark,sistrom,Mark Sistrom
904657,Deepika,Paliwal,deepika,paliwal,Deepika Paliwal


In [6]:
# Tab-separated export without the index column; with pandas' default
# compression='infer' the .xz suffix selects xz compression.
full_name_df.to_csv(
    path_or_buf='data/names/full-names.tsv.xz',
    sep='\t',
    index=False,
)

## Fore and last name tables

In [7]:
def simplify_names(dfs, name_column, simplifier=None):
    """
    Count how often each distinct name occurs across `dfs` and attach its
    simplified form.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that each contain a `name_column` column.
    name_column : str
        Column to summarize, e.g. 'fore_name' or 'last_name'.
    simplifier : callable, optional
        Called as `simplifier(name, lower=True)` for every distinct name.
        When omitted, `pubmedpy.names.simplify_{name_column}` is looked up
        and used (raising AttributeError if no such function exists).

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing name with the columns
        `name_column`, 'n_authors' (number of occurrences) and
        `{name_column}_simple`, sorted by `name_column`. The index keeps
        each name's position in the descending-count ordering.
    """
    names = pandas.concat([df[name_column] for df in dfs])
    # value_counts(dropna=True) leaves out missing names. The index and the
    # counts are named explicitly so the result does not depend on the labels
    # reset_index() would otherwise pick, which differ between pandas 1.x and 2.x.
    simple_name_df = (
        names.value_counts(dropna=True)
        .rename_axis(name_column)
        .reset_index(name='n_authors')
    )
    assert simple_name_df[name_column].isna().sum() == 0
    if simplifier is None:
        # Resolve e.g. pubmedpy.names.simplify_fore_name from the column name.
        simplifier = getattr(pubmedpy.names, f'simplify_{name_column}')
    simple_name_df[f'{name_column}_simple'] = simple_name_df[name_column].map(
        lambda name: simplifier(name, lower=True))
    return simple_name_df.sort_values(name_column)

In [8]:
# Distinct fore names across all source tables, with counts and simplified forms.
simple_fore_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='fore_name',
)
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
92029,(Max) Zong-Ming,1,max
79539,", Yuxin",1,yuxin
106440,-,1,
3,A,3861,
91289,A Gordon,1,gordon
...,...,...,...
78727,Živojin,1,živojin
123656,Žygimantė,1,žygimantė
108348,željka,1,željka
146414,Νicolaos,1,νicolaos


In [9]:
# Distinct last names across all source tables, with counts and simplified forms.
simple_last_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='last_name',
)
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
99238,'t Hart,1,t hart
5560,'t Hoen,26,t hoen
117960,(Holly) Yang,1,holly) yang
165754,-L Zhu,1,l zhu
6854,A,21,a
...,...,...,...
168500,žurauskienė,1,žurauskienė
97748,ʼt Hoen,1,ʼt hoen
115175,Ӧkmen,1,ӧkmen
78783,‘t Hoen,2,‘t hoen


In [10]:
# Reproducible random sample of fore names, most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,fore_name,n_authors,fore_name_simple
1040,Edward J,150,edward
21477,Taisen,7,taisen
23489,Guangtang,6,guangtang
23929,Sanjoy,6,sanjoy
25310,Nga,6,nga
67018,Robert Sidney,2,robert
60949,Taekjip,2,taekjip
95647,Wanda M,1,wanda
112238,Man-ling,1,man-ling
88478,Hinrick W. H.,1,hinrick


In [11]:
# Reproducible random sample of last names, most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,last_name,n_authors,last_name_simple
1292,Ansari,81,ansari
5864,Sahasrabuddhe,25,sahasrabuddhe
6309,Hofestädt,23,hofestädt
8858,Keil,17,keil
20187,Guner,8,guner
30121,Fritzsch,6,fritzsch
35026,LoVerde,5,loverde
51108,Paterlini,3,paterlini
55578,Veo,3,veo
92180,Wambua,2,wambua


In [12]:
# Most common last names: total the author counts of every spelling that
# shares a simplified form, then show the three largest totals.
(
    simple_last_name_df
    .groupby(by='last_name_simple')['n_authors']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:3]
)

Unnamed: 0,last_name_simple,n_authors
0,wang,20367
1,zhang,17814
2,li,17472


In [13]:
# Write both lookup tables as tab-separated files without the index column.
simple_fore_name_df.to_csv(
    path_or_buf='data/names/fore-names.tsv.xz',
    sep='\t',
    index=False,
)
simple_last_name_df.to_csv(
    path_or_buf='data/names/last-names.tsv.xz',
    sep='\t',
    index=False,
)