# Convert fore and last names to simplified names (one-word fore names)

In [1]:
import collections
import string
import pandas

import pubmedpy.names
from pubmedpy.names import simplify_fore_name, simplify_last_name

In [2]:
# Tab-separated name tables to read, keyed later by their path.
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]
path_to_df = dict()
for path in name_df_paths:
    # keep_default_na=False together with na_values=[''] makes the empty
    # string the only value parsed as missing (presumably so that literal
    # names such as "NA" or "Null" survive as text -- confirm intent).
    name_df = pandas.read_csv(
        path,
        sep='\t',
        na_values=[''],
        keep_default_na=False,
    )
    path_to_df[path] = name_df

## Full name table

Create a table of individuals with full names, for use by prediction methods that require full names.

In [3]:
# Stack the fore/last name columns of every loaded table and keep one row
# per distinct (fore_name, last_name) pair.
full_name_df = pandas.concat(
    objs=[
        name_table.loc[:, ["fore_name", "last_name"]]
        for name_table in path_to_df.values()
    ],
).drop_duplicates()
full_name_df.head(2)

Unnamed: 0,fore_name,last_name
0,A H,Samad
1,W W,Cai


In [4]:
def clean_full_name(name: str) -> str:
    """
    Return `name` with every period turned into a space and each run of
    whitespace collapsed to a single space; leading and trailing
    whitespace is removed as a consequence.
    """
    # str.split() with no arguments splits on any whitespace run and
    # discards empty pieces, so re-joining normalizes the spacing.
    tokens = name.replace(".", " ").split()
    return " ".join(tokens)

In [5]:
# Build the final full-name table in a single chain:
#   1. add lower-cased simplified fore and last names,
#   2. drop rows where either simplified name is missing,
#   3. add `full_name`: stripped fore name + " " + stripped last name,
#      passed through clean_full_name.
# Using .assign (which works on a copy) instead of item assignment avoids
# mutating the previously displayed frame in place and avoids assigning a
# column onto the result of dropna.
full_name_df = (
    full_name_df
    .assign(
        fore_name_simple=lambda df: df.fore_name.map(
            lambda x: simplify_fore_name(x, lower=True)),
        last_name_simple=lambda df: df.last_name.map(
            lambda x: simplify_last_name(x, lower=True)),
    )
    .dropna(subset=['fore_name_simple', "last_name_simple"])
    .assign(
        full_name=lambda df: (
            df.fore_name.str.strip() + " " + df.last_name.str.strip()
        ).map(clean_full_name),
    )
)
# Fixed seed so the displayed sample is reproducible.
full_name_df.sample(n=10, random_state=0)

Unnamed: 0,fore_name,last_name,fore_name_simple,last_name_simple,full_name
1042181,Azemi,Barama,azemi,barama,Azemi Barama
1064826,Youyong,Wang,youyong,wang,Youyong Wang
114476,Helgard I,Nirenberg,helgard,nirenberg,Helgard I Nirenberg
44710,Yong-Hua,Gan,yong-hua,gan,Yong-Hua Gan
612265,Denise P,Barlow,denise,barlow,Denise P Barlow
1068732,Sylvain,Martineau,sylvain,martineau,Sylvain Martineau
531754,Won-Hee,Song,won-hee,song,Won-Hee Song
372138,Christiane,Rondeau,christiane,rondeau,Christiane Rondeau
421503,Jessica,Tyler,jessica,tyler,Jessica Tyler
478038,Vytas,Svedas,vytas,svedas,Vytas Svedas


In [6]:
# Write the full-name table as tab-separated text without the index column.
full_name_df.to_csv(
    path_or_buf='data/names/full-names.tsv.xz',
    sep='\t',
    index=False,
)

## Fore and last name tables

In [7]:
def simplify_names(dfs, name_column):
    """
    Tabulate the distinct values of `name_column` across `dfs` together with
    their frequency and simplified form.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Each frame must contain a column called `name_column`.
    name_column : str
        Column to tabulate. It also selects the simplifier: the function
        `simplify_{name_column}` is looked up on `pubmedpy.names`.

    Returns
    -------
    pandas.DataFrame
        Columns `name_column`, `n_authors` (number of rows carrying that
        value; missing values are not counted) and `{name_column}_simple`
        (the simplifier's output with `lower=True`), sorted by `name_column`.
    """
    counts = (
        pandas.concat(df[name_column] for df in dfs)
        .value_counts(dropna=True)
        # Name the index and the counts explicitly rather than relabelling
        # the columns after reset_index.
        .rename_axis(name_column)
        .reset_index(name='n_authors')
    )
    # Sanity check: value_counts(dropna=True) must not leave a missing name.
    assert not counts[name_column].isna().any()
    simplifier = getattr(pubmedpy.names, f'simplify_{name_column}')

    def _simplify_lower(name):
        return simplifier(name, lower=True)

    counts[f'{name_column}_simple'] = counts[name_column].map(_simplify_lower)
    return counts.sort_values(name_column)

In [8]:
# Frequency table of fore names across all loaded tables, with simplified forms.
simple_fore_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='fore_name',
)
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
136030,(Max) Zong-Ming,1,max
102701,", Yuxin",1,yuxin
107469,-,1,
3,A,3787,
138774,A Gordon,1,gordon
...,...,...,...
80979,Živojin,1,živojin
141327,Žygimantė,1,žygimantė
145797,željka,1,željka
129082,Νicolaos,1,νicolaos


In [9]:
# Frequency table of last names across all loaded tables, with simplified forms.
simple_last_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='last_name',
)
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
96300,'t Hart,1,t hart
5389,'t Hoen,26,t hoen
170911,(Holly) Yang,1,holly) yang
167472,-L Zhu,1,l zhu
9870,A,16,a
...,...,...,...
161845,žurauskienė,1,žurauskienė
137908,ʼt Hoen,1,ʼt hoen
172686,Ӧkmen,1,ӧkmen
68537,‘t Hoen,2,‘t hoen


In [10]:
# Reproducible random sample of 20 fore names, most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,fore_name,n_authors,fore_name_simple
14602,Kuo-Bin,11,kuo-bin
15588,Mirosław,10,mirosław
21273,Collin M,7,collin
28687,Lee-Wei,5,lee-wei
36287,Ronak H,4,ronak
33168,Baoqiang,4,baoqiang
33974,Avichai,4,avichai
45063,Christian Clement,3,christian
42722,Sárka,3,sárka
41268,Ke-Yi,3,ke-yi


In [11]:
# Reproducible random sample of 20 last names, most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,last_name,n_authors,last_name_simple
2200,Leite,54,leite
10294,Wieland,15,wieland
36171,Delmont,5,delmont
35443,Rönnblom,5,rönnblom
35906,Saia-Cereda,5,saia-cereda
53155,Goichon,3,goichon
49440,van Peij,3,van peij
79396,Antonovsky,2,antonovsky
81386,Tala,2,tala
79182,Lecube,2,lecube


In [12]:
# Most common last names:
# total n_authors per simplified last name, top three by that total.
(
    simple_last_name_df
    .groupby('last_name_simple')['n_authors']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .reset_index()
)

Unnamed: 0,last_name_simple,n_authors
0,wang,20137
1,zhang,17491
2,li,17246


In [13]:
# Write both name tables as tab-separated text without the index column.
simple_fore_name_df.to_csv(
    path_or_buf='data/names/fore-names.tsv.xz',
    sep='\t',
    index=False,
)
simple_last_name_df.to_csv(
    path_or_buf='data/names/last-names.tsv.xz',
    sep='\t',
    index=False,
)