# Convert fore and last names to simplified forms (one-word fore names)

In [1]:
import collections
import string
import pandas

import pubmedpy.names
from pubmedpy.names import simplify_fore_name, simplify_last_name

In [2]:
# Tab-separated name tables whose fore_name / last_name columns are pooled below.
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]
# keep_default_na=False with na_values=[''] means only empty fields are read
# as missing, so literal names such as "NA" or "Null" stay as strings.
path_to_df = {
    path: pandas.read_csv(path, sep='\t', keep_default_na=False, na_values=[''])
    for path in name_df_paths
}

## full name table

Create a table of individuals' full names, for prediction methods that require a full name as input.

In [3]:
# Pool the (fore_name, last_name) columns from every loaded table and keep
# each distinct pair once; the original row labels are retained.
full_name_df = pandas.concat(
    objs=[df.loc[:, ["fore_name", "last_name"]] for df in path_to_df.values()],
).drop_duplicates()
full_name_df.head(n=2)

Unnamed: 0,fore_name,last_name
0,A H,Samad
1,W W,Cai


In [4]:
def clean_full_name(name: str) -> str:
    """
    Return a lightly normalized full name: every period becomes a space,
    then runs of whitespace collapse to single spaces and leading/trailing
    whitespace is removed.
    """
    without_periods = name.replace(".", " ")
    tokens = without_periods.split()
    return " ".join(tokens)

In [5]:
# Add simplified fore and last names (requested with lower=True), drop rows
# where either simplified name is missing, then build a cleaned "fore last"
# full name from the stripped original columns.
full_name_df = (
    full_name_df
    .assign(
        fore_name_simple=lambda df: df["fore_name"].map(
            lambda name: simplify_fore_name(name, lower=True)),
        last_name_simple=lambda df: df["last_name"].map(
            lambda name: simplify_last_name(name, lower=True)),
    )
    .dropna(subset=["fore_name_simple", "last_name_simple"])
    .assign(
        full_name=lambda df: (
            df["fore_name"].str.strip() + " " + df["last_name"].str.strip()
        ).map(clean_full_name),
    )
)
full_name_df.sample(n=10, random_state=0)

Unnamed: 0,fore_name,last_name,fore_name_simple,last_name_simple,full_name
189136,Meng-Hua,Li,meng-hua,li,Meng-Hua Li
66524,Lee,Smith,lee,smith,Lee Smith
566731,Charles D,Searles,charles,searles,Charles D Searles
489139,Jessica L,Linville,jessica,linville,Jessica L Linville
321497,Edgar P,Spalding,edgar,spalding,Edgar P Spalding
345069,Jinhee,Kim,jinhee,kim,Jinhee Kim
653507,Le,Gu,le,gu,Le Gu
815282,Peter,Nemecek,peter,nemecek,Peter Nemecek
445990,Stephan,Bessler,stephan,bessler,Stephan Bessler
221313,Farhan A,Pasha,farhan,pasha,Farhan A Pasha


In [6]:
# Write the full-name table as a tab-separated file without the row index.
full_name_df.to_csv(
    path_or_buf='data/names/full-names.tsv.xz',
    sep='\t',
    index=False,
)

## fore and last name tables

In [7]:
def simplify_names(dfs, name_column):
    """
    Count and simplify the distinct values of one name column.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that each contain `name_column`.
    name_column : str
        Column to pool; `pubmedpy.names` must provide a matching
        `simplify_<name_column>` function.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing name with columns `name_column`,
        `n_authors` (number of occurrences across all tables) and
        `<name_column>_simple`, sorted by `name_column`.
    """
    pooled = pandas.concat(df[name_column] for df in dfs)
    counts = pooled.value_counts(dropna=True)
    simple_name_df = counts.rename_axis(name_column).reset_index(name='n_authors')
    # value_counts(dropna=True) should already have excluded missing names
    assert not simple_name_df[name_column].isna().any()
    simplify = getattr(pubmedpy.names, f'simplify_{name_column}')
    simple_name_df[f'{name_column}_simple'] = simple_name_df[name_column].map(
        lambda raw_name: simplify(raw_name, lower=True)
    )
    return simple_name_df.sort_values(name_column)

In [8]:
# Distinct fore names across all tables, with occurrence counts and simplified forms.
simple_fore_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='fore_name',
)
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
111085,(Max) Zong-Ming,1,max
143321,", Yuxin",1,yuxin
116180,-,1,
3,A,3763,
102623,A Gordon,1,gordon
...,...,...,...
88845,Živojin,1,živojin
142252,Žygimantė,1,žygimantė
114339,željka,1,željka
120511,Νicolaos,1,νicolaos


In [9]:
# Distinct last names across all tables, with occurrence counts and simplified forms.
simple_last_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='last_name',
)
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
179008,'t Hart,1,t hart
5535,'t Hoen,26,t hoen
109483,(Holly) Yang,1,holly) yang
127094,-L Zhu,1,l zhu
9733,A,16,a
...,...,...,...
163400,žurauskienė,1,žurauskienė
103522,ʼt Hoen,1,ʼt hoen
126830,Ӧkmen,1,ӧkmen
66497,‘t Hoen,2,‘t hoen


In [10]:
# Spot-check a reproducible random sample of fore names, most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,fore_name,n_authors,fore_name_simple
11142,Juan-Carlos,14,juan-carlos
28907,Djamel,5,djamel
33562,Hui-Qing,4,hui-qing
36266,Diana C J,4,diana
33596,Furqan,4,furqan
35324,Shih-Hwa,4,shih-hwa
53115,Terence M,2,terence
58404,Hongxian,2,hongxian
69937,Sharadha,2,sharadha
119261,Oswaldo Keith,1,oswaldo


In [11]:
# Spot-check a reproducible random sample of last names, most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,last_name,n_authors,last_name_simple
9159,Regoes,17,regoes
14934,Lucena,11,lucena
18338,Ishino,9,ishino
21592,Jylhä,8,jylhä
24086,Gulcher,7,gulcher
32574,Stenbroen,5,stenbroen
45206,Dharmawardhana,4,dharmawardhana
39632,Er,4,er
51683,Baguelin,3,baguelin
51302,Moezelaar,3,moezelaar


In [12]:
# Most common last names: author counts summed per simplified last name
# (so spelling variants that simplify identically are pooled), top three shown.
(
    simple_last_name_df
    .groupby('last_name_simple')['n_authors']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .reset_index()
)

Unnamed: 0,last_name_simple,n_authors
0,wang,19753
1,zhang,17130
2,li,16889


In [13]:
# Write both name tables as tab-separated files without the row index.
for frame, out_path in [
    (simple_fore_name_df, 'data/names/fore-names.tsv.xz'),
    (simple_last_name_df, 'data/names/last-names.tsv.xz'),
]:
    frame.to_csv(out_path, sep='\t', index=False)