# Convert fore and last names to simplified names (one-word fore names)

In [1]:
import collections
import string
import pandas

import pubmedpy.names
from pubmedpy.names import simplify_fore_name, simplify_last_name

In [2]:
# Tab-separated name tables that feed every table built in this notebook.
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]
# Map each path to its table. Only empty fields are read as missing values,
# so literal strings such as "NA" stay as text instead of becoming NaN.
path_to_df = {
    path: pandas.read_csv(path, sep='\t', keep_default_na=False, na_values=[''])
    for path in name_df_paths
}

## full name table

Create a table of individuals with full names, for prediction methods that require full names.

In [3]:
# Stack the fore/last name pairs from every source table, then keep one row
# per distinct pair.
name_columns = ["fore_name", "last_name"]
combined_names = pandas.concat(df[name_columns] for df in path_to_df.values())
full_name_df = combined_names.drop_duplicates()
full_name_df.head(2)

Unnamed: 0,fore_name,last_name
0,B A,Eckman
1,J S,Aaronson


In [4]:
def clean_full_name(name: str) -> str:
    """
    Return `name` with every period turned into a space and all runs of
    whitespace collapsed to single spaces (leading/trailing whitespace removed).
    """
    without_periods = name.replace(".", " ")
    # split() with no arguments drops empty pieces, so joining on a single
    # space normalizes the whitespace in one step.
    return " ".join(without_periods.split())

In [5]:
# Add lowercased simplified fore/last names, drop rows where either simplified
# name is missing, then build a lightly cleaned "fore last" full name.
full_name_df = (
    full_name_df
    .assign(
        fore_name_simple=lambda df: df["fore_name"].map(
            lambda name: simplify_fore_name(name, lower=True)),
        last_name_simple=lambda df: df["last_name"].map(
            lambda name: simplify_last_name(name, lower=True)),
    )
    .dropna(subset=['fore_name_simple', "last_name_simple"])
    .assign(
        full_name=lambda df: (
            df["fore_name"].str.strip() + " " + df["last_name"].str.strip()
        ).map(clean_full_name),
    )
)
full_name_df.sample(n=10, random_state=0)

Unnamed: 0,fore_name,last_name,fore_name_simple,last_name_simple,full_name
9294,Knut,Liestøl,knut,liestøl,Knut Liestøl
14214,Jeffery,Tang,jeffery,tang,Jeffery Tang
119112,Yun,Huang,yun,huang,Yun Huang
31747,Filippo,Utro,filippo,utro,Filippo Utro
99504,Ethan,Bahl,ethan,bahl,Ethan Bahl
95819,Sören,Vogel,sören,vogel,Sören Vogel
115263,Monica,Agrawal,monica,agrawal,Monica Agrawal
115921,Pujan,Pokhrel,pujan,pokhrel,Pujan Pokhrel
36266,Bernd,Wollenweber,bernd,wollenweber,Bernd Wollenweber
70123,Eric,Janssen,eric,janssen,Eric Janssen


In [6]:
# Export the full-name table as TSV without the index; pandas infers xz
# compression from the ".xz" suffix.
full_name_df.to_csv(
    'data/names/full-names.tsv.xz',
    sep='\t',
    index=False,
)

## fore and last name tables

In [7]:
def simplify_names(dfs, name_column):
    """
    Tabulate the distinct values of `name_column` across `dfs`.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that each contain a `name_column` column.
    name_column : str
        Column to tabulate. A function named `simplify_{name_column}` must
        exist in `pubmedpy.names`.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing name, sorted by `name_column`, with
        columns `name_column`, `n_authors` (number of occurrences across
        `dfs`), and `{name_column}_simple` (the simplifier's output with
        `lower=True`).
    """
    all_names = pandas.concat(df[name_column] for df in dfs)
    # Missing names are excluded from the counts.
    counts = all_names.value_counts(dropna=True)
    simple_name_df = counts.reset_index()
    simple_name_df.columns = [name_column, 'n_authors']
    assert not simple_name_df[name_column].isna().any()
    # Resolve the matching simplifier by name, e.g. simplify_fore_name.
    simplifier = getattr(pubmedpy.names, f'simplify_{name_column}')

    def _simplify_lower(name):
        return simplifier(name, lower=True)

    simple_name_df[f'{name_column}_simple'] = simple_name_df[name_column].map(_simplify_lower)
    return simple_name_df.sort_values(name_column)

In [8]:
# Distinct fore names across all source tables, with counts and simplified forms.
simple_fore_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='fore_name',
)
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
30482,(Max) Zong-Ming,1,max
35334,", Yuxin",1,yuxin
24,A,373,
2169,A A,17,
19483,A Aldo,2,aldo
...,...,...,...
18475,Špela,2,špela
24605,Žiga,2,žiga
38201,Živa,1,živa
20668,Živadin,2,živadin


In [9]:
# Distinct last names across all source tables, with counts and simplified forms.
simple_last_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='last_name',
)
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
34566,Rozowsky,1,rozowsky
38864,Slater,1,slater
35554,'t Hart,1,t hart
2602,'t Hoen,14,t hoen
39214,(Holly) Yang,1,holly) yang
...,...,...,...
24147,Žárský,2,žárský
38704,železný,1,železný
34750,žurauskienė,1,žurauskienė
26528,‘t Hoen,2,‘t hoen


In [10]:
# Reproducible random sample of fore names, most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,fore_name,n_authors,fore_name_simple
449,Giulia,64,giulia
1596,Akinori,23,akinori
2658,Shinya,14,shinya
6184,Mayya,6,mayya
9001,Yongbing,4,yongbing
8960,Zelmina,4,zelmina
10200,Hajer,4,hajer
10958,Cheng-Hsun,4,cheng-hsun
13347,Anu G.,3,anu
21460,Keyuan,2,keyuan


In [11]:
# Reproducible random sample of last names, most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values(by='n_authors', ascending=False)
)

Unnamed: 0,last_name,n_authors,last_name_simple
102,Cohen,127,cohen
1377,Heckerman,21,heckerman
1590,Woo,19,woo
4502,Marín,9,marín
8622,Bähler,5,bähler
9868,Hemenway,4,hemenway
12464,Renou,4,renou
9944,Vervier,4,vervier
10138,Slawski,4,slawski
11235,Sakamoto,4,sakamoto


In [12]:
# Most common last names: total the counts of all spellings that share a
# simplified last name, then show the three largest totals.
authors_per_simple_last_name = (
    simple_last_name_df.groupby('last_name_simple')['n_authors'].sum()
)
authors_per_simple_last_name.sort_values(ascending=False).reset_index().head(3)

Unnamed: 0,last_name_simple,n_authors
0,wang,3232
1,li,2716
2,zhang,2652


In [13]:
# Export both name tables as xz-compressed TSV files without the index.
for name_table, output_path in [
    (simple_fore_name_df, 'data/names/fore-names.tsv.xz'),
    (simple_last_name_df, 'data/names/last-names.tsv.xz'),
]:
    name_table.to_csv(output_path, sep='\t', index=False)