# Convert fore names to simplified (one word) fore names, and simplify last names

In [1]:
import collections
import string
import pandas

import pubmedpy.names
from pubmedpy.names import simplify_fore_name, simplify_last_name

In [2]:
# Tab-separated input tables that supply the names processed below.
name_df_paths = [
    'data/pubmed/authors.tsv.xz',
    'data/pmc/authors.tsv.xz',
    'data/iscb/keynotes.tsv',
]
# With keep_default_na=False and na_values=[''], only empty fields are read
# as missing; pandas' default NA markers (e.g. "NA") are kept as text.
read_options = dict(sep='\t', keep_default_na=False, na_values=[''])
path_to_df = {}
for path in name_df_paths:
    path_to_df[path] = name_df = pandas.read_csv(path, **read_options)

## full name table

Create a table of individuals with full names, for prediction methods that require full names.

In [3]:
# Stack the fore/last name columns of every input table, then keep one row
# per distinct (fore_name, last_name) pair.
name_columns = ["fore_name", "last_name"]
full_name_df = pandas.concat([df[name_columns] for df in path_to_df.values()])
full_name_df = full_name_df.drop_duplicates()
full_name_df.head(2)

Unnamed: 0,fore_name,last_name
0,B A,Eckman
1,J S,Aaronson


In [4]:
def clean_full_name(name: str) -> str:
    """
    Lightly normalize a full name.

    Every period is turned into a space, after which runs of whitespace are
    collapsed to a single space and leading/trailing whitespace is removed.
    """
    tokens = name.replace(".", " ").split()
    return " ".join(tokens)

In [5]:
# Add the simplified fore/last names, drop rows where either simplification
# is missing, then build the cleaned "fore last" full name for what remains.
full_name_df = (
    full_name_df
    .assign(
        fore_name_simple=lambda df: df.fore_name.map(
            lambda x: simplify_fore_name(x, lower=True)),
        last_name_simple=lambda df: df.last_name.map(
            lambda x: simplify_last_name(x, lower=True)),
    )
    .dropna(subset=['fore_name_simple', "last_name_simple"])
    .assign(
        full_name=lambda df: (
            df.fore_name.str.strip() + " " + df.last_name.str.strip()
        ).map(clean_full_name),
    )
)
full_name_df.sample(n=10, random_state=0)

Unnamed: 0,fore_name,last_name,fore_name_simple,last_name_simple,full_name
81757,Liuchao,Sun,liuchao,sun,Liuchao Sun
54669,Annette F,Jones,annette,jones,Annette F Jones
23047,John E,Major,john,major,John E Major
21663,Danielle S.,Bassett,danielle,bassett,Danielle S Bassett
87188,Matthew G,Bakker,matthew,bakker,Matthew G Bakker
122973,Robin T,Varghese,robin,varghese,Robin T Varghese
60107,Laura C,Lazzeroni,laura,lazzeroni,Laura C Lazzeroni
81000,Daniel F A R,Dourado,daniel,dourado,Daniel F A R Dourado
23944,Duncan P,Brown,duncan,brown,Duncan P Brown
29898,Guillaume,Drin,guillaume,drin,Guillaume Drin


In [6]:
# Write the full-name table as TSV without the index column; pandas infers
# xz compression from the ".xz" suffix.
full_name_df.to_csv(
    'data/names/full-names.tsv.xz',
    index=False,
    sep='\t',
)

## fore and last name tables

In [7]:
def simplify_names(dfs, name_column):
    """
    Tabulate every distinct value of `name_column` with its simplified form.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that each contain a `name_column` column.
    name_column : str
        Column to process. A function named ``simplify_{name_column}`` is
        looked up on `pubmedpy.names` and used as the simplifier.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing name, sorted by `name_column`, with
        columns `name_column`, `n_authors` (how many rows across `dfs` carry
        that exact name) and ``{name_column}_simple`` (the simplifier's
        output, called with ``lower=True``).
    """
    stacked = pandas.concat(df[name_column] for df in dfs)
    # value_counts(dropna=True) excludes missing names from the tally.
    counts = stacked.value_counts(dropna=True)
    simple_name_df = counts.reset_index()
    simple_name_df.columns = [name_column, 'n_authors']
    n_missing = simple_name_df[name_column].isna().sum()
    assert n_missing == 0
    simplifier = getattr(pubmedpy.names, f'simplify_{name_column}')
    simple_column = f'{name_column}_simple'
    simple_name_df[simple_column] = simple_name_df[name_column].map(
        lambda name: simplifier(name, lower=True)
    )
    return simple_name_df.sort_values(name_column)

In [8]:
# Distinct fore names across all input tables, with counts and simplified form.
simple_fore_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='fore_name',
)
simple_fore_name_df

Unnamed: 0,fore_name,n_authors,fore_name_simple
29979,(Max) Zong-Ming,1,max
37693,", Yuxin",1,yuxin
24,A,373,
2258,A A,17,
15871,A Aldo,2,aldo
...,...,...,...
21124,Špela,2,špela
15292,Žiga,2,žiga
34469,Živa,1,živa
24172,Živadin,2,živadin


In [None]:
# Distinct last names across all input tables, with counts and simplified form.
simple_last_name_df = simplify_names(
    dfs=path_to_df.values(),
    name_column='last_name',
)
simple_last_name_df

Unnamed: 0,last_name,n_authors,last_name_simple
33296,'t Hart,1,t hart
2494,'t Hoen,14,t hoen
36570,(Holly) Yang,1,holly) yang
35699,A,1,a
34999,A Araújo,1,a araújo
...,...,...,...
28461,Žárský,2,žárský
36181,železný,1,železný
39488,žurauskienė,1,žurauskienė
24386,‘t Hoen,2,‘t hoen


In [None]:
# Reproducible random sample of fore names, most frequent first.
(
    simple_fore_name_df
    .sample(n=20, random_state=0)
    .sort_values('n_authors', ascending=False)
)

In [None]:
# Reproducible random sample of last names, most frequent first.
(
    simple_last_name_df
    .sample(n=20, random_state=0)
    .sort_values('n_authors', ascending=False)
)

In [None]:
# Most common last names: total n_authors per simplified last name,
# largest totals first, top three shown.
last_name_totals = simple_last_name_df.groupby('last_name_simple').n_authors.sum()
last_name_totals.sort_values(ascending=False).reset_index().head(3)

In [None]:
# Write both lookup tables as TSV without the index column; pandas infers
# xz compression from the ".xz" suffix.
simple_fore_name_df.to_csv(
    'data/names/fore-names.tsv.xz',
    index=False,
    sep='\t',
)
simple_last_name_df.to_csv(
    'data/names/last-names.tsv.xz',
    index=False,
    sep='\t',
)