# Convert fore names to simplified (one word) fore names

In [1]:
import string
import pandas

from pubmedpy.names import simplify_fore_name

In [2]:
name_df_pubmed = pandas.read_table('data/pubmed/authors.tsv.xz', keep_default_na=False)
name_df_pmc = pandas.read_table('data/pmc/authors.tsv.xz', keep_default_na=False)
name_df_ismb = pandas.read_table('data/ismb/keynotes.tsv', keep_default_na=False)

fore_names = pandas.concat([
    name_df_pubmed.fore_name,
    name_df_pmc.fore_name,
    name_df_ismb.fore_name,
])
len(fore_names)

223206

In [3]:
fore_names.isna().value_counts()

False    223206
Name: fore_name, dtype: int64

In [4]:
fore_name_to_simple = {
    x: simplify_fore_name(x, lower=True)
    for x in sorted(fore_names.dropna().unique())
}

In [5]:
simple_names_df = fore_names.map(fore_name_to_simple).value_counts(dropna=True).reset_index()
simple_names_df.columns = ['name', 'count']
assert simple_names_df.name.isna().sum() == 0
simple_names_df.head(5)

Unnamed: 0,name,count
0,david,3012
1,michael,2780
2,john,1695
3,thomas,1615
4,daniel,1595


In [6]:
simple_names_df.sample(n=20, random_state=0).sort_values('count', ascending=False)

Unnamed: 0,name,count
385,lukas,89
772,jesper,47
1145,karine,32
3024,yulan,11
3855,bahman,8
5639,jaron,5
7700,gyorgy,4
6808,benyun,4
7054,adrião,4
13224,kian,2


In [7]:
# What percents of names in top 1000 most popular
(simple_names_df['count'].cumsum() / sum(simple_names_df['count']))[1_000]

0.5983214089157952

In [8]:
simple_names_df.sort_values('name').to_csv('data/gender/simplified-fore-names.tsv', sep='\t', index=False)