# Convert fore names to simplified (one word) fore names

In [1]:
import string
import pandas

In [2]:
name_df_pubmed = pandas.read_table('data/pubmed/authors.tsv.xz', keep_default_na=False)
name_df_pmc = pandas.read_table('data/pmc/authors.tsv.xz', keep_default_na=False)
name_df_ismb = pandas.read_table('data/ismb/keynotes.tsv', keep_default_na=False)

fore_names = pandas.concat([
    name_df_pubmed.fore_name,
    name_df_pmc.fore_name,
    name_df_ismb.fore_name,
])
len(fore_names)

222712

In [3]:
fore_names.isna().value_counts()

False    222712
Name: fore_name, dtype: int64

In [4]:
def simplify_fore_name(name, lower=False):
    """
    # Convert period to space
    # Split on whitespace
    # Strip punctuation (on termini)
    # Discard <=1 letter strings
    # Discard <=3 letter strings that are ALL CAPS
    # If one string remains, return
    """
    if pandas.isna(name):
        return None
    assert isinstance(name, str)
    name_ = name.replace('.', ' ')
    words = name_.split()
    for word in words:
        word = word.strip(string.punctuation)
        if len(word) <= 1:
            continue
        if word.upper() == word and len(word) <= 3:
            continue
        if lower:
            word = word.lower()
        return word

In [5]:
fore_name_to_simple = {
    x: simplify_fore_name(x, lower=True)
    for x in sorted(fore_names.dropna().unique())
}

In [6]:
simple_names_df = fore_names.map(fore_name_to_simple).value_counts(dropna=True).reset_index()
simple_names_df.columns = ['name', 'count']
assert simple_names_df.name.isna().sum() == 0
simple_names_df.head(5)

Unnamed: 0,name,count
0,david,3006
1,michael,2777
2,john,1692
3,thomas,1612
4,daniel,1593


In [7]:
simple_names_df.sample(n=20, random_state=0).sort_values('count', ascending=False)

Unnamed: 0,name,count
230,rebecca,136
1985,joeri,17
2455,marleen,13
2685,yoo-ah,12
5782,dongliang,5
7749,isis,4
6940,sertan,4
8442,heewook,3
8074,michihiro,3
8512,shanlin,3


In [8]:
# What percents of names in top 1000 most popular
(simple_names_df['count'].cumsum() / sum(simple_names_df['count']))[1_000]

0.5984371408871524

In [9]:
simple_names_df.sort_values('name').to_csv('data/gender/simplified-fore-names.tsv', sep='\t', index=False)