# Gender predictions for names

In [1]:
import datetime
import pathlib
import pandas
import ratelimit
import requests
import jsonlines
import backoff

In [2]:
# Load the fore-name table. pandas' default NaN markers are switched off
# (keep_default_na=False) because with the default setting read_csv
# interprets some names as NaN; only empty fields are treated as missing.
fore_name_df = pandas.read_csv(
    'data/names/fore-names.tsv.xz',
    sep='\t',
    keep_default_na=False,
    na_values=[''],
)
# Sanity check: no fore name may have been parsed as a missing value.
missing_fore_names = fore_name_df[fore_name_df.fore_name.isna()]
assert missing_fore_names.empty
# Sum n_authors per fore_name_simple value, largest totals first.
authors_per_name = fore_name_df.groupby('fore_name_simple').n_authors.sum()
fore_name_df = authors_per_name.sort_values(ascending=False).reset_index()
fore_name_df.head(2)

Unnamed: 0,fore_name_simple,n_authors
0,david,13068
1,michael,12259


In [3]:
@backoff.on_exception(backoff.expo, Exception)
@ratelimit.limits(calls=90*5, period=86_400)
def request_genderize_io(names, service='genderize', timeout=30):
    """
    Query https://api.{service}.io for a batch of names.

    service='nationalize' is also available

    The ratelimit decorator caps this function at 90*5 calls per
    86,400-second (one day) period, and the outer backoff decorator retries
    with exponentially growing waits whenever any Exception is raised.
    NOTE(review): because the retry covers every Exception, a failed
    assertion on `names` would also be retried rather than surfaced --
    confirm this is intended.

    Parameters
    ----------
    names : sized iterable of 1 to 10 names, sent as name[0], name[1], ...
    service : subdomain of the API to query.
    timeout : seconds to wait for the server before requests raises; without
        a timeout a stalled connection would block forever and never reach
        the retry logic.

    Returns
    -------
    The decoded JSON response, with a 'query_date' key (ISO formatted UTC
    date) added to every result.

    Raises
    ------
    requests.HTTPError (via raise_for_status) on a 4xx/5xx response, subject
    to the retry behavior described above.
    """
    assert 1 <= len(names) <= 10
    url = f"https://api.{service}.io"
    params = {f"name[{i}]": name for i, name in enumerate(names)}
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    results = response.json()
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # computed once so every result in the batch carries the same date.
    query_date = datetime.datetime.now(datetime.timezone.utc).date().isoformat()
    for result in results:
        result['query_date'] = query_date
    return results

In [4]:
def grouper(iterable, n, fillvalue=None):
    """
    Collect data into fixed-length chunks or blocks.

    Adapted from the itertools recipes:
    https://docs.python.org/3/library/itertools.html

    Returns an iterator of n-tuples. When the number of items is not a
    multiple of n, the final tuple is padded with `fillvalue`.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    grouper('ABCDEFG', 3) --> ABC DEF G(None)(None)
    """
    from itertools import zip_longest
    # n references to the SAME iterator, so each output tuple consumes
    # n consecutive items from the input.
    chunk_sources = [iter(iterable)] * n
    return zip_longest(*chunk_sources, fillvalue=fillvalue)

def genderize_names(names, service='genderize'):
    """
    Yield the API results for `names`, querying them in batches of up to 10.

    `names` is split into chunks of 10 with grouper, each chunk is sent
    through request_genderize_io, and the results of every batch are
    yielded in order. `service` is passed through unchanged.
    """
    for subset in grouper(names, n=10):
        # grouper pads the final chunk with None; strip the padding so the
        # request function receives (and validates) only real names.
        batch = [name for name in subset if name is not None]
        yield from request_genderize_io(batch, service=service)

## Genderize fore names

In [5]:
path_genderize = pathlib.Path('data/gender/genderize.jsonl')
# Collect the names already stored in the JSON Lines file (none if the file
# does not exist yet). The reader is used as a context manager so the file
# handle is closed as soon as the names have been read.
existing_names = set()
if path_genderize.exists():
    with jsonlines.open(path_genderize) as reader:
        existing_names = {row['name'] for row in reader}
# Names ordered by descending author count; only those not yet stored in the
# file are kept as new_names.
all_names = fore_name_df.sort_values('n_authors', ascending=False).fore_name_simple
new_names = all_names[~all_names.isin(existing_names)]
print(f"{len(all_names):,} total names: {len(existing_names):,} already queried, {len(new_names):,} new")

76,583 total names: 22,929 already queried, 54,107 new


In [None]:
# Open the JSON Lines file in append mode so existing lines are kept, and
# write each result as it is yielded by genderize_names.
with jsonlines.open(path_genderize, mode='a') as writer:
    for record in genderize_names(new_names, service='genderize'):
        writer.write(record)

In [None]:
def add_male_probability(result):
    """
    Set result['probability_male'] on a genderize.io result and return it.

    The dict is modified in place. When result['gender'] is falsy the new
    field is None. Otherwise it is result['probability'] for a 'male'
    prediction and 1 - result['probability'] for any other gender value.
    """
    gender = result['gender']
    if not gender:
        # No gender prediction available, so no probability either.
        result['probability_male'] = None
        return result
    probability_male = result['probability']
    if gender != 'male':
        # The reported probability refers to the predicted (non-male) gender.
        probability_male = 1 - probability_male
    result['probability_male'] = probability_male
    return result

In [None]:
# Load every stored result and add the probability_male field; the context
# manager closes the file once all rows have been read.
with jsonlines.open(path_genderize) as reader:
    records = [add_male_probability(row) for row in reader]
gender_df = pandas.DataFrame(records)
gender_df = gender_df.rename(columns={'name': 'fore_name_simple', 'count': 'genderize_sample_size'})
# gender and probability are superseded by probability_male.
gender_df = gender_df.drop(columns=['gender', 'probability'])
# Keep a single row per name: after sorting by query_date, keep='last'
# retains the most recent query. Deduplicating on the name alone is what
# makes the date ordering matter; including query_date in the subset would
# leave one row per (name, date) and duplicate names in the merge below.
gender_df = (
    gender_df
    .sort_values(['fore_name_simple', 'query_date'])
    .drop_duplicates(subset=['fore_name_simple'], keep='last')
)
# Attach the author counts from fore_name_df (merge on the shared columns).
gender_df = fore_name_df.merge(gender_df)
gender_df.head(2)

In [None]:
# Save gender_df as a tab-separated file, omitting the DataFrame index.
gender_df.to_csv('data/gender/genderize.tsv', sep='\t', index=False)

## Nationalize fore names

The following cells are commented out because `nationalize.io` returns only the top three nationalities for each name, regardless of how many nationalities apply, so it is not useful at the moment.

In [None]:
# path_nationalize = pathlib.Path('data/gender/nationalize.jsonl')
# lines = jsonlines.open(path_nationalize) if path_nationalize.exists() else []
# existing_names = {row['name'] for row in lines}
# new_names = all_names[~all_names.isin(existing_names)]
# print(f"{len(all_names):,} total names: {len(existing_names):,} already queried, {len(new_names):,} new")

In [None]:
# with jsonlines.open(path_nationalize, mode='a') as writer:
#     results = genderize_names(new_names, service='nationalize')
#     for result in results:
#         writer.write(result)