# Gender predictions for names

In [1]:
import datetime
import pathlib
import pandas
import ratelimit
import requests
import jsonlines

In [2]:
# Load the fore-name table (tab-separated; the displayed columns are
# "Unnamed: 0", "name" and "count").
# keep_default_na=False stops pandas from converting names that happen to
# spell one of its default missing-value markers (e.g. "nan", "null") into a
# float NaN, which would otherwise leak into the API queries as a non-string.
# NOTE(review): this assumes the "count" column has no empty cells -- confirm.
name_df = pandas.read_csv(
    'data/simplified-fore-names.tsv',
    sep='\t',
    keep_default_na=False,
)
name_df.head(2)

Unnamed: 0,name,count
0,a-sol,2
1,aabid,2


In [3]:
# @backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
@ratelimit.limits(calls=90, period=86_400)
def request_genderize_io(names, timeout=30):
    """
    Query the genderize.io API for one batch of names.

    The ``ratelimit.limits`` decorator caps this function at 90 calls per
    86,400 seconds (one day); calls beyond that budget are rejected by the
    decorator instead of reaching the API.

    Parameters
    ----------
    names : iterable of str
        Between 1 and 10 names. ``None`` entries (e.g. padding added when
        chunking) are ignored.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    The decoded JSON response (presumably a list with one dict per name --
    verify against the API), where every item has an added ``'query_date'``
    key holding the current UTC date in ISO format.

    Raises
    ------
    ValueError
        If the batch does not contain between 1 and 10 names.
    requests.exceptions.RequestException
        On timeouts, connection problems, or a non-success HTTP status.
    """
    # Drop padding explicitly so the name[i] indexes stay contiguous and we do
    # not depend on requests silently skipping None-valued parameters.
    names = [name for name in names if name is not None]
    if not 1 <= len(names) <= 10:
        raise ValueError(
            f"expected between 1 and 10 names per request, got {len(names)}"
        )
    url = "https://api.genderize.io"
    params = {
        f"name[{i}]": name
        for i, name in enumerate(names)
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    results = response.json()
    # Compute the date once so every result of this batch carries the same
    # value; timezone-aware replacement for the deprecated utcnow().
    query_date = datetime.datetime.now(datetime.timezone.utc).date().isoformat()
    for result in results:
        result['query_date'] = query_date
    return results

In [4]:
def grouper(iterable, n, fillvalue=None):
    """
    Collect data into fixed-length chunks of size n.

    Adapted from the itertools recipes:
    https://docs.python.org/3/library/itertools.html

    The last chunk is padded with ``fillvalue`` (default ``None``) when the
    input length is not a multiple of n.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    grouper('ABCDEFG', 3) --> ABC DEF G,None,None

    Returns an iterator of n-tuples.
    """
    from itertools import zip_longest
    # n references to the SAME iterator: each output tuple pulls n consecutive
    # items from the underlying iterable.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def genderize_names(names):
    """
    Lazily yield genderize.io results for every name in ``names``.

    The names are sent in batches of up to 10 via ``request_genderize_io``;
    results are yielded one by one as each batch returns, so a consumer can
    persist them incrementally. Any exception raised by a request propagates
    to the consumer and ends the iteration.
    """
    for subset in grouper(names, n=10):
        # grouper pads the final chunk with None; strip the padding so only
        # real names are sent to the API.
        batch = [name for name in subset if name is not None]
        yield from request_genderize_io(batch)

In [5]:
path = pathlib.Path('data/genderize.jsonl')
# Collect the names already stored in the results file, if it exists.
# The reader is opened in a `with` block so the file handle is closed once
# the names have been read (previously it was left open).
if path.exists():
    with jsonlines.open(path) as reader:
        existing_names = {row['name'] for row in reader}
else:
    existing_names = set()
# Most frequent names first, so they are queried first.
all_names = name_df.sort_values('count', ascending=False).name
# Keep only the names that have not been queried yet.
new_names = all_names[~all_names.isin(existing_names)]
print(f"{len(all_names):,} total names: {len(existing_names):,} already queried, {len(new_names):,} new")

18,578 total names: 925 already queried, 17,654 new


In [6]:
# Append each result to the JSON Lines file as soon as it is produced, so
# everything fetched before an error (e.g. the rate limit being hit) is kept.
with jsonlines.open(path, mode='a') as writer:
    writer.write_all(genderize_names(new_names))

RateLimitException: too many calls