# Name2gender dataset compilation

In [1]:
import csv
import string
import unicodedata
import codecs

In [2]:
# cleaned name -> gender label; filled in by the cells below
name_gender = {}

# helper cleaning function (lowercase, unicode->ascii)
def clean(s):
    """Return `s` reduced to lowercase ASCII letters.

    The text is NFD-normalized first, so an accented letter splits into its
    base letter plus combining marks. Combining marks (category 'Mn') and
    every character that is not an ASCII letter are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in string.ascii_letters:
            continue
        kept.append(ch)
    return ''.join(kept).lower()

## Add NLTK names corpus
We build the base dataset from NLTK's male and female name corpora.

In [3]:
import nltk
from nltk.corpus import names

# make sure the names corpus is available locally
nltk.download("names")

# read both word lists and tag each entry with the list it came from
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = [(name, 'male') for name in male_names]
labeled_names.extend((name, 'female') for name in female_names)

[nltk_data] Downloading package names to /u1/elbrown/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [6]:
# Seed the dictionary with the cleaned NLTK names. Pairs are applied in list
# order, so a name present under both labels keeps the label that comes last.
name_gender.update(
    (clean(raw_name), label) for raw_name, label in labeled_names
)
len(name_gender)

7553

## Add @mbejda datasets
https://gist.github.com/mbejda
* Black-Female-Names.csv     (~2,400 black (African American) female names)
* Black-Male-Names.csv       (~50,000 black (African American) male names)
* White-Female-Names.csv     (~4,500 white (Caucasian) female names)
* White-Male-Names.csv       (~40,000 white (Caucasian) male names)
* Hispanic-Female-Names.csv  (~200 hispanic female names)
* Hispanic-Male-Names.csv    (~4,000 hispanic male names)

## Add corpus from @mfran

In [11]:
# Open as a text file with newline='' so the csv module does its own line
# handling (what its documentation asks for) instead of receiving lines
# pre-split by the codecs stream reader.
with open('mfran_names.csv', 'r', encoding='utf8', newline='') as mfran_file:
    reader = csv.reader(mfran_file, delimiter=',')
    for row in reader:
        # a blank line is returned as an empty row; row[-1] would raise on it
        if not row:
            continue
        # The last field is the gender. Joining every earlier field into the
        # name tolerates the few messed up records with more than two fields.
        gender = clean(row[-1])
        name = clean(''.join(row[:-1]))
        # names already present (from earlier sources) keep their label
        if name not in name_gender:
            name_gender[name] = gender
len(name_gender)

128239

## Add BlackRock user database

In [13]:
# Open as a text file with newline='' so the csv module does its own line
# handling (what its documentation asks for) instead of receiving lines
# pre-split by the codecs stream reader.
with open('blk_names.csv', 'r', encoding='utf8', newline='') as blkfile:
    reader = csv.reader(blkfile)
    for row in reader:
        # a blank line is returned as an empty row; row[0] would raise on it
        if not row:
            continue
        # first field is the name, last field is the gender code
        name = clean(row[0])
        # only a code that cleans to 'f' maps to "female"; every other value,
        # including an empty one, is labelled "male"
        gender = "female" if clean(row[-1]) == 'f' else "male"
        # names already present (from earlier sources) keep their label
        if name not in name_gender:
            name_gender[name] = gender
len(name_gender)

130551

In [None]:
# Remove the empty-string entry (created when a name cleans down to nothing).
# The None default keeps this cell from raising KeyError when there is no such
# entry, e.g. when the cell is run a second time.
name_gender.pop('', None)

## Write dataset to CSV

In [None]:
# newline='' leaves the line endings written by the csv module untouched
with open('namedata.csv', 'w', newline='') as out:
    writer = csv.writer(out, delimiter=',')
    # one "name,gender" row per dictionary entry
    writer.writerows(name_gender.items())
# show the number of rows written rather than echoing every row into the
# notebook output
len(name_gender)