# Parsing gender from names with an API

Genderize.io predicts the gender associated with a first name. The API can be used for analytics, ad targeting, user segmentation, etc. It draws on large datasets of user profiles from major social networks and exposes this data through its API. The response also includes a certainty factor (`probability`).

In [None]:
import pandas as pd

In [2]:
# %load ../parse_gender.py
import requests
import json


api_url_base = 'https://api.genderize.io/'

def get_gender(firstname, timeout=10):
    """Look up a first name on the Genderize.io API.

    Parameters
    ----------
    firstname : str
        The first name to look up.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the kernel
        indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON body when the server answers HTTP 200 (for example
        ``{'name': 'Steve', 'gender': 'male', 'probability': 1, 'count': 3965}``).
        For any other status code a message is printed and None is returned,
        so callers must check for None before using the result.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged on connection errors or when the timeout expires.
    """
    # Hand the name to requests as a query parameter instead of formatting it
    # into the URL: requests percent-encodes it, so names containing spaces,
    # '&', '#' or non-ASCII characters reach the API intact.
    response = requests.get(
        api_url_base,
        params={'name': firstname},
        timeout=timeout,
    )

    if response.status_code == 200:
        return response.json()

    print('[!] HTTP {0} looking up name [{1}]'.format(response.status_code, firstname))
    return None

In [4]:
get_gender('Steve')

{'name': 'Steve', 'gender': 'male', 'probability': 1, 'count': 3965}

In [6]:
# Load the per-repo contributor data; the relative path is resolved against
# the kernel's current working directory.
recurse = pd.read_csv('../recursecenter.csv')

In [8]:
# Replace the column labels read from the CSV with explicit names.
# The list must contain exactly one label per existing column.
recurse.columns = ['repo', 'username', 'contributions', 'avatar_url', 'profile_url', 'real_name']

In [9]:
recurse.head()

Unnamed: 0,repo,username,contributions,avatar_url,profile_url,real_name
0,hs-cli,davidbalbert,2,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert
1,webstack.jl,danielmendel,63,https://avatars3.githubusercontent.com/u/30420...,https://api.github.com/users/danielmendel,Daniel Espeset
2,webstack.jl,astrieanna,36,https://avatars3.githubusercontent.com/u/12053...,https://api.github.com/users/astrieanna,Leah Hanson
3,webstack.jl,zachallaun,19,https://avatars0.githubusercontent.com/u/50393...,https://api.github.com/users/zachallaun,Zach Allaun
4,webstack.jl,chuckha,8,https://avatars0.githubusercontent.com/u/98927...,https://api.github.com/users/chuckha,Chuck Ha


In [17]:
# First whitespace-delimited token of the first row's real_name
recurse.loc[0, 'real_name'].partition(' ')[0]

'David'

In [105]:
recurse.head(3)

Unnamed: 0,repo,username,contributions,avatar_url,profile_url,real_name
0,hs-cli,davidbalbert,2,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert
1,webstack.jl,danielmendel,63,https://avatars3.githubusercontent.com/u/30420...,https://api.github.com/users/danielmendel,Daniel Espeset
2,webstack.jl,astrieanna,36,https://avatars3.githubusercontent.com/u/12053...,https://api.github.com/users/astrieanna,Leah Hanson


In [41]:
recurse.shape

(160, 7)

In [93]:
## Split first name from real_name, look up gender

gender_list = []

# unique() keeps each distinct real_name once, so a person who contributed to
# several repos is only looked up a single time.
for name in recurse['real_name'].unique():
    # Missing values come back from pandas as NaN, which is a float and has no
    # .split(); skip anything that is not a string.
    if not isinstance(name, str):
        continue

    # split() with no argument ignores leading/repeated whitespace, so a value
    # such as ' David Albert' still yields 'David' rather than ''.
    name_parts = name.split()
    if not name_parts:
        continue  # blank or whitespace-only name: nothing to look up
    first_name = name_parts[0]

    gender_result = get_gender(first_name)
    if gender_result is None:
        # get_gender returns None (after printing a message) when the API does
        # not answer HTTP 200; skip the name instead of failing on None[...].
        continue

    gender_result['real_name'] = name  # add real name back to dictionary
    gender_list.append(gender_result)

In the loop above, we should use a set or the `unique()` method so that the same name is not looked up more than once.

In [208]:
# Record with no value for real_name
recurse['real_name'][12]

nan

In [207]:
type(recurse['real_name'][12]) == str

False

In [None]:
recurse['real_name'][12]

In [106]:
gender_list[0]

{'name': 'David',
 'gender': 'male',
 'probability': 1,
 'count': 12593,
 'real_name': 'David Albert'}

In [112]:
# Build a frame with one row per lookup result; the dictionary keys become the columns.
genders = pd.DataFrame(gender_list)

In [146]:
genders.head(5)

Unnamed: 0,count,gender,name,probability,real_name
0,12593.0,male,David,1.0,David Albert
1,8180.0,male,Daniel,1.0,Daniel Espeset
2,904.0,female,Leah,1.0,Leah Hanson
3,673.0,male,Zach,0.99,Zach Allaun
4,512.0,male,Chuck,0.99,Chuck Ha


In [137]:
recurse.head()

Unnamed: 0,repo,username,contributions,avatar_url,profile_url,real_name
0,hs-cli,davidbalbert,2,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert
1,webstack.jl,danielmendel,63,https://avatars3.githubusercontent.com/u/30420...,https://api.github.com/users/danielmendel,Daniel Espeset
2,webstack.jl,astrieanna,36,https://avatars3.githubusercontent.com/u/12053...,https://api.github.com/users/astrieanna,Leah Hanson
3,webstack.jl,zachallaun,19,https://avatars0.githubusercontent.com/u/50393...,https://api.github.com/users/zachallaun,Zach Allaun
4,webstack.jl,chuckha,8,https://avatars0.githubusercontent.com/u/98927...,https://api.github.com/users/chuckha,Chuck Ha


In [140]:
# Sanity check: print the (rows, columns) shape of the lookup results and of
# the contributor table so the two can be compared.
print(genders.shape)
print(recurse.shape)

(160, 5)
(160, 6)


In [151]:
# Attach the lookup results to every contributor row by matching on the name.
# An index-based join pairs row i of `recurse` with row i of `genders`, which
# is only correct when both frames hold the same rows in the same order;
# `genders` is built from the unique, non-null names, so it does not.
# The left key is renamed first so the result keeps the column layout of the
# earlier join: `real_name_x` from `recurse`, `real_name` from `genders`.
recurse_gendered = recurse.rename(columns={'real_name': 'real_name_x'}).merge(
    genders.drop_duplicates(subset='real_name'),  # one lookup row per name, so no row fan-out
    left_on='real_name_x',
    right_on='real_name',
    how='left',  # keep contributors that have no lookup result
)

In [159]:
recurse_gendered.head()

Unnamed: 0,repo,username,contributions,avatar_url,profile_url,real_name_x,count,gender,name,probability,real_name
0,hs-cli,davidbalbert,2,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,12593.0,male,David,1.0,David Albert
1,webstack.jl,danielmendel,63,https://avatars3.githubusercontent.com/u/30420...,https://api.github.com/users/danielmendel,Daniel Espeset,8180.0,male,Daniel,1.0,Daniel Espeset
2,webstack.jl,astrieanna,36,https://avatars3.githubusercontent.com/u/12053...,https://api.github.com/users/astrieanna,Leah Hanson,904.0,female,Leah,1.0,Leah Hanson
3,webstack.jl,zachallaun,19,https://avatars0.githubusercontent.com/u/50393...,https://api.github.com/users/zachallaun,Zach Allaun,673.0,male,Zach,0.99,Zach Allaun
4,webstack.jl,chuckha,8,https://avatars0.githubusercontent.com/u/98927...,https://api.github.com/users/chuckha,Chuck Ha,512.0,male,Chuck,0.99,Chuck Ha


In [158]:
recurse_gendered.shape

(160, 11)

### to-do:
Since the same people are likely to contribute to multiple repos, we should work out how to de-duplicate the names before looking them up via the API (maybe using `set` or `unique`).

In [177]:
# Simulate a lookup table with no duplicates, as would happen when each name
# is looked up only once. Called without arguments, drop_duplicates() removes
# only rows that repeat in every column.
g2 = genders.drop_duplicates()

In [182]:
# Inner-merge the gender columns onto the contributor rows that share a real_name
df2 = recurse.merge(
    g2[['real_name', 'gender', 'probability']],
    on='real_name',
)

In [183]:
df2.head(30)

Unnamed: 0,repo,username,contributions,avatar_url,profile_url,real_name,gender,probability
0,hs-cli,davidbalbert,2,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
1,blaggregator,davidbalbert,12,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
2,community,davidbalbert,335,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
3,proxy,davidbalbert,58,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
4,RSVPBot,davidbalbert,84,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
5,ca-tools,davidbalbert,11,https://avatars2.githubusercontent.com/u/12335...,https://api.github.com/users/davidbalbert,David Albert,male,1.0
6,webstack.jl,danielmendel,63,https://avatars3.githubusercontent.com/u/30420...,https://api.github.com/users/danielmendel,Daniel Espeset,male,1.0
7,webstack.jl,astrieanna,36,https://avatars3.githubusercontent.com/u/12053...,https://api.github.com/users/astrieanna,Leah Hanson,female,1.0
8,webstack.jl,zachallaun,19,https://avatars0.githubusercontent.com/u/50393...,https://api.github.com/users/zachallaun,Zach Allaun,male,0.99
9,blaggregator,zachallaun,1,https://avatars0.githubusercontent.com/u/50393...,https://api.github.com/users/zachallaun,Zach Allaun,male,0.99


When the name has a NaN value it looks like genderize is being passed 'nan' and guessing that the name is female.



In [None]:
## Split first name from real_name, look up gender
# NOTE: this cell repeats the lookup loop from earlier in the notebook;
# running it again repeats every API call.

gender_list = []

# unique() keeps each distinct real_name once, so a person who contributed to
# several repos is only looked up a single time.
for name in recurse['real_name'].unique():
    # Missing values come back from pandas as NaN, which is a float and has no
    # .split(); skip anything that is not a string.
    if not isinstance(name, str):
        continue

    # split() with no argument ignores leading/repeated whitespace, so a value
    # such as ' David Albert' still yields 'David' rather than ''.
    name_parts = name.split()
    if not name_parts:
        continue  # blank or whitespace-only name: nothing to look up
    first_name = name_parts[0]

    gender_result = get_gender(first_name)
    if gender_result is None:
        # get_gender returns None (after printing a message) when the API does
        # not answer HTTP 200; skip the name instead of failing on None[...].
        continue

    gender_result['real_name'] = name  # add real name back to dictionary
    gender_list.append(gender_result)