In [1]:
import json
import pandas as pd
import time
import gcsfs
import numpy as np
import gender_guesser.detector as gender
from datetime import date
from google.cloud import storage
from google.cloud import bigquery

from pathlib import Path
import os

In [2]:
# Locate the service-account key relative to the parent of the current working
# directory and expose it to the Google client libraries via the environment.
PROJ_ROOT = Path('.').resolve().parent
KEYS_DIR = PROJ_ROOT.joinpath('keys')
keys = KEYS_DIR.joinpath('Keys for Big Query Storage Admin - PEII.json')

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.fspath(keys)

In [None]:
# SQL text only; nothing is executed in this cell.
# The query selects rows of `bigquery-public-data.usa_names.usa_1910_current`
# with gender = 'F', totals the `number` column per name (aliased `records`)
# and orders the names by that total, largest first.
f_names_query = """
#standardSQL
SELECT name, SUM(number) as records
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = 'F'
GROUP BY name
ORDER BY records DESC
"""

In [4]:
#read in CSV from storage_client

#The interim snapshot date is pinned (debug override). The previous
#`today = date.today()` assignment was dead code because it was overwritten
#immediately; to read the current day's snapshot use str(date.today()) here.
today = '2019-11-06'

uri = 'gs://impact-index-shared-resources/data/interim/{}/interim_entrepreneur_data.csv'.format(today)

#read in CSV from storage_client
entrepreneur_df = pd.read_csv(uri)
print('entrepreneur data acquired', entrepreneur_df.head())

#creating the first name column: everything before the first space of 'name'.
#Vectorised instead of a row-by-row .at loop; a missing name becomes '' so the
#later per-row string handling does not crash on a float NaN.
entrepreneur_df['f_name'] = (
    entrepreneur_df['name']
    .fillna('')
    .astype(str)
    .str.split(' ')
    .str[0]
)

#querying only null values (rows whose 'gender' is still missing)
null = entrepreneur_df[pd.isnull(entrepreneur_df['gender'])]

#instantiate bigquery client
client = bigquery.Client()

#One SQL template for both genders: unique names of the given gender with the
#record number aggregated per name, most frequent first.
NAMES_QUERY_TEMPLATE = """
#standardSQL
SELECT name, SUM(number) as records
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = '{gender}'
GROUP BY name
ORDER BY records DESC
"""


def fetch_name_counts(bq_client, query):
    """Run `query` in the 'US' location and return the result as a DataFrame.

    Parameters
    ----------
    bq_client : bigquery.Client
        Client used to submit the query.
    query : str
        SQL text to execute.

    Returns
    -------
    pandas.DataFrame
        Whatever columns the query selects (here: `name`, `records`).
    """
    # The previously created QueryJobConfig was never passed to client.query,
    # so the job always ran with default settings; that is kept here.
    return bq_client.query(query, location='US').result().to_dataframe()


#gets only unique names that are female, and aggregates the record number for each name
f_names_query = NAMES_QUERY_TEMPLATE.format(gender='F')

#creates dataframe of female names
f_names_df = fetch_name_counts(client, f_names_query)
#takes only the names and turns it into a lowercase list
f_names = [name.lower() for name in f_names_df['name']]

#creates a separate list for M names because it is faster to use SQL than iterate through df
m_names_query = NAMES_QUERY_TEMPLATE.format(gender='M')

#creates dataframe of male names
m_names_df = fetch_name_counts(client, m_names_query)

#takes only the male names and turns it into a lowercase list
m_names = [name.lower() for name in m_names_df['name']]

#find androgynous names: lowercase names recorded under both genders
andro_names = set(m_names).intersection(f_names)

#single-gender names. Both differences are taken against andro_names.
#Previously m_names was computed from the already-reduced f_names, which removed
#nothing, so every shared name stayed in m_names and was later labelled male.
f_names = set(f_names) - andro_names
m_names = set(m_names) - andro_names

#make everything into lists
andro_names = list(andro_names)
f_names = list(f_names)
m_names = list(m_names)

m_names_df = m_names_df.set_index('name')
f_names_df = f_names_df.set_index('name')

#record totals keyed by lowercase name, so the lookup matches the lowercase
#lists above even if a stored name is not in str.capitalize() form
m_records = m_names_df['records'].groupby(m_names_df.index.str.lower()).sum()
f_records = f_names_df['records'].groupby(f_names_df.index.str.lower()).sum()

#a shared name is assigned to one gender when that gender holds at least this
#share (rounded to 2 decimals) of the name's combined records
ANDRO_SHARE_THRESHOLD = 0.75

count_m = 0
count_f = 0
count_na = 0
femme = []
masc = []

#sort androgynous names
for name in andro_names:
    male_count = m_records[name]
    female_count = f_records[name]
    total = male_count + female_count
    percentage_male = round(male_count / total, 2)
    percentage_female = round(female_count / total, 2)
    if percentage_male >= ANDRO_SHARE_THRESHOLD:
        count_m += 1
        masc.append(name)
    elif percentage_female >= ANDRO_SHARE_THRESHOLD:
        count_f += 1
        femme.append(name)
    else:
        #neither gender dominates: the name stays ambiguous
        count_na += 1

#combine lists
f_names = f_names + femme
m_names = m_names + masc

#unidentified names: shared names that were not assigned to either gender
andro_ = set(andro_names) - set(masc) - set(femme)

#assigning gender from social security data
f_count = 0
m_count = 0
a_count = 0
i_count = 0

#set copies give constant-time membership tests; scanning the original lists
#for every row made this loop O(rows x names)
f_lookup = set(f_names)
m_lookup = set(m_names)
andro_lookup = set(andro_)

#only rows whose gender was missing on load are labelled here; labels are
#written to entrepreneur_df under the same index labels
for i in null.index:
    raw_name = null.at[i, 'f_name']
    #a non-string first name (e.g. NaN) cannot match any list -> 'unknown'
    name = raw_name.lower() if isinstance(raw_name, str) else ''

    #precedence is unchanged: female list, then male list, then ambiguous
    if name in f_lookup:
        entrepreneur_df.at[i, 'gender'] = 'f'
        f_count += 1

    elif name in m_lookup:
        entrepreneur_df.at[i, 'gender'] = 'm'
        m_count += 1

    elif name in andro_lookup:
        entrepreneur_df.at[i, 'gender'] = 'andro'
        a_count += 1

    else:
        entrepreneur_df.at[i, 'gender'] = 'unknown'
        i_count += 1

#using the gender guesser
d = gender.Detector()

# Create additional checks for gender: names consulted only when the
# detector answers 'unknown'
additional_female_names = [
    'Alika',
    'Ama',
    'Cibelle',
    'Kimberlina',
    'Leathia',
    'Shiri',
    'Lavena',
    'Tanjila',
    'Holley',
    'Anie',
    'Dionna',
    'Shanel',
    'Lakshya',
    'Shenda',
    'Madelena',
    'Kerranna',
    'Piya',
    'Paria',
    'Ylianna',
    'Ankita',
    'Isha',
    'Gabby',
    'Anjelika',
    'Nitha',
    'Adena',
    'Sumayah',
    'Louisea',
    'Cymphonique',
    'Charlyn',
    'Tari',
    'Graceann',
    'Kalia',
    'Annmarie',
    'Saira',
]
additional_male_names = [
    'Deward',
    'Omkar',
    'Rohit',
    'Mohit',
    'Vik',
    'Vishal',
    'Obed',
    'Ashwin',
    'Ozel',
    'Umed',
    'Demetri',
    'Tripp',
    'Jean-Marc',
    'Laszlo',
    'Allon',
    # a missing comma here used to fuse the next two literals into
    # 'RishiFrancois', so 'Francois' was never matched
    'Rishi',
    'Francois',
    'Clarkson',
    'Siddharth',
    'Sidharth',
    'Anup',
    'Anoop',
    'Zac',
    'Andras',
    'Abhishek',
    'Dil-Domine',
    'Varun',
    'Abhinav',
    'Chaitanya',
    'Raghu',
    'Anurag',
    'Akshay',
    'Gaurav',
    'Kunal',
]

# Create column 'female' (1 = guessed female, 0 = anything else)
entrepreneur_df['female'] = 0

# Collect [index, name] pairs of female, androgynous and unresolved names,
# and mark the female ones in the 'female' column
female_names = []
andy_names = []
unk_names = []

count_female = 0
count_andy = 0
count_unk = 0

for i in entrepreneur_df.index:
    name = entrepreneur_df.at[i, 'f_name']
    # one detector lookup per row instead of one per branch
    guess = d.get_gender(name)

    if guess in ('female', 'mostly_female'):
        is_female = True
    elif guess == 'unknown':
        # fall back to the hand-maintained lists
        is_female = name in additional_female_names
    else:
        is_female = False

    if is_female:
        female_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_female += 1
    elif guess == 'andy':
        andy_names.append([i, name])
        count_andy += 1
    elif guess == 'unknown' and name not in additional_male_names:
        # neither the detector nor either manual list recognises the name
        unk_names.append([i, name])
        count_unk += 1
    # every other case (male guesses, manually listed male names) keeps female = 0

#over-ride the gender_guesser with the social security data: whether or not the
#'female' flag agrees, an 'f' label becomes 'Female' and an 'm' label becomes
#'Male' (the four per-row checks used before all reduced to this mapping)
ssa_female = entrepreneur_df['gender'] == 'f'
ssa_male = entrepreneur_df['gender'] == 'm'
entrepreneur_df.loc[ssa_female, 'gender'] = 'Female'
entrepreneur_df.loc[ssa_male, 'gender'] = 'Male'

#extrapolating to the unknowns (which include androgenous)
#.get(..., 0) avoids a KeyError when a label does not occur at all
gender_counts = entrepreneur_df['gender'].value_counts()
male_num = gender_counts.get('Male', 0)
female_num = gender_counts.get('Female', 0)
unknown_num = gender_counts.get('unknown', 0)

known_num = female_num + male_num
if known_num == 0:
    #without any labelled row there is no ratio to extrapolate from
    raise ValueError('no rows labelled Male or Female; cannot extrapolate gender')
percent_fem = female_num / known_num

#pool of rows to extrapolate; 'andro' rows are included as the comment above
#states, otherwise they would reach the output with their interim label
unk_df = entrepreneur_df[entrepreneur_df['gender'].isin(['unknown', 'andro'])]
unk_names = list(unk_df.index)

#fixed seed so the random split, and therefore the written CSV, is reproducible
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# Unknown names to fem: draw, without replacement, the share of the pool that
# matches the observed female ratio (the count is truncated towards zero)
n_unk_to_fem = int(np.round(percent_fem * len(unk_names), 4))
if unk_names:
    unk_to_fem = rng.choice(unk_names, n_unk_to_fem, replace=False)
else:
    #nothing to sample from; choice() on an empty pool can raise
    unk_to_fem = []

for i in unk_to_fem:
    entrepreneur_df.at[i, 'gender'] = 'Female'

#every pooled row that was not drawn becomes male
unk_to_masc = set(unk_names) - set(unk_to_fem)

for i in unk_to_masc:
    entrepreneur_df.at[i, 'gender'] = 'Male'

#calculating ratios
print(entrepreneur_df['gender'].value_counts())

entrepreneur_df.to_csv(
    'gs://impact-index-shared-resources/data/processed/entrepreneur_gender_df.csv',
    index=False
)


entrepreneur data acquired               name                     company_name  \
0      Scott Brown                        ColdSpark   
1    Robin Horwitz             Convo Communications   
2     Jiren Parikh                          SnapOne   
3  Matthew Slipper  Symphony Communication Services   
4         Ric Zhou                        Kika Tech   

                    crunchbase_uuid  crunchbase_permalink    funding  \
0  226a627bc92415995985cbd94743276b             coldspark    6500000   
1  7d0675cefac592615e1cbb6c29fd403a  convo-communications     500000   
2  ed5f8110a213395ececfe94660c0f602           snapone-inc          0   
3  5f89826c5031a1932f27525b505b0a7f            symphony-3  461000000   
4  437121710de6c1d5e8f8ebe555749fa8             kika-tech   63000000   

   jobs_created  patents    ipo  city_and_state  \
0        1000.0        0  False  Broomfield, CO   
1         100.0        0  False      Austin, TX   
2         100.0        1  False   Princeton, NJ   
3    