In [80]:
import json
import pandas as pd
import time
import gcsfs
import gender_guesser.detector as gender
from datetime import date

In [81]:
from google.cloud import storage 
from google.cloud import bigquery as bq
from pathlib import Path
import os

In [82]:
PROJ_ROOT = Path().resolve().parent
KEYS_DIR = PROJ_ROOT / 'keys' 
keys = KEYS_DIR / 'Keys for Big Query Storage Admin - PEII.json'

In [83]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(keys)

In [5]:
today = date.today()

#override for debugging
today = '2019-11-06'

uri = 'gs://impact-index-shared-resources/data/interim/{}/interim_entrepreneur_data.csv'.format(today)

entrepreneur_df = pd.read_csv(uri)

In [6]:
client = bq.Client()

#gets only unique names that are female, and aggregates the record number for each name
f_names_query="""
#standardSQL
SELECT name, SUM(number) as records 
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = 'F'
GROUP BY name 
ORDER BY records DESC
"""

job_config = bq.QueryJobConfig()
now = time.time()
query_job = client.query(f_names_query, location = 'US')
res = query_job.result()
print('query took:', round(time.time()-now,2), 's')

query took: 1.7 s


In [7]:
now = time.time()
f_names_df=res.to_dataframe()
print('load to pandas from BQ took:', round(time.time()-now, 2),'s')

load to pandas from BQ took: 1.14 s


In [8]:
#takes only the female names and turns it into a list
f_names = f_names_df['name'].to_list()

for i in range(len(f_names)):
    f_names[i] =  f_names[i].lower()

print(len(f_names))
f_names

20852


['mary',
 'patricia',
 'elizabeth',
 'jennifer',
 'linda',
 'barbara',
 'margaret',
 'susan',
 'dorothy',
 'jessica',
 'sarah',
 'nancy',
 'betty',
 'karen',
 'lisa',
 'helen',
 'sandra',
 'ashley',
 'kimberly',
 'emily',
 'donna',
 'carol',
 'michelle',
 'amanda',
 'melissa',
 'laura',
 'anna',
 'stephanie',
 'deborah',
 'rebecca',
 'ruth',
 'sharon',
 'cynthia',
 'kathleen',
 'amy',
 'shirley',
 'angela',
 'virginia',
 'catherine',
 'katherine',
 'brenda',
 'emma',
 'pamela',
 'nicole',
 'christine',
 'samantha',
 'rachel',
 'janet',
 'carolyn',
 'debra',
 'evelyn',
 'maria',
 'frances',
 'heather',
 'diane',
 'julie',
 'joyce',
 'martha',
 'alice',
 'victoria',
 'joan',
 'christina',
 'kelly',
 'lauren',
 'marie',
 'ann',
 'doris',
 'judith',
 'olivia',
 'jean',
 'cheryl',
 'megan',
 'kathryn',
 'andrea',
 'grace',
 'rose',
 'hannah',
 'jacqueline',
 'julia',
 'sara',
 'gloria',
 'teresa',
 'janice',
 'mildred',
 'theresa',
 'madison',
 'judy',
 'lillian',
 'beverly',
 'denise',
 'm

In [9]:
#creates a separate list for M names because it is faster to use SQL than iterate through df

m_names_query="""
#standardSQL
SELECT name, SUM(number) as records 
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = 'M'
GROUP BY name 
ORDER BY records DESC
"""

job_config = bq.QueryJobConfig()
now = time.time()
query_job = client.query(m_names_query,location = 'US')
res = query_job.result()
print('query took:', round(time.time()-now,2), 's')

query took: 1.27 s


In [10]:
now = time.time()
m_names_df = res.to_dataframe()
print('load to pandas from BQ took:', round(time.time()-now,2),'s')

load to pandas from BQ took: 0.63 s


In [11]:
#takes only the male names and turns it into a list
m_names = m_names_df['name'].to_list()
for i in range(len(m_names)):
    m_names[i] =  m_names[i].lower()
    
print(len(m_names))
m_names

13785


['james',
 'john',
 'robert',
 'michael',
 'william',
 'david',
 'richard',
 'joseph',
 'charles',
 'thomas',
 'christopher',
 'daniel',
 'matthew',
 'anthony',
 'donald',
 'paul',
 'mark',
 'george',
 'steven',
 'andrew',
 'kenneth',
 'edward',
 'joshua',
 'kevin',
 'brian',
 'ronald',
 'timothy',
 'jason',
 'jeffrey',
 'ryan',
 'jacob',
 'gary',
 'nicholas',
 'eric',
 'stephen',
 'jonathan',
 'frank',
 'larry',
 'justin',
 'scott',
 'brandon',
 'raymond',
 'samuel',
 'benjamin',
 'gregory',
 'jack',
 'patrick',
 'alexander',
 'henry',
 'dennis',
 'jerry',
 'tyler',
 'aaron',
 'walter',
 'peter',
 'jose',
 'douglas',
 'adam',
 'nathan',
 'zachary',
 'harold',
 'arthur',
 'carl',
 'kyle',
 'albert',
 'gerald',
 'lawrence',
 'roger',
 'jeremy',
 'keith',
 'joe',
 'ethan',
 'terry',
 'christian',
 'sean',
 'willie',
 'austin',
 'jesse',
 'ralph',
 'noah',
 'billy',
 'bruce',
 'bryan',
 'roy',
 'jordan',
 'dylan',
 'louis',
 'eugene',
 'harry',
 'wayne',
 'alan',
 'russell',
 'juan',
 'ga

In [12]:
andro_names = (set(m_names).intersection(f_names))

print(len(andro_names))
andro_names

3042


{'erin',
 'edell',
 'merritt',
 'darlene',
 'legend',
 'kamrin',
 'catalina',
 'oluwasemilore',
 'torrie',
 'loyce',
 'alvis',
 'messiah',
 'freedom',
 'dominick',
 'marie',
 'hanh',
 'leighton',
 'dustin',
 'rayaan',
 'conner',
 'kylan',
 'rogue',
 'kylin',
 'jeanette',
 'bee',
 'unborn',
 'yan',
 'hester',
 'jimi',
 'yoshimi',
 'sai',
 'sina',
 'camari',
 'audie',
 'decklyn',
 'brian',
 'jung',
 'jaydin',
 'caitlin',
 'derrick',
 'valeria',
 'zander',
 'numa',
 'raymond',
 'jada',
 'jaidyn',
 'shiloh',
 'maritza',
 'sheridan',
 'kelley',
 'juan',
 'danyel',
 'tyrese',
 'eleanor',
 'hartley',
 'paxtyn',
 'fabiola',
 'promise',
 'marisol',
 'sammie',
 'matisse',
 'baker',
 'william',
 'yadira',
 'shanti',
 'yared',
 'gaylen',
 'aldean',
 'marlowe',
 'kobie',
 'lavell',
 'murphy',
 'evin',
 'tucker',
 'english',
 'yu',
 'taylor',
 'ellery',
 'bradley',
 'maverick',
 'tammy',
 'lashun',
 'santiago',
 'perris',
 'aliyah',
 'kymani',
 'jacky',
 'arnold',
 'shellie',
 'ocean',
 'aaren',
 'b

In [13]:
#female names only names not in male list
f_names = set(f_names) - set(m_names)
len(f_names)

17810

In [14]:
#male names not in female list
m_names = set(m_names) - set(f_names)

In [15]:
andro_names = list(andro_names)
f_names = list(f_names)
m_names = list(m_names)

In [16]:
m_names_df = m_names_df.set_index('name')

In [17]:
f_names_df = f_names_df.set_index('name')

In [18]:
count_m = 0
count_f = 0
count_na = 0
femme = []
masc = []

for name in andro_names:
    name = name.capitalize()
    male_count = m_names_df.loc[name]['records']
    female_count = f_names_df.loc[name]['records']
    total = male_count + female_count
    percentage_male = round(male_count / total, 2)
    percentage_female = round(female_count / total, 2)
    if percentage_male >= 0.75:
        gender_assign = 'M'
        count_m += 1
        masc.append(name.lower())
    elif percentage_female >= 0.75:
        gender_assign = 'F'
        count_f += 1
        femme.append(name.lower())
    else:
        gender_assign = 'N/A'
        count_na += 1
        
    x = 10 - len(str(male_count + female_count))
    y = 10 - len(name)
    print(name, y*' ', male_count, female_count, x*' ', ' | ', percentage_male, percentage_female, ' | ', gender_assign)
    
print('count M', count_m)
print('count_F', count_f)
print('count_na', count_na)
    
    
#     if male_count > female_count:
#         m_names.append(name)
#     if female_count > male_count:
#         f_names.append(name)

Erin        6124 311331       |  0.02 0.98  |  F
Edell       5 29           |  0.15 0.85  |  F
Merritt     1415 585         |  0.71 0.29  |  N/A
Darlene     5 200667       |  0.0 1.0  |  F
Legend      6751 106         |  0.98 0.02  |  M
Kamrin      192 25          |  0.88 0.12  |  M
Catalina    6 16951        |  0.0 1.0  |  F
Oluwasemilore  9 12           |  0.43 0.57  |  N/A
Torrie      15 608          |  0.02 0.98  |  F
Loyce       77 1850         |  0.04 0.96  |  F
Alvis       2449 10         |  1.0 0.0  |  M
Messiah     12502 248        |  0.98 0.02  |  M
Freedom     18 64           |  0.22 0.78  |  F
Dominick    59352 5        |  1.0 0.0  |  M
Marie       183 463763       |  0.0 1.0  |  F
Hanh        6 160          |  0.04 0.96  |  F
Leighton    3069 5806         |  0.35 0.65  |  N/A
Dustin      203889 358       |  1.0 0.0  |  M
Rayaan      216 18          |  0.92 0.08  |  M
Conner      51693 159        |  1.0 0.0  |  M
Kylan       5482 90         |  0.98 0.02  |  M
Rogue       5 

Justis      282 10          |  0.97 0.03  |  M
Tavon       2517 5         |  1.0 0.0  |  M
Kashmere    16 5           |  0.76 0.24  |  M
Jonah       62440 28        |  1.0 0.0  |  M
Coby        5884 5         |  1.0 0.0  |  M
Eden        4904 32053        |  0.13 0.87  |  F
Demani      43 5           |  0.9 0.1  |  M
Ricci       140 72          |  0.66 0.34  |  N/A
Coy         11681 33        |  1.0 0.0  |  M
Lisa        1302 963129       |  0.0 1.0  |  F
Dallas      57307 8381        |  0.87 0.13  |  M
Kiernan     391 11          |  0.97 0.03  |  M
Wilfred     23225 5        |  1.0 0.0  |  M
Tennyson    85 33          |  0.72 0.28  |  N/A
Marin       119 3298         |  0.03 0.97  |  F
Emerson     18459 20175        |  0.48 0.52  |  N/A
Sylvia      17 227642       |  0.0 1.0  |  F
Niko        5830 12         |  1.0 0.0  |  M
Kaydence    69 15003        |  0.0 1.0  |  F
Kieran      10639 187        |  0.98 0.02  |  M
Farris      790 15          |  0.98 0.02  |  M
Giselle     5 42125   

Dylann      57 121          |  0.32 0.68  |  N/A
Shai        376 165          |  0.7 0.3  |  N/A
Kylar       443 172          |  0.72 0.28  |  N/A
Marquis     21177 12        |  1.0 0.0  |  M
Basil       7174 10         |  1.0 0.0  |  M
Verl        779 5          |  0.99 0.01  |  M
Mason       278779 1417       |  0.99 0.01  |  M
Maison      1998 28         |  0.99 0.01  |  M
Andie       5 1848         |  0.0 1.0  |  F
Fatima      5 30112        |  0.0 1.0  |  F
Kenyatta    1659 2879         |  0.37 0.63  |  N/A
Ivey        741 1392         |  0.35 0.65  |  N/A
Michele     948 220823       |  0.0 1.0  |  F
Kameryn     54 703          |  0.07 0.93  |  F
Eli         115207 123       |  1.0 0.0  |  M
Gwyn        49 378          |  0.11 0.89  |  F
Myka        5 737          |  0.01 0.99  |  F
Keelyn      10 25           |  0.29 0.71  |  N/A
Climmie     12 21           |  0.36 0.64  |  N/A
Kinsler     193 16          |  0.92 0.08  |  M
Omega       18 227          |  0.07 0.93  |  F
Madison 

Joey        48449 3116        |  0.94 0.06  |  M
Kym         6 322          |  0.02 0.98  |  F
Dora        16 77123        |  0.0 1.0  |  F
Dimitri     5388 10         |  1.0 0.0  |  M
Brandy      520 104302       |  0.0 1.0  |  F
Margarita   32 35045        |  0.0 1.0  |  F
Monserrate  16 54           |  0.23 0.77  |  F
Shia        839 176         |  0.83 0.17  |  M
Kenyata     15 162          |  0.08 0.92  |  F
Nery        783 17          |  0.98 0.02  |  M
Michaela    5 47549        |  0.0 1.0  |  F
Tandy       5 141          |  0.03 0.97  |  F
Mikel       6307 17         |  1.0 0.0  |  M
Gilberto    25075 14        |  1.0 0.0  |  M
Jaime       65083 46980       |  0.58 0.42  |  N/A
Omari       7835 11         |  1.0 0.0  |  M
Montana     1558 5588         |  0.22 0.78  |  F
Edna        15 226188       |  0.0 1.0  |  F
Trace       9122 5         |  1.0 0.0  |  M
Al          11628 5        |  1.0 0.0  |  M
Aramis      912 5          |  0.99 0.01  |  M
Raylin      32 198          |  0

Keona       13 1267         |  0.01 0.99  |  F
Schyler     32 40           |  0.44 0.56  |  N/A
Asencion    192 43          |  0.82 0.18  |  M
Cassandra   12 165570       |  0.0 1.0  |  F
Shayla      6 27952        |  0.0 1.0  |  F
Davis       23637 42        |  1.0 0.0  |  M
Corie       21 1323         |  0.02 0.98  |  F
Ivon        5 466          |  0.01 0.99  |  F
Ermal       26 5           |  0.84 0.16  |  M
Rita        22 276950       |  0.0 1.0  |  F
Koda        1435 5         |  1.0 0.0  |  M
Gabrial     231 6          |  0.97 0.03  |  M
Carlin      167 129          |  0.56 0.44  |  N/A
Nihal       135 5          |  0.96 0.04  |  M
Masyn       322 61          |  0.84 0.16  |  M
Jose        558546 2745       |  1.0 0.0  |  M
Lamarr      873 5          |  0.99 0.01  |  M
Zuri        38 6504         |  0.01 0.99  |  F
Tomie       150 78          |  0.66 0.34  |  N/A
Ryker       24152 26        |  1.0 0.0  |  M
Christain   541 11          |  0.98 0.02  |  M
Jude        34667 751    

Burnell     761 30          |  0.96 0.04  |  M
Montez      1153 90         |  0.93 0.07  |  M
Rylan       26752 3845        |  0.87 0.13  |  M
Elia        43 4564         |  0.01 0.99  |  F
Rochelle    5 42811        |  0.0 1.0  |  F
Sonam       10 86           |  0.1 0.9  |  F
Tramaine    501 94          |  0.84 0.16  |  M
Aden        17894 16        |  1.0 0.0  |  M
Martina     5 11671        |  0.0 1.0  |  F
Flynn       2464 16         |  0.99 0.01  |  M
Bianca      44 70418        |  0.0 1.0  |  F
Lindley     5 84           |  0.06 0.94  |  F
Gaynell     5 2197         |  0.0 1.0  |  F
Johnathon   22608 5        |  1.0 0.0  |  M
Monique     43 75710        |  0.0 1.0  |  F
Tristin     8017 911         |  0.9 0.1  |  M
Jessiah     647 12          |  0.98 0.02  |  M
Catherine   179 614366       |  0.0 1.0  |  F
Hayley      9 47154        |  0.0 1.0  |  F
Ekam        144 58          |  0.71 0.29  |  N/A
Dorris      798 4800         |  0.14 0.86  |  F
Diamond     657 30937        |  0.

Ajani       827 10          |  0.99 0.01  |  M
Cali        22 15504        |  0.0 1.0  |  F
Jaye        5 527          |  0.01 0.99  |  F
Emari       106 150          |  0.41 0.59  |  N/A
Jamile      6 13           |  0.32 0.68  |  N/A
Daniel      1902611 4468      |  1.0 0.0  |  M
Cletus      4235 6         |  1.0 0.0  |  M
Andree      186 438          |  0.3 0.7  |  N/A
Jaydn       80 35          |  0.7 0.3  |  N/A
Bowie       432 175          |  0.71 0.29  |  N/A
Lessie      178 9271         |  0.02 0.98  |  F
Aiden       204663 756       |  1.0 0.0  |  M
Le          5 56           |  0.08 0.92  |  F
Ares        2237 5         |  1.0 0.0  |  M
Amere       262 5          |  0.98 0.02  |  M
Frankie     36443 26867        |  0.58 0.42  |  N/A
Kadence     238 9607         |  0.02 0.98  |  F
Jared       198234 57       |  1.0 0.0  |  M
Theo        8716 655         |  0.93 0.07  |  M
Hamdi       5 82           |  0.06 0.94  |  F
Shaunte     6 671          |  0.01 0.99  |  F
Maricela    5 

Tatum       3891 18176        |  0.18 0.82  |  F
Hope        13 84158        |  0.0 1.0  |  F
Carla       6 145458       |  0.0 1.0  |  F
Luz         335 23154        |  0.01 0.99  |  F
Tobie       10 89           |  0.1 0.9  |  F
Raul        81428 184        |  1.0 0.0  |  M
Finnley     1960 712         |  0.73 0.27  |  N/A
Beverly     1749 373871       |  0.0 1.0  |  F
Anel        5 1206         |  0.0 1.0  |  F
Cooper      83383 616        |  0.99 0.01  |  M
Son         223 5          |  0.98 0.02  |  M
Gerardo     49134 97        |  1.0 0.0  |  M
Brandon     755460 2293       |  1.0 0.0  |  M
Taryn       16 22500        |  0.0 1.0  |  F
Consepcion  6 108          |  0.05 0.95  |  F
Jenny       22 86429        |  0.0 1.0  |  F
Golden      358 245          |  0.59 0.41  |  N/A
Berkley     288 2190         |  0.12 0.88  |  F
Sandra      776 871636       |  0.0 1.0  |  F
Alesha      5 4890         |  0.0 1.0  |  F
Alika       421 15          |  0.97 0.03  |  M
Brittani    8 10352      

Navdeep     5 35           |  0.12 0.88  |  F
Rennie      21 10           |  0.68 0.32  |  N/A
Halley      5 1671         |  0.0 1.0  |  F
Ngawang     5 11           |  0.31 0.69  |  N/A
Zane        43073 8        |  1.0 0.0  |  M
Mckinley    4338 8495        |  0.34 0.66  |  N/A
Marlen      21 3794         |  0.01 0.99  |  F
Quin        143 21          |  0.87 0.13  |  M
Teagan      3292 19980        |  0.14 0.86  |  F
Adel        378 87          |  0.81 0.19  |  M
Auden       424 29          |  0.94 0.06  |  M
Tiara       30 21260        |  0.0 1.0  |  F
Dream       6 1268         |  0.0 1.0  |  F
Stacy       18087 160482       |  0.1 0.9  |  F
Luverne     766 367         |  0.68 0.32  |  N/A
Amelia      18 178299       |  0.0 1.0  |  F
Danyell     11 898          |  0.01 0.99  |  F
Fallon      17 7004         |  0.0 1.0  |  F
Carmine     10762 20        |  1.0 0.0  |  M
Jeriah      285 5          |  0.98 0.02  |  M
Laverne     10354 39426        |  0.21 0.79  |  F
Christopher  20259

Bryce       110622 1387       |  0.99 0.01  |  M
Josefina    10 10714        |  0.0 1.0  |  F
Dexter      28403 6        |  1.0 0.0  |  M
Najee       1534 55         |  0.97 0.03  |  M
Skye        687 17043        |  0.04 0.96  |  F
Kylee       13 45942        |  0.0 1.0  |  F
Larry       802698 994       |  1.0 0.0  |  M
Lily        12 140898       |  0.0 1.0  |  F
Deavion     11 58           |  0.16 0.84  |  F
Dakoda      1236 42         |  0.97 0.03  |  M
Gloria      329 408866       |  0.0 1.0  |  F
Brooklynn   6 30560        |  0.0 1.0  |  F
Ami         20 4508         |  0.0 1.0  |  F
Bora        10 7           |  0.59 0.41  |  N/A
Aubrey      24828 109417       |  0.18 0.82  |  F
Everest     429 50          |  0.9 0.1  |  M
Angeles     5 1945         |  0.0 1.0  |  F
Maddux      1762 5         |  1.0 0.0  |  M
Charlotte   6 346161       |  0.0 1.0  |  F
Terryl      16 52           |  0.24 0.76  |  F
Juanita     174 198315       |  0.0 1.0  |  F
Janee       6 1348         |  0.0 

Amel        11 21           |  0.34 0.66  |  N/A
Virgie      27 14374        |  0.0 1.0  |  F
Clayton     126461 19       |  1.0 0.0  |  M
Alan        346669 54       |  1.0 0.0  |  M
Davie       657 5          |  0.99 0.01  |  M
Nai         21 97          |  0.18 0.82  |  F
Koi         37 13           |  0.74 0.26  |  N/A
Mylan       129 5          |  0.96 0.04  |  M
Amen        212 171          |  0.55 0.45  |  N/A
Unnamed     267 217          |  0.55 0.45  |  N/A
Aimar       16 34           |  0.32 0.68  |  N/A
Brit        10 5           |  0.67 0.33  |  N/A
Graysen     1929 61         |  0.97 0.03  |  M
Joanne      18 208091       |  0.0 1.0  |  F
Mervin      7097 6         |  1.0 0.0  |  M
Khali       37 289          |  0.11 0.89  |  F
Rayn        16 10           |  0.62 0.38  |  N/A
Breslin     16 7           |  0.7 0.3  |  N/A
Tal         340 139          |  0.71 0.29  |  N/A
Aarya       43 1197         |  0.03 0.97  |  F
Kenyetta    6 1560         |  0.0 1.0  |  F
Roma        5

Torey       967 172         |  0.85 0.15  |  M
Barrie      211 519          |  0.29 0.71  |  N/A
Jerry       607110 13820       |  0.98 0.02  |  M
Lexus       22 4931         |  0.0 1.0  |  F
Cameryn     48 1269         |  0.04 0.96  |  F
Jace        64886 20        |  1.0 0.0  |  M
Davi        617 5          |  0.99 0.01  |  M
Loren       40257 9244        |  0.81 0.19  |  M
Mavis       5 15936        |  0.0 1.0  |  F
Milagro     6 560          |  0.01 0.99  |  F
Lillian     71 374954       |  0.0 1.0  |  F
Felice      214 1779         |  0.11 0.89  |  F
Cody        281663 2514       |  0.99 0.01  |  M
Carl        478734 322       |  1.0 0.0  |  M
Will        25394 65        |  1.0 0.0  |  M
Johnnie     93262 43015       |  0.68 0.32  |  N/A
Kennedy     3661 70474        |  0.05 0.95  |  F
Natalia     5 62950        |  0.0 1.0  |  F
Ysabel      104 499          |  0.17 0.83  |  F
Kaoru       119 28          |  0.81 0.19  |  M
Pamela      114 591769       |  0.0 1.0  |  F
Rudell      5

Arda        109 5          |  0.96 0.04  |  M
Kimberly    1845 836000       |  0.0 1.0  |  F
Hoang       235 5          |  0.98 0.02  |  M
Esperanza   5 16377        |  0.0 1.0  |  F
Kamsiyochukwu  31 19           |  0.62 0.38  |  N/A
Elmo        6435 5         |  1.0 0.0  |  M
Glyn        252 5          |  0.98 0.02  |  M
Reese       12065 31965        |  0.27 0.73  |  N/A
Luka        6041 25         |  1.0 0.0  |  M
Abraham     83766 11        |  1.0 0.0  |  M
Marc        136621 11       |  1.0 0.0  |  M
Lacy        3510 16203        |  0.18 0.82  |  F
Enis        17 39           |  0.3 0.7  |  N/A
Nhia        91 7           |  0.93 0.07  |  M
Gregg       33131 5        |  1.0 0.0  |  M
Joshua      1206629 3013      |  1.0 0.0  |  M
Natalie     109 348489       |  0.0 1.0  |  F
Nina        14 114461       |  0.0 1.0  |  F
Bessie      5 102974       |  0.0 1.0  |  F
Kali        36 22594        |  0.0 1.0  |  F
Charlie     143609 21037       |  0.87 0.13  |  M
Everett     98280 39     

Tracey      5915 84112        |  0.07 0.93  |  F
Tobey       147 49          |  0.75 0.25  |  M
Jayden      209026 20355       |  0.91 0.09  |  M
Regan       839 12670        |  0.06 0.94  |  F
Asher       53882 171        |  1.0 0.0  |  M
Asani       5 65           |  0.07 0.93  |  F
Braedyn     986 5          |  0.99 0.01  |  M
Aspen       368 13880        |  0.03 0.97  |  F
Kenya       886 22362        |  0.04 0.96  |  F
Natividad   1156 971         |  0.54 0.46  |  N/A
Vernon      138637 298       |  1.0 0.0  |  M
Nicholas    895058 1336       |  1.0 0.0  |  M
Leroy       187718 24       |  1.0 0.0  |  M
Azariah     1961 3371         |  0.37 0.63  |  N/A
Dennie      1031 5         |  1.0 0.0  |  M
Armando     69597 129        |  1.0 0.0  |  M
Naseem      486 16          |  0.97 0.03  |  M
Navy        17 549          |  0.03 0.97  |  F
Rande       21 5           |  0.81 0.19  |  M
Huxley      1116 11         |  0.99 0.01  |  M
Eric        874544 2176       |  1.0 0.0  |  M
Elijah   

Ora         1979 25123        |  0.07 0.93  |  F
Ryland      7807 75         |  0.99 0.01  |  M
Rodney      242644 73       |  1.0 0.0  |  M
Child       77 53          |  0.59 0.41  |  N/A
Krystian    675 23          |  0.97 0.03  |  M
Braylen     8903 68         |  0.99 0.01  |  M
Alejandro   131080 482       |  1.0 0.0  |  M
Ashly       5 9107         |  0.0 1.0  |  F
Domanique   5 16           |  0.24 0.76  |  F
Sang        160 11          |  0.94 0.06  |  M
Darryl      88402 97        |  1.0 0.0  |  M
Lunden      5 7           |  0.42 0.58  |  N/A
Ka          5 507          |  0.01 0.99  |  F
Caidyn      129 21          |  0.86 0.14  |  M
Kenna       19 7131         |  0.0 1.0  |  F
Avian       444 5          |  0.99 0.01  |  M
Ayodele     17 5           |  0.77 0.23  |  M
Jacob       920982 618       |  1.0 0.0  |  M
Germaine    932 4737         |  0.16 0.84  |  F
Vestal      20 5           |  0.8 0.2  |  M
Rain        152 1010         |  0.13 0.87  |  F
Deundra     5 10          

Braden      42311 17        |  1.0 0.0  |  M
Gabriel     342672 4328       |  0.99 0.01  |  M
Linda       1763 1447943      |  0.0 1.0  |  F
Marlo       711 3976         |  0.15 0.85  |  F
Mitchell    158074 76       |  1.0 0.0  |  M
Jaleesa     5 2280         |  0.0 1.0  |  F
Darrell     155852 20       |  1.0 0.0  |  M
Felix       62919 52        |  1.0 0.0  |  M
Alexandr    22 301          |  0.07 0.93  |  F
Darnell     24471 634        |  0.97 0.03  |  M
Brice       10441 10        |  1.0 0.0  |  M
Marty       28233 1534        |  0.95 0.05  |  M
Rowyn       31 649          |  0.05 0.95  |  F
Kacey       334 7810         |  0.04 0.96  |  F
San         113 1407         |  0.07 0.93  |  F
Lee         215694 55614       |  0.8 0.2  |  M
Latoya      7 43965        |  0.0 1.0  |  F
Trine       30 124          |  0.19 0.81  |  F
Isa         900 541         |  0.62 0.38  |  N/A
Torrey      2508 249         |  0.91 0.09  |  M
Dominique   20341 47690        |  0.3 0.7  |  N/A
Rene        44

In [19]:
len(set(f_names) - set(femme))

17810

In [75]:
count_na

518

In [78]:
count_m = 0
count_f = 0
count_na = 0
femme = []
masc = []

for name in andro_names:
    name = name.capitalize()
    male_count = m_names_df.loc[name]['records']
    female_count = f_names_df.loc[name]['records']
    total = male_count + female_count
    percentage_male = round(male_count / total, 2)
    percentage_female = round(female_count / total, 2)
    if percentage_male >= 0.66:
        gender_assign = 'M'
        count_m += 1
        masc.append(name.lower())
    elif percentage_female >= 0.66:
        gender_assign = 'F'
        count_f += 1
        femme.append(name.lower())
    else:
        gender_assign = 'N/A'
        count_na += 1
        
    x = 10 - len(str(male_count + female_count))
    y = 10 - len(name)
    print(name, y*' ', male_count, female_count, x*' ', ' | ', percentage_male, percentage_female, ' | ', gender_assign)
    
print('count M', count_m)
print('count_F', count_f)
print('count_na', count_na)
    
    
#     if male_count > female_count:
#         m_names.append(name)
#     if female_count > male_count:
#         f_names.append(name)

Erin        6124 311331       |  0.02 0.98  |  F
Edell       5 29           |  0.15 0.85  |  F
Merritt     1415 585         |  0.71 0.29  |  M
Darlene     5 200667       |  0.0 1.0  |  F
Legend      6751 106         |  0.98 0.02  |  M
Kamrin      192 25          |  0.88 0.12  |  M
Catalina    6 16951        |  0.0 1.0  |  F
Oluwasemilore  9 12           |  0.43 0.57  |  N/A
Torrie      15 608          |  0.02 0.98  |  F
Loyce       77 1850         |  0.04 0.96  |  F
Alvis       2449 10         |  1.0 0.0  |  M
Messiah     12502 248        |  0.98 0.02  |  M
Freedom     18 64           |  0.22 0.78  |  F
Dominick    59352 5        |  1.0 0.0  |  M
Marie       183 463763       |  0.0 1.0  |  F
Hanh        6 160          |  0.04 0.96  |  F
Leighton    3069 5806         |  0.35 0.65  |  N/A
Dustin      203889 358       |  1.0 0.0  |  M
Rayaan      216 18          |  0.92 0.08  |  M
Conner      51693 159        |  1.0 0.0  |  M
Kylan       5482 90         |  0.98 0.02  |  M
Rogue       5 52

Coby        5884 5         |  1.0 0.0  |  M
Eden        4904 32053        |  0.13 0.87  |  F
Demani      43 5           |  0.9 0.1  |  M
Ricci       140 72          |  0.66 0.34  |  M
Coy         11681 33        |  1.0 0.0  |  M
Lisa        1302 963129       |  0.0 1.0  |  F
Dallas      57307 8381        |  0.87 0.13  |  M
Kiernan     391 11          |  0.97 0.03  |  M
Wilfred     23225 5        |  1.0 0.0  |  M
Tennyson    85 33          |  0.72 0.28  |  M
Marin       119 3298         |  0.03 0.97  |  F
Emerson     18459 20175        |  0.48 0.52  |  N/A
Sylvia      17 227642       |  0.0 1.0  |  F
Niko        5830 12         |  1.0 0.0  |  M
Kaydence    69 15003        |  0.0 1.0  |  F
Kieran      10639 187        |  0.98 0.02  |  M
Farris      790 15          |  0.98 0.02  |  M
Giselle     5 42125        |  0.0 1.0  |  F
Eliyah      46 248          |  0.16 0.84  |  F
Geraldine   20 204154       |  0.0 1.0  |  F
Jupiter     28 80          |  0.26 0.74  |  F
Kaylon      308 117       

Ivey        741 1392         |  0.35 0.65  |  N/A
Michele     948 220823       |  0.0 1.0  |  F
Kameryn     54 703          |  0.07 0.93  |  F
Eli         115207 123       |  1.0 0.0  |  M
Gwyn        49 378          |  0.11 0.89  |  F
Myka        5 737          |  0.01 0.99  |  F
Keelyn      10 25           |  0.29 0.71  |  F
Climmie     12 21           |  0.36 0.64  |  N/A
Kinsler     193 16          |  0.92 0.08  |  M
Omega       18 227          |  0.07 0.93  |  F
Madison     3215 380503       |  0.01 0.99  |  F
Nikola      1236 15         |  0.99 0.01  |  M
Ciel        11 45           |  0.2 0.8  |  F
Terrion     634 11          |  0.98 0.02  |  M
Cayden      32742 797        |  0.98 0.02  |  M
Merlin      10164 85        |  0.99 0.01  |  M
Emily       342 833584       |  0.0 1.0  |  F
Kainoa      1449 11         |  0.99 0.01  |  M
Chee        110 63          |  0.64 0.36  |  N/A
Augusta     206 5431         |  0.04 0.96  |  F
Deborah     398 737059       |  0.0 1.0  |  F
Rael     

Garrett     126842 56       |  1.0 0.0  |  M
Kelly       77633 469867       |  0.14 0.86  |  F
Joey        48449 3116        |  0.94 0.06  |  M
Kym         6 322          |  0.02 0.98  |  F
Dora        16 77123        |  0.0 1.0  |  F
Dimitri     5388 10         |  1.0 0.0  |  M
Brandy      520 104302       |  0.0 1.0  |  F
Margarita   32 35045        |  0.0 1.0  |  F
Monserrate  16 54           |  0.23 0.77  |  F
Shia        839 176         |  0.83 0.17  |  M
Kenyata     15 162          |  0.08 0.92  |  F
Nery        783 17          |  0.98 0.02  |  M
Michaela    5 47549        |  0.0 1.0  |  F
Tandy       5 141          |  0.03 0.97  |  F
Mikel       6307 17         |  1.0 0.0  |  M
Gilberto    25075 14        |  1.0 0.0  |  M
Jaime       65083 46980       |  0.58 0.42  |  N/A
Omari       7835 11         |  1.0 0.0  |  M
Montana     1558 5588         |  0.22 0.78  |  F
Edna        15 226188       |  0.0 1.0  |  F
Trace       9122 5         |  1.0 0.0  |  M
Al          11628 5        

Sora        8 385          |  0.02 0.98  |  F
Glenn       240716 77       |  1.0 0.0  |  M
Jasani      5 5           |  0.5 0.5  |  N/A
Desi        512 58          |  0.9 0.1  |  M
Adi         101 218          |  0.32 0.68  |  F
Elena       12 72860        |  0.0 1.0  |  F
Jorden      4596 766         |  0.86 0.14  |  M
Ollie       10265 26321        |  0.28 0.72  |  F
Gal         15 15           |  0.5 0.5  |  N/A
Barrett     15693 5        |  1.0 0.0  |  M
Grey        1896 126         |  0.94 0.06  |  M
Artha       10 5           |  0.67 0.33  |  M
Ladell      27 25           |  0.52 0.48  |  N/A
Keona       13 1267         |  0.01 0.99  |  F
Schyler     32 40           |  0.44 0.56  |  N/A
Asencion    192 43          |  0.82 0.18  |  M
Cassandra   12 165570       |  0.0 1.0  |  F
Shayla      6 27952        |  0.0 1.0  |  F
Davis       23637 42        |  1.0 0.0  |  M
Corie       21 1323         |  0.02 0.98  |  F
Ivon        5 466          |  0.01 0.99  |  F
Ermal       26 5        

Jesse       401322 7418       |  0.98 0.02  |  M
Ray         189233 1846       |  0.99 0.01  |  M
Zakaria     1140 49         |  0.96 0.04  |  M
Mikal       976 10          |  0.99 0.01  |  M
Kathy       32 330859       |  0.0 1.0  |  F
Jonnie      945 2517         |  0.27 0.73  |  F
Drew        71293 4755        |  0.94 0.06  |  M
Tory        3860 1096         |  0.78 0.22  |  M
Travis      299900 337       |  1.0 0.0  |  M
Nichelle    5 3273         |  0.0 1.0  |  F
Uriah       6351 77         |  0.99 0.01  |  M
Dion        14056 95        |  0.99 0.01  |  M
Kayce       18 854          |  0.02 0.98  |  F
Sky         1264 5221         |  0.19 0.81  |  F
Austen      5448 156         |  0.97 0.03  |  M
Kemoni      5 24           |  0.17 0.83  |  F
Ely         1690 84         |  0.95 0.05  |  M
Chiquita    6 4477         |  0.0 1.0  |  F
Jovan       7385 352         |  0.95 0.05  |  M
Rylie       151 17125        |  0.01 0.99  |  F
Tierra      27 13001        |  0.0 1.0  |  F
Mina       

Rhys        6790 31         |  1.0 0.0  |  M
Jenna       28 143070       |  0.0 1.0  |  F
Tonya       5 117859       |  0.0 1.0  |  F
Jasiyah     12 91          |  0.12 0.88  |  F
Kree        20 38           |  0.34 0.66  |  F
Zephyr      545 22          |  0.96 0.04  |  M
James       4997327 18257      |  1.0 0.0  |  M
Olamide     19 15           |  0.56 0.44  |  N/A
Pilar       321 2881         |  0.1 0.9  |  F
Garnell     5 7           |  0.42 0.58  |  N/A
Kiara       93 47587        |  0.0 1.0  |  F
Elie        713 6          |  0.99 0.01  |  M
Daelyn      40 253          |  0.14 0.86  |  F
Shannen     5 689          |  0.01 0.99  |  F
Essie       221 27661        |  0.01 0.99  |  F
Londen      5 32           |  0.14 0.86  |  F
Britt       1564 1157         |  0.57 0.43  |  N/A
Kamalei     18 54           |  0.25 0.75  |  F
Connor      209167 565       |  1.0 0.0  |  M
Carmel      151 3520         |  0.04 0.96  |  F
Paxton      17347 397        |  0.98 0.02  |  M
Miriam      24 100

Francis     268231 21339       |  0.93 0.07  |  M
Lennie      296 1434         |  0.17 0.83  |  F
Anderson    21790 230        |  0.99 0.01  |  M
Vernice     33 3672         |  0.01 0.99  |  F
Tatum       3891 18176        |  0.18 0.82  |  F
Hope        13 84158        |  0.0 1.0  |  F
Carla       6 145458       |  0.0 1.0  |  F
Luz         335 23154        |  0.01 0.99  |  F
Tobie       10 89           |  0.1 0.9  |  F
Raul        81428 184        |  1.0 0.0  |  M
Finnley     1960 712         |  0.73 0.27  |  M
Beverly     1749 373871       |  0.0 1.0  |  F
Anel        5 1206         |  0.0 1.0  |  F
Cooper      83383 616        |  0.99 0.01  |  M
Son         223 5          |  0.98 0.02  |  M
Gerardo     49134 97        |  1.0 0.0  |  M
Brandon     755460 2293       |  1.0 0.0  |  M
Taryn       16 22500        |  0.0 1.0  |  F
Consepcion  6 108          |  0.05 0.95  |  F
Jenny       22 86429        |  0.0 1.0  |  F
Golden      358 245          |  0.59 0.41  |  N/A
Berkley     288 219

Asha        6 5714         |  0.0 1.0  |  F
Carol       5343 812366       |  0.01 0.99  |  F
Corrie      59 4151         |  0.01 0.99  |  F
Sheron      13 1411         |  0.01 0.99  |  F
Braylin     1693 399         |  0.81 0.19  |  M
Kriss       10 5           |  0.67 0.33  |  M
Izel        36 693          |  0.05 0.95  |  F
Henri       1336 31         |  0.98 0.02  |  M
Kobi        325 15          |  0.96 0.04  |  M
Carmin      11 43           |  0.2 0.8  |  F
Infantof    57 67          |  0.46 0.54  |  N/A
Ottie       10 89           |  0.1 0.9  |  F
Zachary     533634 333       |  1.0 0.0  |  M
Avi         2636 67         |  0.98 0.02  |  M
Zephaniah   985 10          |  0.99 0.01  |  M
Adrian      232741 11276       |  0.95 0.05  |  M
Jasmine     454 245236       |  0.0 1.0  |  F
Chantelle   6 3916         |  0.0 1.0  |  F
Caroline    44 214033       |  0.0 1.0  |  F
Hendrix     4795 95         |  0.98 0.02  |  M
Presley     1347 20818        |  0.06 0.94  |  F
Lian        845 221

Oluwadamilola  8 6           |  0.57 0.43  |  N/A
Jeffrey     973514 1378       |  1.0 0.0  |  M
Nabil       569 18          |  0.97 0.03  |  M
Gagandeep   57 15           |  0.79 0.21  |  M
Wilder      1649 26         |  0.98 0.02  |  M
Ayan        1551 158         |  0.91 0.09  |  M
Zia         6 829          |  0.01 0.99  |  F
Sequoia     22 1586         |  0.01 0.99  |  F
Berlin      196 639          |  0.23 0.77  |  F
Kinta       11 6           |  0.65 0.35  |  N/A
Shae        193 2623         |  0.07 0.93  |  F
Tien        123 70          |  0.64 0.36  |  N/A
Bayler      10 5           |  0.67 0.33  |  M
Kiley       84 16594        |  0.01 0.99  |  F
Albie       5 5           |  0.5 0.5  |  N/A
Larrie      24 10           |  0.71 0.29  |  M
Lupe        4847 16686        |  0.23 0.77  |  F
Denny       10602 5        |  1.0 0.0  |  M
Annette     5 162332       |  0.0 1.0  |  F
Gavin       173016 38       |  1.0 0.0  |  M
Kimi        144 345          |  0.29 0.71  |  F
Riot        1

Wylie       1359 18         |  0.99 0.01  |  M
Mannie      101 17          |  0.86 0.14  |  M
Demari      1128 6         |  0.99 0.01  |  M
Chayanne    72 37          |  0.66 0.34  |  M
Dempsey     967 15          |  0.98 0.02  |  M
Kelechi     47 7           |  0.87 0.13  |  M
Sophana     5 5           |  0.5 0.5  |  N/A
Hinckley    12 5           |  0.71 0.29  |  M
Raleigh     3590 872         |  0.8 0.2  |  M
Jocelyn     56 102098       |  0.0 1.0  |  F
Harper      3547 79971        |  0.04 0.96  |  F
Herman      119126 15       |  1.0 0.0  |  M
Okie        21 11           |  0.66 0.34  |  M
Rian        2024 1120         |  0.64 0.36  |  N/A
Cypress     91 107          |  0.46 0.54  |  N/A
Antonio     243628 615       |  1.0 0.0  |  M
Daris       14 10           |  0.58 0.42  |  N/A
Shannan     5 2917         |  0.0 1.0  |  F
Haleigh     6 12412        |  0.0 1.0  |  F
Colby       48436 1490        |  0.97 0.03  |  M
Kameron     31060 2316        |  0.93 0.07  |  M
Teigan      10 25

Rashaun     1428 11         |  0.99 0.01  |  M
Aaron       576759 1803       |  1.0 0.0  |  M
Esmeralda   35 43725        |  0.0 1.0  |  F
Isabella    62 329663       |  0.0 1.0  |  F
Eri         21 22           |  0.49 0.51  |  N/A
Jacy        12 1053         |  0.01 0.99  |  F
Prince      12917 52        |  1.0 0.0  |  M
Breckyn     16 210          |  0.07 0.93  |  F
Beverley    78 9854         |  0.01 0.99  |  F
Crimson     121 266          |  0.31 0.69  |  F
Sylvester   35930 14        |  1.0 0.0  |  M
Nereida     5 2550         |  0.0 1.0  |  F
Terrell     31753 122        |  1.0 0.0  |  M
Anthony     1431059 3779      |  1.0 0.0  |  M
Lorin       1322 736         |  0.64 0.36  |  N/A
Jessy       2034 476         |  0.81 0.19  |  M
Charleston  483 432          |  0.53 0.47  |  N/A
Danielle    629 365512       |  0.0 1.0  |  F
Dillon      60889 86        |  1.0 0.0  |  M
Terren      19 5           |  0.79 0.21  |  M
Avery       51136 123927       |  0.29 0.71  |  F
Stormy      78 3

Cheyenne    630 65937        |  0.01 0.99  |  F
Jayme       1053 10206        |  0.09 0.91  |  F
Caprice     5 1179         |  0.0 1.0  |  F
Porter      12045 36        |  1.0 0.0  |  M
Homer       49458 5        |  1.0 0.0  |  M
Summer      35 76459        |  0.0 1.0  |  F
Willis      43361 27        |  1.0 0.0  |  M
Henley      215 3316         |  0.06 0.94  |  F
Alaa        28 361          |  0.07 0.93  |  F
Calvin      197462 22       |  1.0 0.0  |  M
Morrison    175 5          |  0.97 0.03  |  M
Aris        562 235          |  0.71 0.29  |  M
Easton      49577 99        |  1.0 0.0  |  M
Elan        1557 70         |  0.96 0.04  |  M
Mana        16 95          |  0.14 0.86  |  F
Wrigley     26 42           |  0.38 0.62  |  N/A
Shadi       99 29          |  0.77 0.23  |  M
Toy         105 27          |  0.8 0.2  |  M
Dawn        132 285534       |  0.0 1.0  |  F
Lennon      3339 3367         |  0.5 0.5  |  N/A
Reginald    105610 28       |  1.0 0.0  |  M
Reece       14374 2704      

Elian       5166 5         |  1.0 0.0  |  M
Savannah    31 166704       |  0.0 1.0  |  F
Kia         28 5484         |  0.01 0.99  |  F
Carrington  384 1264         |  0.23 0.77  |  F
Brantlee    1347 21         |  0.98 0.02  |  M
Phuc        47 5           |  0.9 0.1  |  M
Kamden      7904 30         |  1.0 0.0  |  M
Tyree       10899 16        |  1.0 0.0  |  M
Willie      412266 121147       |  0.77 0.23  |  M
Katrina     15 97877        |  0.0 1.0  |  F
Valen       173 5          |  0.97 0.03  |  M
Jaden       73385 12086        |  0.86 0.14  |  M
Janet       172 550956       |  0.0 1.0  |  F
Fay         1355 24310        |  0.05 0.95  |  F
Hassan      7861 5         |  1.0 0.0  |  M
Alice       106 485364       |  0.0 1.0  |  F
Ashlyn      6 44205        |  0.0 1.0  |  F
Rilee       5 1420         |  0.0 1.0  |  F
Ova         254 427          |  0.37 0.63  |  N/A
Layla       9 85582        |  0.0 1.0  |  F
Elliot      37886 3420        |  0.92 0.08  |  M
Manuel      177257 420     

Gerry       10317 3582        |  0.74 0.26  |  M
Ryver       16 6           |  0.73 0.27  |  M
Cleo        7442 22745        |  0.25 0.75  |  F
Erika       191 121919       |  0.0 1.0  |  F
Clide       64 5           |  0.93 0.07  |  M
Brenda      504 604975       |  0.0 1.0  |  F
Jorge       128560 289       |  1.0 0.0  |  M
Floy        5 3082         |  0.0 1.0  |  F
Zion        27650 5088        |  0.84 0.16  |  M
Angela      469 660522       |  0.0 1.0  |  F
Name        23 11           |  0.68 0.32  |  M
Abigayle    5 4266         |  0.0 1.0  |  F
Lajuan      97 150          |  0.39 0.61  |  N/A
Taran       393 15          |  0.96 0.04  |  M
Bergen      24 17           |  0.59 0.41  |  N/A
Nevada      5 202          |  0.02 0.98  |  F
Onyx        638 65          |  0.91 0.09  |  M
Griffin     37182 25        |  1.0 0.0  |  M
Clare       1432 23182        |  0.06 0.94  |  F
Carrol      3548 1644         |  0.68 0.32  |  M
Zuriel      585 28          |  0.95 0.05  |  M
Rosio       10

Arvis       36 5           |  0.88 0.12  |  M
Brylee      21 8234         |  0.0 1.0  |  F
Dyllan      1624 15         |  0.99 0.01  |  M
Alante      107 5          |  0.96 0.04  |  M
Gordon      147351 17       |  1.0 0.0  |  M
Santana     3173 3179         |  0.5 0.5  |  N/A
Fred        297485 119       |  1.0 0.0  |  M
Tamika      14 21074        |  0.0 1.0  |  F
Mildred     120 394861       |  0.0 1.0  |  F
Alec        48316 17        |  1.0 0.0  |  M
Landon      154999 109       |  1.0 0.0  |  M
Diamante    73 5           |  0.94 0.06  |  M
Darian      12904 4698        |  0.73 0.27  |  M
Timber      32 166          |  0.16 0.84  |  F
count M 1388
count_F 1348
count_na 306


In [79]:
count_na

306

In [20]:
femme

['erin',
 'edell',
 'darlene',
 'catalina',
 'torrie',
 'loyce',
 'freedom',
 'marie',
 'hanh',
 'rogue',
 'jeanette',
 'unborn',
 'hester',
 'jung',
 'caitlin',
 'valeria',
 'jada',
 'maritza',
 'sheridan',
 'kelley',
 'danyel',
 'eleanor',
 'hartley',
 'paxtyn',
 'fabiola',
 'promise',
 'marisol',
 'yadira',
 'shanti',
 'marlowe',
 'taylor',
 'ellery',
 'tammy',
 'lashun',
 'aliyah',
 'shellie',
 'irma',
 'kelcey',
 'rachael',
 'loise',
 'caitlyn',
 'abigail',
 'sheila',
 'aria',
 'emani',
 'perla',
 'alpha',
 'greer',
 'blessing',
 'kairi',
 'alexsis',
 'renee',
 'sherron',
 'marjorie',
 'christine',
 'tracy',
 'alexie',
 'ikea',
 'juno',
 'kaitlin',
 'alexys',
 'chaka',
 'shirley',
 'jaimie',
 'adriane',
 'hollie',
 'danell',
 'tara',
 'adley',
 'cynthia',
 'laura',
 'amani',
 'gennie',
 'princess',
 'soua',
 'imari',
 'amanda',
 'eden',
 'lisa',
 'marin',
 'sylvia',
 'kaydence',
 'giselle',
 'eliyah',
 'geraldine',
 'maureen',
 'tanis',
 'camile',
 'jo',
 'jordyn',
 'melanie',
 'l

In [21]:
len(f_names + femme)
f_names = f_names + femme

In [22]:
len(f_names)

19062

In [23]:
masc

['legend',
 'kamrin',
 'alvis',
 'messiah',
 'dominick',
 'dustin',
 'rayaan',
 'conner',
 'kylan',
 'bee',
 'yan',
 'jimi',
 'sai',
 'brian',
 'jaydin',
 'derrick',
 'zander',
 'numa',
 'raymond',
 'juan',
 'tyrese',
 'sammie',
 'baker',
 'william',
 'yared',
 'gaylen',
 'kobie',
 'lavell',
 'murphy',
 'evin',
 'tucker',
 'bradley',
 'maverick',
 'santiago',
 'kymani',
 'jacky',
 'arnold',
 'aaren',
 'ben',
 'moises',
 'ricky',
 'mikah',
 'terrence',
 'jamey',
 'mayson',
 'colin',
 'davion',
 'jeremiah',
 'refugio',
 'tristian',
 'brandyn',
 'noe',
 'zyon',
 'blake',
 'jayce',
 'sung',
 'tylor',
 'luther',
 'jin',
 'caelan',
 'javier',
 'mike',
 'franklin',
 'derek',
 'lovell',
 'cleofas',
 'tu',
 'johnathan',
 'sawyer',
 'mel',
 'koa',
 'kory',
 'shayne',
 'syncere',
 'fredi',
 'bodhi',
 'almer',
 'keelan',
 'shaun',
 'sergio',
 'jamari',
 'justis',
 'tavon',
 'kashmere',
 'jonah',
 'coby',
 'demani',
 'coy',
 'dallas',
 'kiernan',
 'wilfred',
 'niko',
 'kieran',
 'farris',
 'jacques

In [24]:
m_names = m_names + masc

In [25]:
andro_ = set(andro_names) - set(masc)
andro_ = set(andro_) - set(femme)

In [26]:
len(andro_)

518

In [27]:
andro_

{'abrar',
 'abriel',
 'adama',
 'addis',
 'adi',
 'afnan',
 'aiman',
 'aimar',
 'albie',
 'aldean',
 'alexiz',
 'alexx',
 'alik',
 'allyn',
 'altair',
 'alva',
 'amandeep',
 'amaree',
 'amari',
 'amel',
 'amen',
 'amil',
 'amrit',
 'an',
 'anay',
 'andra',
 'andree',
 'angel',
 'anmol',
 'ara',
 'ardell',
 'arden',
 'arie',
 'aries',
 'arin',
 'aris',
 'ariyan',
 'arlee',
 'arlin',
 'arlyn',
 'armani',
 'arnell',
 'arshdeep',
 'arshia',
 'artha',
 'artie',
 'arvie',
 'ary',
 'ascension',
 'ashten',
 'ashtin',
 'asuncion',
 'audie',
 'austyn',
 'aven',
 'avery',
 'ayomide',
 'azari',
 'azariah',
 'aziah',
 'baby',
 'bao',
 'barrie',
 'bayler',
 'bentlie',
 'bergen',
 'blair',
 'bora',
 'bowie',
 'braylyn',
 'breslin',
 'briar',
 'brighton',
 'brit',
 'britt',
 'brittan',
 'britten',
 'burnice',
 'caelin',
 'camari',
 'camdyn',
 'cameran',
 'campbell',
 'carey',
 'carlin',
 'carmon',
 'carrol',
 'casey',
 'cashmere',
 'chan',
 'channing',
 'charleston',
 'charley',
 'charly',
 'chayanne'

In [28]:
#creating the first name column
for i in entrepreneur_df.index:
    entrepreneur_df.at[i, 'f_name'] = entrepreneur_df.at[i, 'name'].split(' ')[0]

In [29]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",,,,,Scott
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",,,,,Robin
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",,,,,Ric
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",,,,,Suzy
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",,,,,Brian


In [30]:
entrepreneur_df.shape

(51884, 15)

In [31]:
null = entrepreneur_df[pd.isnull(entrepreneur_df['gender'])]
null.shape

(25477, 15)

In [32]:
'Alexa' in list(f_names)

False

In [33]:
len(andro_)

518

In [34]:
#our bread and butter, the part that does it all...

In [35]:
f_count = 0
m_count = 0
a_count = 0
i_count = 0

for i in null.index:
    name = null.at[i, 'f_name'].lower()
    
    if name in f_names:
        entrepreneur_df.at[i, 'gender'] = 'f'
        f_count += 1
        
    elif name in m_names:
        entrepreneur_df.at[i, 'gender'] = 'm'
        m_count += 1
        
    elif name in andro_:
        entrepreneur_df.at[i, 'gender'] = 'andro'
        a_count += 1
        
    else:
        entrepreneur_df.at[i, 'gender'] = 'i donno'
        i_count += 1
        
print(f_count, m_count, a_count, i_count)

2386 20146 0 2945


In [36]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",m,,,,Scott
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",f,,,,Robin
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Ric
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",f,,,,Suzy
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",m,,,,Brian


In [37]:
(f_count + m_count)/entrepreneur_df.shape[0]

0.43427646287872945

In [38]:
5452/entrepreneur_df.shape[0]

0.10508056433582608

In [39]:
d = gender.Detector()

In [40]:
# Create additional checks for gender
additional_female_names = [
    'Alika',
    'Ama',
    'Cibelle',
    'Kimberlina',
    'Leathia',
    'Shiri',
    'Lavena',
    'Tanjila',
    'Holley',
    'Anie',
    'Dionna',
    'Shanel',
    'Lakshya',
    'Shenda',
    'Madelena',
    'Kerranna',
    'Piya',
    'Paria',
    'Ylianna',
    'Ankita',
    'Isha',
    'Gabby',
    'Anjelika',
    'Nitha',
    'Adena',
    'Sumayah',
    'Louisea',
    'Cymphonique',
    'Charlyn',
    'Tari',
    'Graceann',
    'Kalia',
    'Annmarie',
    'Saira',
]
additional_male_names = [
    'Deward',
    'Omkar',
    'Rohit',
    'Mohit',
    'Vik',
    'Vishal',
    'Obed',
    'Ashwin',
    'Ozel',
    'Umed',
    'Demetri',
    'Tripp',
    'Jean-Marc',
    'Laszlo',
    'Allon',
    'Rishi'
    'Francois',
    'Clarkson',
    'Siddharth',
    'Sidharth',
    'Anup',
    'Anoop',
    'Zac',
    'Andras',
    'Abhishek',
    'Dil-Domine',
    'Varun',
    'Abhinav',
    'Chaitanya',
    'Rishi',
    'Raghu',
    'Anurag',
    'Akshay',
    'Gaurav',
    'Kunal',
]

In [41]:
# Create column 'female'
entrepreneur_df['female'] = 0

# Collect indices of female and androgynous names, and mark them as female
female_names = []
andy_names = []
unk_names = []

count_female = 0
count_andy = 0
count_unk = 0

for i in entrepreneur_df.index:
    name = entrepreneur_df.at[i, 'f_name']
    if d.get_gender(name) == 'female':
        female_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_female += 1
    elif d.get_gender(name) == 'mostly_female':
        female_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_female += 1
    elif d.get_gender(name) == 'andy':
        andy_names.append([i, name])
        count_andy += 1
    elif d.get_gender(name) == 'unknown':
        if name in additional_female_names:
            female_names.append([i, name])
            entrepreneur_df.at[i, 'female'] = 1
            count_female += 1
        elif name in additional_male_names:
            pass
        else:
            unk_names.append([i, name])
            count_unk += 1

print(count_female, 'female names.')
print(count_andy, 'androgynous names.')
print(count_unk, 'unknown names.')

5452 female names.
693 androgynous names.
4796 unknown names.


In [42]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",m,,,,Scott,0
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",f,,,,Robin,0
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren,0
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew,0
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Ric,0
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran,0
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil,0
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",f,,,,Suzy,1
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian,0
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",m,,,,Brian,0


In [43]:
entrepreneur_df[entrepreneur_df['gender'] == 'f'].shape

(2386, 16)

In [44]:
entrepreneur_df[entrepreneur_df['female'] == 1].shape

(5452, 16)

In [45]:
entrepreneur_df[(entrepreneur_df['gender'] == 'f') & (entrepreneur_df['female'] == 1)].shape

(1988, 16)

In [46]:
#places where the SS data covered spots that the gender_guesser missed
entrepreneur_df[(entrepreneur_df['gender'] == 'f') & (entrepreneur_df['female'] == 0)].shape

(398, 16)

In [47]:
#places where the SS data assigned male that the gender_guesser assigned female
male_changes = entrepreneur_df[(entrepreneur_df['gender'] == 'm') & (entrepreneur_df['female'] == 1)]
entrepreneur_df[(entrepreneur_df['gender'] == 'm') & (entrepreneur_df['female'] == 1)].shape

(220, 16)

In [48]:
entrepreneur_df[(entrepreneur_df['gender'] == 'Female') & (entrepreneur_df['female'] == 1)].shape

(2663, 16)

In [49]:
entrepreneur_df[(entrepreneur_df['gender'] == 'Female')].shape

(3142, 16)

In [50]:
accuracy = 2656/3142
accuracy

0.8453214513049013

In [51]:
entrepreneur_df[(entrepreneur_df['gender'] == 'i donno') & (entrepreneur_df['female'] == 1)].shape

(134, 16)

In [52]:
andro_id = entrepreneur_df[(entrepreneur_df['gender'] == 'i donno') & (entrepreneur_df['female'] == 1)]

In [53]:
unknowns = entrepreneur_df[(entrepreneur_df['gender'] == 'i donno') & (entrepreneur_df['female'] == 0)]

In [54]:
unknowns_df = unknowns

In [55]:
unk_names

[[2, 'Jiren'],
 [4, 'Ric'],
 [5, 'Kumaran'],
 [13, 'ANTHONY'],
 [24, 'Harini'],
 [26, 'Kestutis'],
 [35, 'Yet-Ming'],
 [47, 'Zor'],
 [55, 'Miller'],
 [59, 'Charles-Albert'],
 [68, 'Tingfan'],
 [76, 'Thabet'],
 [80, 'Norm'],
 [106, 'Wooshik'],
 [108, 'Izzy'],
 [163, 'Jean-Romain'],
 [184, 'Ahryun'],
 [193, 'Prasanna'],
 [194, 'Chandrasekar'],
 [196, 'Carman'],
 [213, 'Mairin'],
 [225, 'Chaitan'],
 [226, 'Laure-Cécile'],
 [243, 'Jan-Hein'],
 [251, 'Nanea'],
 [255, 'Heatherm'],
 [260, 'Iñigo'],
 [266, 'B.'],
 [277, 'Xuejun'],
 [291, 'Abhimanyu'],
 [310, 'MJ'],
 [316, 'Utkarsh'],
 [324, 'Srikanth'],
 [331, 'Jasdeep'],
 [340, 'Dhruv'],
 [350, 'Virasb'],
 [357, 'Naya'],
 [363, 'Subodh'],
 [366, 'Jenova'],
 [401, 'Anisha'],
 [410, 'Yadi'],
 [420, 'Terren'],
 [424, 'HAYDN'],
 [425, 'Branndon'],
 [448, 'Davidra'],
 [453, 'Anukool'],
 [468, 'Pargles'],
 [480, 'Murari'],
 [486, 'Shiliang'],
 [497, 'Sridhar'],
 [515, 'Dheeraj'],
 [522, 'Romanos'],
 [532, 'Zuk'],
 [545, 'CN'],
 [547, 'Faraz'],
 [56

In [56]:
andro_id.shape

(134, 16)

In [57]:
andro_id

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
437,Aviad Cahana,xF Technologies Inc.,b1252d64cad26da0aab88a0131040c6c,xf-technologies-inc,12500000,50.0,4,False,"Albuquerque, NM","Albuquerque, NM",i donno,,,,Aviad,1
1275,Peppi Prasit,Amira Pharmaceuticals,ef44a540055ae94c30e476ca0f187cac,amira-pharmaceuticals,32000000,50.0,0,False,"San Diego, CA","San Diego-Carlsbad-San Marcos, CA",i donno,,,,Peppi,1
1382,Indu Parikh,BioMarck Pharmaceuticals,a7a6069d3c963848c5e3cce520e710cf,biomarck-pharmaceuticals,15517766,10.0,0,False,"Durham, NC","Durham-Chapel Hill, NC",i donno,,,,Indu,1
2689,Kat Vorotova,Try The World,73a6b76d79f4c7d87bcd2d6d0b2882a5,try-the-world,6120106,50.0,0,False,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",i donno,,,,Kat,1
2708,Chitra Kanagaraj,PikMyKid,32cb603d1a5b01e95d684bce48d4f2f9,pikmykid,1750000,50.0,0,False,"Tampa, FL","Tampa-St Petersburg-Clearwater, FL",i donno,,,,Chitra,1
2855,Rotem Shor,Medisafe,f641d8ae98e5011e5c22c669baa29b46,medisafe-project,21500000,100.0,0,False,"Boston, MA","Boston-Cambridge-Newton, MA-NH",i donno,,,,Rotem,1
3430,Suman Kanuganti,Aira Tech Corp,990830059388f60c67a82949c0cc0995,aira,35340000,100.0,1,False,"La Jolla, CA","San Diego-Carlsbad-San Marcos, CA",i donno,,,,Suman,1
3453,Vesa Kupari,Taction Enterprises,68cd30cad9d8ae1152f27ca03e52a8d0,taction-enterprises,95344,10.0,1,False,"Beverly Hills, CA","Los Angeles-Long Beach-Santa Ana, CA",i donno,,,,Vesa,1
4062,Rotem Amar,Webpals Mobile,d84b933c5219e50bcf36e2f50cc40ded,dau-up,0,100.0,3,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Rotem,1
5195,Pelin Kenez,Zeplin,389a85f37a8758b13050d3f1939a9e91,zeplin,1359541,10.0,1,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Pelin,1


In [58]:
male_changes

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
113,Kazumi Shiosaki,Mitobridge,fcb8fa56ff6cc84f0cd64d33d274312f,mitobridge,5222321,50.0,0,False,"Cambridge, MA","Boston-Cambridge-Newton, MA-NH",m,,,,Kazumi,1
191,Devendra Raut,CiiNOW,b1eed4add891fc663df4d96cdb570f1c,ciinow,13000000,50.0,0,False,"Sunnyvale, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Devendra,1
398,Zane Groshelle,Shapr,5c7412e1c80148415fd29ed14ec1d719,shapr,16500000,50.0,0,False,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",m,,,,Zane,1
779,Nate Baker,Qualia,28815540d3fe9540d880ae066215fcd5,qualia-labs-inc,40000000,250.0,0,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",m,,,,Nate,1
830,Prem Thomas,PROVEIT,b3ed3390cd89447ab86b2ed0704947d3,nobel-trivia-llc,2300000,10.0,0,False,"Raleigh, NC","Raleigh, NC",m,,,,Prem,1
1500,Kyle Zhang,SmartX,39f8d5dcc0345df4ba5f4836047741a3,smartx,18500000,100.0,0,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Kyle,1
1739,Eli Portnoy,Thinknear,8907ecf85b3bc94f76a07ca54afc29bf,thinknear,1630000,100.0,0,False,"Culver City, CA","Los Angeles-Long Beach-Santa Ana, CA",m,,,,Eli,1
2053,Bernie Brenner,TrueCar,f955cbc6d3254e9224ddd2174c8adf9b,truecar,340362625,1000.0,0,True,"Santa Monica, CA","Los Angeles-Long Beach-Santa Ana, CA",m,,,,Bernie,1
2486,Yael Peled,Vim,0d9d42788a4251494b7f9ae4a61df8d5,bookmd,34600378,100.0,1,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",m,,,,Yael,1
2780,Kyle Widrick,BVAccel,0a16c3e1e6c4f7af7be9891006b01b75,brand-value-accelerator,200000,250.0,0,False,"San Diego, CA","San Diego-Carlsbad-San Marcos, CA",m,,,,Kyle,1


In [59]:
count_unknown = entrepreneur_df[entrepreneur_df['gender'] == 'i donno'].shape[0]
count_unknown

2945

In [60]:
count_all = entrepreneur_df.shape[0]
count_all

51884

In [61]:
2945/51884

0.05676123660473364

In [62]:
count_female = entrepreneur_df[entrepreneur_df['gender'] == 'Female'].shape[0]
count_female

3142

In [63]:
5528/51884

0.10654537044175469

In [64]:
for i in entrepreneur_df.index:
    #over-ride the gender_guesser with the social security data
    if entrepreneur_df.at[i, 'gender'] == 'f' and entrepreneur_df.at[i, 'female'] == 0:
        entrepreneur_df.at[i, 'gender'] = 'Female'
    if entrepreneur_df.at[i, 'gender'] == 'm' and entrepreneur_df.at[i, 'female'] == 1:
        entrepreneur_df.at[i, 'gender'] = 'Male'

    #assign female if both agree, and male if both agree
    if entrepreneur_df.at[i, 'gender'] == 'f' and entrepreneur_df.at[i, 'female'] == 1:
        entrepreneur_df.at[i, 'gender'] = 'Female'
    if entrepreneur_df.at[i, 'gender'] == 'm' and entrepreneur_df.at[i, 'female'] == 0:
        entrepreneur_df.at[i, 'gender'] = 'Male'

In [65]:
import numpy as np

In [66]:
entrepreneur_df['gender'].value_counts()

Male       43411
Female      5528
i donno     2945
Name: gender, dtype: int64

In [67]:
entrepreneur_df['gender'].value_counts()[1]

5528

In [68]:
fem_ratio_minus_unk = 5528 / (43411 + 5528)
fem_ratio_minus_unk

0.11295694640266454

In [69]:
percent_fem = np.round((count_female / (count_all - count_unknown)), 2)
print(f'{int(percent_fem*100)}% of gender-identified entrepreneurs are female.')

6% of gender-identified entrepreneurs are female.


In [70]:
andy_names

[[18, 'Chin'],
 [65, 'Rusty'],
 [135, 'Amrit'],
 [353, 'Pat'],
 [381, 'Jo'],
 [423, 'Ming'],
 [439, 'Chen'],
 [443, 'Krishna'],
 [466, 'Casey'],
 [738, 'Ashton'],
 [751, 'Ang'],
 [760, 'Heng'],
 [851, 'Tal'],
 [986, 'Yeng'],
 [1236, 'Yu'],
 [1315, 'Chi'],
 [1325, 'Kendall'],
 [1337, 'Ke'],
 [1421, 'Tal'],
 [1521, 'Hai'],
 [1536, 'Peng'],
 [1610, 'Tong'],
 [1732, 'Casey'],
 [1735, 'Tal'],
 [1841, 'Yin'],
 [1852, 'Rusty'],
 [1859, 'Krishna'],
 [1884, 'Kwang'],
 [1903, 'Ren'],
 [2046, 'Cheng'],
 [2117, 'Pat'],
 [2218, 'Sheng'],
 [2260, 'Si'],
 [2480, 'Ping'],
 [2489, 'Shu'],
 [2554, 'Blair'],
 [2609, 'Linh'],
 [2629, 'Kwang'],
 [2636, 'Bing'],
 [2724, 'Chee'],
 [2760, 'Lesley'],
 [2825, 'Ying'],
 [3014, 'Shea'],
 [3031, 'Krishna'],
 [3042, 'Xiao'],
 [3084, 'Shuo'],
 [3141, 'Chiu'],
 [3367, 'Casey'],
 [3511, 'Sri'],
 [3658, 'Wei'],
 [3850, 'Jing'],
 [3922, 'Li'],
 [4020, 'Fei'],
 [4058, 'Reese'],
 [4074, 'Bing'],
 [4103, 'Peyton'],
 [4107, 'Ming'],
 [4117, 'Pat'],
 [4160, 'Jackie'],
 [4461

In [71]:
# Androgynous names
andy_to_fem = np.random.choice(
    [x[0] for x in andy_names],
    int(np.round(percent_fem*len(andy_names))),
    replace=False
)

for i in andy_to_fem:
    entrepreneur_df.at[i, 'gender'] = 'Female'

In [72]:
unk_names

[[2, 'Jiren'],
 [4, 'Ric'],
 [5, 'Kumaran'],
 [13, 'ANTHONY'],
 [24, 'Harini'],
 [26, 'Kestutis'],
 [35, 'Yet-Ming'],
 [47, 'Zor'],
 [55, 'Miller'],
 [59, 'Charles-Albert'],
 [68, 'Tingfan'],
 [76, 'Thabet'],
 [80, 'Norm'],
 [106, 'Wooshik'],
 [108, 'Izzy'],
 [163, 'Jean-Romain'],
 [184, 'Ahryun'],
 [193, 'Prasanna'],
 [194, 'Chandrasekar'],
 [196, 'Carman'],
 [213, 'Mairin'],
 [225, 'Chaitan'],
 [226, 'Laure-Cécile'],
 [243, 'Jan-Hein'],
 [251, 'Nanea'],
 [255, 'Heatherm'],
 [260, 'Iñigo'],
 [266, 'B.'],
 [277, 'Xuejun'],
 [291, 'Abhimanyu'],
 [310, 'MJ'],
 [316, 'Utkarsh'],
 [324, 'Srikanth'],
 [331, 'Jasdeep'],
 [340, 'Dhruv'],
 [350, 'Virasb'],
 [357, 'Naya'],
 [363, 'Subodh'],
 [366, 'Jenova'],
 [401, 'Anisha'],
 [410, 'Yadi'],
 [420, 'Terren'],
 [424, 'HAYDN'],
 [425, 'Branndon'],
 [448, 'Davidra'],
 [453, 'Anukool'],
 [468, 'Pargles'],
 [480, 'Murari'],
 [486, 'Shiliang'],
 [497, 'Sridhar'],
 [515, 'Dheeraj'],
 [522, 'Romanos'],
 [532, 'Zuk'],
 [545, 'CN'],
 [547, 'Faraz'],
 [56

In [73]:
# Unknown names
unk_to_fem = np.random.choice(
    [x[0] for x in unk_names],
    int(np.round(percent_fem*len(unk_names))),
    replace=False
)

for i in unknown.index:
    entrepreneur_df.at[i, 'gender'] = 'Female'

NameError: name 'unknown' is not defined

In [None]:
entrepreneur_df['female'].value_counts()

In [None]:
entrepreneur_df['gender'].value_counts()

In [None]:
unknown = entrepreneur_df[entrepreneur_df['gender'] == 'i donno']

In [None]:
unk_lst = list(unknown.index)

In [None]:
len(unk_lst)

In [None]:
fem_ratio_minus_unk

In [None]:
fem_ratio_minus_unk * len(unk_lst) 

In [None]:
310/(len(unk_lst))

In [None]:
unk_lst

In [None]:
unk_to_fem = np.random.choice(
    [x for x in unk_lst],
    int(np.round(fem_ratio_minus_unk * len(unk_lst))),
    replace=False
)

for i in unk_to_fem:
    entrepreneur_df.at[i, 'gender'] = 'Female'

In [None]:
entrepreneur_df['gender'].value_counts()

In [None]:
male_unk_df = entrepreneur_df[entrepreneur_df['gender'] == 'i donno']

In [None]:
for i in entrepreneur_df.index:
    if entrepreneur_df.at[i, 'gender'] == 'i donno':
        print(entrepreneur_df.loc[i, 'gender'])

In [None]:
entrepreneur_df['gender'].value_counts()

In [None]:
6385/45499

In [None]:
male_unk_df['gender'] = 'Male'

In [None]:
for i in male_unk_df.index

In [None]:
new_fem_ratio = entrepreneur_df['female'].value_counts()[1] / entrepreneur_df.shape[0]
new_fem_ratio_rounded = np.round(entrepreneur_df['female'].value_counts()[1] / entrepreneur_df.shape[0], 2)
print(f'{int(new_fem_ratio_rounded*100)}% of all entrepreneurs are now marked female.')
print(new_fem_ratio)

In [None]:
from guess_indian_gender import IndianGenderPredictor

In [None]:
i = IndianGenderPredictor()

In [None]:
i.predict(name="raghav")

In [None]:
entrepreneur_df['indian_female'] = 0

# Collect indices of female and androgynous names, and mark them as female
female_indian_names = []
andy_indian_names = []
unk_indian_names = []

count_indian_female = 0
count_indian_andy = 0
count_indian_unk = 0

for i in entrepreneur_df.index:
    name = entrepreneur_df.at[i, 'f_name']
    if d.get_gender(name) == 'female':
        female_indian_names.append([i, name])
        entrepreneur_df.at[i, 'indian_female'] = 1
        count_indian_female += 1
    elif d.get_gender(name) == 'mostly_female':
        female_indian_names.append([i, name])
        entrepreneur_df.at[i, 'indian_female'] = 1
        count_indian_female += 1
    elif d.get_gender(name) == 'andy':
        andy_indian_names.append([i, name])
        count_indian_andy += 1
    elif d.get_gender(name) == 'unknown':
        if name in additional_female_names:
            female_indian_names.append([i, name])
            entrepreneur_df.at[i, 'indian_female'] = 1
            count_indian_female += 1
        elif name in additional_male_names:
            pass
        else:
            unk_indian_names.append([i, name])
            count_indian_unk += 1

print(count_indian_female, 'female names.')
print(count_indian_andy, 'androgynous names.')
print(count_indian_unk, 'unknown names.')

In [None]:
entrepreneur_df[(entrepreneur_df['female'] == 1) & (entrepreneur_df['indian_female'] == 1)].shape

In [None]:
entrepreneur_df[(entrepreneur_df['female'] == 0) & (entrepreneur_df['indian_female'] == 1)].shape

In [74]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",Male,,,,Scott,0
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",Female,,,,Robin,0
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren,0
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew,0
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Ric,0
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran,0
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil,0
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",Female,,,,Suzy,1
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian,0
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",Male,,,,Brian,0


In [None]:
# Collect indices of female and androgynous names, and mark them as female
female_indian_names = []
andy_indian_names = []
unk_indian_names = []

count_indian_female = 0
count_indian_andy = 0
count_indian_unk = 0

for i in andro_id.index:
    name = andro_id.at[i, 'f_name']
    if d.get_gender(name) == 'female':
        female_indian_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_indian_female += 1
    elif d.get_gender(name) == 'mostly_female':
        female_indian_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_indian_female += 1
    elif d.get_gender(name) == 'andy':
        andy_indian_names.append([i, name])
        count_indian_andy += 1
    elif d.get_gender(name) == 'unknown':
        if name in additional_female_names:
            female_indian_names.append([i, name])
            entrepreneur_df.at[i, 'female'] = 1
            count_indian_female += 1
        elif name in additional_male_names:
            pass
        else:
            unk_indian_names.append([i, name])
            count_indian_unk += 1

print(count_indian_female, 'female names.')
print(count_indian_andy, 'androgynous names.')
print(count_indian_unk, 'unknown names.')

In [None]:
entrepreneur_df.to_csv(
        'gs://impact-index-shared-resources/data/processed/test.csv',
        index=False
    )

In [None]:
print(df[ df['gender'] == 'i donno'].shape)
df[ df['gender'] == 'i donno']

In [None]:
import gender_guesser.detector as gender

In [None]:
d = gender.Detector()

In [None]:
d.get_gender('Chantal')

In [None]:
df[df.f_name.isnull() == True]

In [None]:
for i in df.index:
    first = df.at[i, 'f_name']
    if 

In [None]:
name_gender = {}

for i in social_security_df.index:
    social_security_name = social_security_df.at[i, 'name']
    social_security_gender = social_security_df.at[i, 'name']
    name_gender[social]
    for j in social_security_df.index:
        
        if name == social_security_name:
            print('true')

In [None]:
project = 'peii-242723'
dataset_id = 'entrepreneur'
table_id = 'Combined_tables_39k'
bucket_name = 'muse-data'

# Setup Storage vars
storage_client = storage.Client(project=project)
bucket = storage_client.get_bucket('muse-data')

# Setup BigQuery vars
bq_client = bigquery.Client()
dataset_ref = bq_client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

#destination
destination_uri = "gs://{}/{}".format(bucket_name, "interim_entrepreneur_data_gender.json")

In [None]:
#set job configuration to extract job as JSON files
job_config = bigquery.ExtractJobConfig()
job_config.destination_format = 'NEWLINE_DELIMITED_JSON'
job_config.write_disposition = 'WRITE_TRUNCATE'

In [None]:
extract_job = bq_client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location="US",
    job_config=job_config
)  # API request
extract_job.result()  # Waits for job to complete.

print(
    "Exported {}:{}.{} to {}".format(project, dataset_id, table_id, destination_uri)
)

In [None]:
df = pd.read_json(destination_uri, lines=True)

In [None]:
df.shape

In [None]:
df

In [None]:
df = df[df.columns[df.isnull().all()]]
df

In [None]:
df['learner'].drop_duplicates()

2734 in 'Learners' SQL

In [None]:
df['last_modified'] = pd.to_datetime(df['last_modified'])
least_recent_date = df['last_modified'].min()
recent_date = df['last_modified'].max()

In [None]:
recent_date

In [None]:
latest = df.loc[df.groupby('name').last_modified.idxmax()]

In [None]:
latest.shape

In [None]:
lastest = df.sort_values('last_modified').groupby('name').tail(1)

In [None]:
lastest.shape

In [None]:
andy_to_fem = np.random.choice(
    [x[0] for x in andy_names],
    int(np.round(percent_fem*len(andy_names))),
    replace=False
)

for i in andy_to_fem:
    entrepreneur_df.at[i, 'gender'] = 'Female'