In [1]:
import json
import pandas as pd
import time
import gcsfs
import gender_guesser.detector as gender
from datetime import date

In [2]:
from google.cloud import storage 
from google.cloud import bigquery as bq
from pathlib import Path
import os

In [3]:
PROJ_ROOT = Path().resolve().parent
KEYS_DIR = PROJ_ROOT / 'keys' 
keys = KEYS_DIR / 'Keys for Big Query Storage Admin - PEII.json'

In [4]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(keys)

In [5]:
today = date.today()

#override for debugging
today = '2019-11-06'

uri = 'gs://impact-index-shared-resources/data/interim/{}/interim_entrepreneur_data.csv'.format(today)

entrepreneur_df = pd.read_csv(uri)

In [6]:
client = bq.Client()

#gets only unique names that are female, and aggregates the record number for each name
f_names_query="""
#standardSQL
SELECT name, SUM(number) as records 
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = 'F'
GROUP BY name 
ORDER BY records DESC
"""

job_config = bq.QueryJobConfig()
now = time.time()
query_job = client.query(f_names_query, location = 'US')
res = query_job.result()
print('query took:', round(time.time()-now,2), 's')

query took: 1.99 s


In [7]:
now = time.time()
f_names_df=res.to_dataframe()
print('load to pandas from BQ took:', round(time.time()-now, 2),'s')

load to pandas from BQ took: 0.77 s


In [8]:
#takes only the female names and turns it into a list
f_names = f_names_df['name'].to_list()

for i in range(len(f_names)):
    f_names[i] =  f_names[i].lower()

print(len(f_names))
f_names

20852


['mary',
 'patricia',
 'elizabeth',
 'jennifer',
 'linda',
 'barbara',
 'margaret',
 'susan',
 'dorothy',
 'jessica',
 'sarah',
 'nancy',
 'betty',
 'karen',
 'lisa',
 'helen',
 'sandra',
 'ashley',
 'kimberly',
 'emily',
 'donna',
 'carol',
 'michelle',
 'amanda',
 'melissa',
 'laura',
 'anna',
 'stephanie',
 'deborah',
 'rebecca',
 'ruth',
 'sharon',
 'cynthia',
 'kathleen',
 'amy',
 'shirley',
 'angela',
 'virginia',
 'catherine',
 'katherine',
 'brenda',
 'emma',
 'pamela',
 'nicole',
 'christine',
 'samantha',
 'rachel',
 'janet',
 'carolyn',
 'debra',
 'evelyn',
 'maria',
 'frances',
 'heather',
 'diane',
 'julie',
 'joyce',
 'martha',
 'alice',
 'victoria',
 'joan',
 'christina',
 'kelly',
 'lauren',
 'marie',
 'ann',
 'doris',
 'judith',
 'olivia',
 'jean',
 'cheryl',
 'megan',
 'kathryn',
 'andrea',
 'grace',
 'rose',
 'hannah',
 'jacqueline',
 'julia',
 'sara',
 'gloria',
 'teresa',
 'janice',
 'mildred',
 'theresa',
 'madison',
 'judy',
 'lillian',
 'beverly',
 'denise',
 'm

In [9]:
#creates a separate list for M names because it is faster to use SQL than iterate through df

m_names_query="""
#standardSQL
SELECT name, SUM(number) as records 
FROM `bigquery-public-data.usa_names.usa_1910_current`
WHERE gender = 'M'
GROUP BY name 
ORDER BY records DESC
"""

job_config = bq.QueryJobConfig()
now = time.time()
query_job = client.query(m_names_query,location = 'US')
res = query_job.result()
print('query took:', round(time.time()-now,2), 's')

query took: 1.36 s


In [10]:
now = time.time()
m_names_df = res.to_dataframe()
print('load to pandas from BQ took:', round(time.time()-now,2),'s')

load to pandas from BQ took: 0.7 s


In [11]:
#takes only the male names and turns it into a list
m_names = m_names_df['name'].to_list()
for i in range(len(m_names)):
    m_names[i] =  m_names[i].lower()
    
print(len(m_names))
m_names

13785


['james',
 'john',
 'robert',
 'michael',
 'william',
 'david',
 'richard',
 'joseph',
 'charles',
 'thomas',
 'christopher',
 'daniel',
 'matthew',
 'anthony',
 'donald',
 'paul',
 'mark',
 'george',
 'steven',
 'andrew',
 'kenneth',
 'edward',
 'joshua',
 'kevin',
 'brian',
 'ronald',
 'timothy',
 'jason',
 'jeffrey',
 'ryan',
 'jacob',
 'gary',
 'nicholas',
 'eric',
 'stephen',
 'jonathan',
 'frank',
 'larry',
 'justin',
 'scott',
 'brandon',
 'raymond',
 'samuel',
 'benjamin',
 'gregory',
 'jack',
 'patrick',
 'alexander',
 'henry',
 'dennis',
 'jerry',
 'tyler',
 'aaron',
 'walter',
 'peter',
 'jose',
 'douglas',
 'adam',
 'nathan',
 'zachary',
 'harold',
 'arthur',
 'carl',
 'kyle',
 'albert',
 'gerald',
 'lawrence',
 'roger',
 'jeremy',
 'keith',
 'joe',
 'ethan',
 'terry',
 'christian',
 'sean',
 'willie',
 'austin',
 'jesse',
 'ralph',
 'noah',
 'billy',
 'bruce',
 'bryan',
 'roy',
 'jordan',
 'dylan',
 'louis',
 'eugene',
 'harry',
 'wayne',
 'alan',
 'russell',
 'juan',
 'ga

In [12]:
andro_names = (set(m_names).intersection(f_names))

print(len(andro_names))
andro_names

3042


{'cartier',
 'jaiden',
 'gene',
 'arlyn',
 'najee',
 'alexa',
 'mark',
 'brooklynn',
 'daryan',
 'hugh',
 'blake',
 'michel',
 'larue',
 'andres',
 'preston',
 'kaylan',
 'april',
 'monica',
 'lian',
 'levon',
 'marrion',
 'avery',
 'kathryn',
 'holly',
 'trace',
 'angelita',
 'wren',
 'tamara',
 'gayle',
 'ozie',
 'audra',
 'harbor',
 'paxtyn',
 'karel',
 'tammy',
 'carmine',
 'arlee',
 'val',
 'kainoa',
 'conner',
 'nikia',
 'arley',
 'marley',
 'sierra',
 'hong',
 'halley',
 'aydan',
 'remi',
 'virgie',
 'brion',
 'kao',
 'ana',
 'denali',
 'hildred',
 'leshawn',
 'kazi',
 'marcus',
 'braylen',
 'ava',
 'vicky',
 'deandrea',
 'bernell',
 'kayron',
 'dessie',
 'addie',
 'murrell',
 'rain',
 'amarii',
 'rowan',
 'coby',
 'ainsley',
 'carolyn',
 'devan',
 'barrie',
 'san',
 'latasha',
 'rachel',
 'zia',
 'loy',
 'hester',
 'hamdi',
 'marcy',
 'lavelle',
 'kassandra',
 'evann',
 'niko',
 'carl',
 'will',
 'emery',
 'roni',
 'yan',
 'collins',
 'jamal',
 'august',
 'stacey',
 'alesha',
 

In [13]:
#female names only names not in male list
f_names = set(f_names) - set(m_names)
len(f_names)

17810

In [14]:
#male names not in female list
m_names = set(m_names) - set(f_names)

In [15]:
andro_names = list(andro_names)
f_names = list(f_names)
m_names = list(m_names)

In [16]:
m_names_df = m_names_df.set_index('name')

In [17]:
f_names_df = f_names_df.set_index('name')

In [42]:
count_m = 0
count_f = 0
count_na = 0
femme = []
masc = []

for name in andro_names:
    name = name.capitalize()
    male_count = m_names_df.loc[name]['records']
    female_count = f_names_df.loc[name]['records']
    total = male_count + female_count
    percentage_male = round(male_count / total, 2)
    percentage_female = round(female_count / total, 2)
    if percentage_male >= 0.75:
        gender_assign = 'M'
        count_m += 1
        masc.append(name.lower())
    elif percentage_female >= 0.75:
        gender_assign = 'F'
        count_f += 1
        femme.append(name.lower())
    else:
        gender_assign = 'N/A'
        count_na += 1
        
    x = 10 - len(str(male_count + female_count))
    y = 10 - len(name)
    print(name, y*' ', male_count, female_count, x*' ', ' | ', percentage_male, percentage_female, ' | ', gender_assign)
    
print('count M', count_m)
print('count_F', count_f)
print('count_na', count_na)
    
    
#     if male_count > female_count:
#         m_names.append(name)
#     if female_count > male_count:
#         f_names.append(name)

Cartier     542 5          |  0.99 0.01  |  M
Jaiden      27795 7004        |  0.8 0.2  |  M
Gene        121876 3430       |  0.97 0.03  |  M
Arlyn       635 655         |  0.49 0.51  |  N/A
Najee       1534 55         |  0.97 0.03  |  M
Alexa       20 123671       |  0.0 1.0  |  F
Mark        1347680 1837      |  1.0 0.0  |  M
Brooklynn   6 30560        |  0.0 1.0  |  F
Daryan      32 6           |  0.84 0.16  |  M
Hugh        70278 6        |  1.0 0.0  |  M
Blake       172330 8382       |  0.95 0.05  |  M
Michel      5762 1289         |  0.82 0.18  |  M
Larue       254 1915         |  0.12 0.88  |  F
Andres      81179 122        |  1.0 0.0  |  M
Preston     108463 18       |  1.0 0.0  |  M
Kaylan      17 2576         |  0.01 0.99  |  F
April       52 236721       |  0.0 1.0  |  F
Monica      367 244894       |  0.0 1.0  |  F
Lian        845 221         |  0.79 0.21  |  M
Levon       1729 16         |  0.99 0.01  |  M
Marrion     5 5           |  0.5 0.5  |  N/A
Avery       51136 1239

Tiana       9 24436        |  0.0 1.0  |  F
Jodie       946 23393        |  0.04 0.96  |  F
Briana      25 91794        |  0.0 1.0  |  F
Kaoru       119 28          |  0.81 0.19  |  M
Kris        10874 11020        |  0.5 0.5  |  N/A
Venice      5 475          |  0.01 0.99  |  F
Memphis     4864 413         |  0.92 0.08  |  M
Kaitlin     6 55909        |  0.0 1.0  |  F
Danell      5 156          |  0.03 0.97  |  F
Gustavo     34969 29        |  1.0 0.0  |  M
Jacy        12 1053         |  0.01 0.99  |  F
Ezra        46587 1139        |  0.98 0.02  |  M
Kamari      5957 2871         |  0.67 0.33  |  N/A
Torrey      2508 249         |  0.91 0.09  |  M
Adison      34 1967         |  0.02 0.98  |  F
Jordan      371032 130045       |  0.74 0.26  |  N/A
Conley      969 28          |  0.97 0.03  |  M
Fay         1355 24310        |  0.05 0.95  |  F
Daylen      1739 5         |  1.0 0.0  |  M
Chai        55 11           |  0.83 0.17  |  M
Cher        37 577          |  0.06 0.94  |  F
Caitlin 

Beth        15 154945       |  0.0 1.0  |  F
Talyn       64 91          |  0.41 0.59  |  N/A
Regina      26 170834       |  0.0 1.0  |  F
Esperanza   5 16377        |  0.0 1.0  |  F
Osiris      913 345         |  0.73 0.27  |  N/A
Ann         109 450384       |  0.0 1.0  |  F
Naveen      255 52          |  0.83 0.17  |  M
Verlon      262 30          |  0.9 0.1  |  M
Myka        5 737          |  0.01 0.99  |  F
Mildred     120 394861       |  0.0 1.0  |  F
Hudsyn      5 39           |  0.11 0.89  |  F
Darien      7476 581         |  0.93 0.07  |  M
Patty       22 59346        |  0.0 1.0  |  F
Maeson      44 5           |  0.9 0.1  |  M
Isabelle    7 83764        |  0.0 1.0  |  F
Thanh       378 183          |  0.67 0.33  |  N/A
Jamari      10632 125        |  0.99 0.01  |  M
Alpha       477 2920         |  0.14 0.86  |  F
Rian        2024 1120         |  0.64 0.36  |  N/A
Lenny       4104 16         |  1.0 0.0  |  M
Eastyn      145 47          |  0.76 0.24  |  M
Remy        4904 3163  

Nicola      1036 2223         |  0.32 0.68  |  N/A
Elizabeth   1634 1537684      |  0.0 1.0  |  F
Olie        6 6           |  0.5 0.5  |  N/A
Teegan      377 505          |  0.43 0.57  |  N/A
Dru         363 165          |  0.69 0.31  |  N/A
Brittany    410 357306       |  0.0 1.0  |  F
Kaya        102 6856         |  0.01 0.99  |  F
Jeffrey     973514 1378       |  1.0 0.0  |  M
Toi         15 313          |  0.05 0.95  |  F
Brittani    8 10352        |  0.0 1.0  |  F
Emersyn     5 7936         |  0.0 1.0  |  F
Norman      243136 72       |  1.0 0.0  |  M
Tom         125494 10       |  1.0 0.0  |  M
Sincere     5909 327         |  0.95 0.05  |  M
Kinsley     5 25297        |  0.0 1.0  |  F
Dereon      752 154          |  0.83 0.17  |  M
Laine       74 890          |  0.08 0.92  |  F
Lawrence    441700 102       |  1.0 0.0  |  M
Ottie       10 89           |  0.1 0.9  |  F
Jane        42 358170       |  0.0 1.0  |  F
Elian       5166 5         |  1.0 0.0  |  M
Rhys        6790 31     

Unkown      49 41           |  0.54 0.46  |  N/A
Laverne     10354 39426        |  0.21 0.79  |  F
Teagen      62 46          |  0.57 0.43  |  N/A
Jaskirat    6 5           |  0.55 0.45  |  N/A
Jona        10 109          |  0.08 0.92  |  F
Tory        3860 1096         |  0.78 0.22  |  M
Henley      215 3316         |  0.06 0.94  |  F
Delma       432 4595         |  0.09 0.91  |  F
Grey        1896 126         |  0.94 0.06  |  M
Sivan       11 172          |  0.06 0.94  |  F
Kirin       76 24          |  0.76 0.24  |  M
Zen         442 16          |  0.97 0.03  |  M
Yang        40 12           |  0.77 0.23  |  M
Keylen      5 47           |  0.1 0.9  |  F
Corbyn      1088 6         |  0.99 0.01  |  M
Luis        270048 690       |  1.0 0.0  |  M
Autumn      24 122181       |  0.0 1.0  |  F
Paris       4865 26475        |  0.16 0.84  |  F
Austin      407233 2379       |  0.99 0.01  |  M
Vernice     33 3672         |  0.01 0.99  |  F
Gagandeep   57 15           |  0.79 0.21  |  M
Rashaw

Yvonne      16 150867       |  0.0 1.0  |  F
Ronda       46 31727        |  0.0 1.0  |  F
Nour        37 1287         |  0.03 0.97  |  F
Latrell     1968 48         |  0.98 0.02  |  M
Trinidad    3997 2992         |  0.57 0.43  |  N/A
Destin      3357 64         |  0.98 0.02  |  M
Dell        655 399         |  0.62 0.38  |  N/A
Mika        121 2916         |  0.04 0.96  |  F
Dolores     565 206084       |  0.0 1.0  |  F
Zander      20712 5        |  1.0 0.0  |  M
Walker      20112 12        |  1.0 0.0  |  M
Madison     3215 380503       |  0.01 0.99  |  F
Ingrid      5 21502        |  0.0 1.0  |  F
Earle       6373 5         |  1.0 0.0  |  M
Aster       5 20           |  0.2 0.8  |  F
Daisy       159 118034       |  0.0 1.0  |  F
Rande       21 5           |  0.81 0.19  |  M
Mykah       149 226          |  0.4 0.6  |  N/A
Unnamed     267 217          |  0.55 0.45  |  N/A
Okie        21 11           |  0.66 0.34  |  N/A
Goldie      5 22686        |  0.0 1.0  |  F
Kristopher  59684 11  

Salvador    43522 86        |  1.0 0.0  |  M
Damaris     10 9354         |  0.0 1.0  |  F
Indie       5 724          |  0.01 0.99  |  F
Yessenia    5 5763         |  0.0 1.0  |  F
Edwin       229023 98       |  1.0 0.0  |  M
Noe         18979 64        |  1.0 0.0  |  M
Michele     948 220823       |  0.0 1.0  |  F
Korey       8193 109         |  0.99 0.01  |  M
Syncere     1043 20         |  0.98 0.02  |  M
Amori       5 17           |  0.23 0.77  |  F
Ary         11 5           |  0.69 0.31  |  N/A
Catalina    6 16951        |  0.0 1.0  |  F
Adrian      232741 11276       |  0.95 0.05  |  M
Reiley      5 21           |  0.19 0.81  |  F
Connor      209167 565       |  1.0 0.0  |  M
Marin       119 3298         |  0.03 0.97  |  F
Ryder       43459 436        |  0.99 0.01  |  M
Shanon      392 2590         |  0.13 0.87  |  F
Linden      434 188          |  0.7 0.3  |  N/A
Lashon      86 674          |  0.11 0.89  |  F
Darrian     1435 345         |  0.81 0.19  |  M
Victor      310563 419

Wynn        180 41          |  0.81 0.19  |  M
Tobie       10 89           |  0.1 0.9  |  F
Otha        3845 333         |  0.92 0.08  |  M
Paul        1363248 1959      |  1.0 0.0  |  M
Hien        42 59          |  0.42 0.58  |  N/A
Sandra      776 871636       |  0.0 1.0  |  F
Ester       138 10157        |  0.01 0.99  |  F
Kaiden      36701 547        |  0.99 0.01  |  M
Joud        20 16           |  0.56 0.44  |  N/A
Alexie      16 452          |  0.03 0.97  |  F
Meng        357 6          |  0.98 0.02  |  M
Itzel       6 15510        |  0.0 1.0  |  F
Pablo       41687 7        |  1.0 0.0  |  M
Marquelle   5 5           |  0.5 0.5  |  N/A
Merle       25291 10031        |  0.72 0.28  |  N/A
Ramsey      2302 343         |  0.87 0.13  |  M
Tandy       5 141          |  0.03 0.97  |  F
Tatum       3891 18176        |  0.18 0.82  |  F
Brian       1164234 2324      |  1.0 0.0  |  M
Rashida     6 1879         |  0.0 1.0  |  F
Verlyn      466 10          |  0.98 0.02  |  M
Krishna     753

Lucia       5 37398        |  0.0 1.0  |  F
Alante      107 5          |  0.96 0.04  |  M
Pearly      6 367          |  0.02 0.98  |  F
Jada        22 62550        |  0.0 1.0  |  F
Babygirl    6 1070         |  0.01 0.99  |  F
Anne        48 306106       |  0.0 1.0  |  F
Parker      100004 15391       |  0.87 0.13  |  M
Kyndall     5 4194         |  0.0 1.0  |  F
Bentlee     2565 189         |  0.93 0.07  |  M
Azriel      774 10          |  0.99 0.01  |  M
Jess        16253 5        |  1.0 0.0  |  M
Mckenna     10 36388        |  0.0 1.0  |  F
Aly         98 303          |  0.24 0.76  |  F
Yeng        243 29          |  0.89 0.11  |  M
Bawi        67 5           |  0.93 0.07  |  M
Ikea        7 252          |  0.03 0.97  |  F
Deundra     5 10           |  0.33 0.67  |  N/A
Posey       36 28           |  0.56 0.44  |  N/A
Page        133 849          |  0.14 0.86  |  F
Regan       839 12670        |  0.06 0.94  |  F
Kendall     29237 56967        |  0.34 0.66  |  N/A
Kobe        15685 7

Sora        8 385          |  0.02 0.98  |  F
Kelcey      31 670          |  0.04 0.96  |  F
Dewey       26806 5        |  1.0 0.0  |  M
Norvell     26 16           |  0.62 0.38  |  N/A
Adler       1025 15         |  0.99 0.01  |  M
Josue       48045 35        |  1.0 0.0  |  M
Deavion     11 58           |  0.16 0.84  |  F
Liam        211240 16       |  1.0 0.0  |  M
Darrien     1376 19         |  0.99 0.01  |  M
Macy        13 27070        |  0.0 1.0  |  F
Garnell     5 7           |  0.42 0.58  |  N/A
Jazmin      79 39825        |  0.0 1.0  |  F
Braden      42311 17        |  1.0 0.0  |  M
Sara        151 413765       |  0.0 1.0  |  F
Royal       8304 1681         |  0.83 0.17  |  M
Derrick     140629 105       |  1.0 0.0  |  M
Evelyn      190 545790       |  0.0 1.0  |  F
Sally       10 195005       |  0.0 1.0  |  F
Akira       790 4633         |  0.15 0.85  |  F
Samari      68 376          |  0.15 0.85  |  F
Diamond     657 30937        |  0.02 0.98  |  F
Rosie       31 66485      

Graycen     15 48           |  0.24 0.76  |  F
Adriane     8 3295         |  0.0 1.0  |  F
Jessiah     647 12          |  0.98 0.02  |  M
Avion       879 19          |  0.98 0.02  |  M
Phillip     303468 81       |  1.0 0.0  |  M
Santiago    59050 5        |  1.0 0.0  |  M
Bao         175 387          |  0.31 0.69  |  N/A
Jaziah      314 189          |  0.62 0.38  |  N/A
Nikola      1236 15         |  0.99 0.01  |  M
Bela        5 82           |  0.06 0.94  |  F
Rodolfo     29876 13        |  1.0 0.0  |  M
Mae         11 77332        |  0.0 1.0  |  F
Kayle       5 2002         |  0.0 1.0  |  F
Jovan       7385 352         |  0.95 0.05  |  M
Li          12 38           |  0.24 0.76  |  F
Ocie        1083 1937         |  0.36 0.64  |  N/A
Alissa      5 35269        |  0.0 1.0  |  F
Jonell      5 562          |  0.01 0.99  |  F
Hendrix     4795 95         |  0.98 0.02  |  M
Vola        12 48           |  0.2 0.8  |  F
Elisabeth   7 39501        |  0.0 1.0  |  F
Henri       1336 31        

Darion      4799 111         |  0.98 0.02  |  M
Ronit       169 84          |  0.67 0.33  |  N/A
Aidan       108871 1259       |  0.99 0.01  |  M
Zyan        253 5          |  0.98 0.02  |  M
Mackenzie   5442 126541       |  0.04 0.96  |  F
Redell      11 5           |  0.69 0.31  |  N/A
Jalin       867 10          |  0.99 0.01  |  M
Amauri      709 44          |  0.94 0.06  |  M
Shine       6 5           |  0.55 0.45  |  N/A
Dung        166 87          |  0.66 0.34  |  N/A
Landis      120 11          |  0.92 0.08  |  M
Ramon       69400 51        |  1.0 0.0  |  M
Patsy       6479 115344       |  0.05 0.95  |  F
Joseph      2522812 5625      |  1.0 0.0  |  M
Claudia     233 120462       |  0.0 1.0  |  F
Yi          25 27           |  0.48 0.52  |  N/A
Tyree       10899 16        |  1.0 0.0  |  M
Daris       14 10           |  0.58 0.42  |  N/A
Zeppelin    199 5          |  0.98 0.02  |  M
Azari       38 109          |  0.26 0.74  |  N/A
Jaidyn      2095 4407         |  0.32 0.68  |  N/

Kamauri     190 21          |  0.9 0.1  |  M
Morgan      36589 213105       |  0.15 0.85  |  F
Andra       885 2041         |  0.3 0.7  |  N/A
Lori        90 338788       |  0.0 1.0  |  F
Cecilia     19 88397        |  0.0 1.0  |  F
Kawena      15 62           |  0.19 0.81  |  F
Amir        30971 5        |  1.0 0.0  |  M
Kallen      664 5          |  0.99 0.01  |  M
Jere        1738 96         |  0.95 0.05  |  M
Montie      157 27          |  0.85 0.15  |  M
Larkin      148 377          |  0.28 0.72  |  N/A
Rasheen     732 10          |  0.99 0.01  |  M
Gianni      8791 1073         |  0.89 0.11  |  M
Lexie       6 11630        |  0.0 1.0  |  F
Shavon      21 2425         |  0.01 0.99  |  F
Jacky       3897 81         |  0.98 0.02  |  M
Abriel      43 15           |  0.74 0.26  |  N/A
Taylan      59 6           |  0.91 0.09  |  M
Cleo        7442 22745        |  0.25 0.75  |  F
Haylee      8 27094        |  0.0 1.0  |  F
Rilyn       5 245          |  0.02 0.98  |  F
Reed        25192 

Sherry      16 227004       |  0.0 1.0  |  F
Burnice     318 336          |  0.49 0.51  |  N/A
Gerardo     49134 97        |  1.0 0.0  |  M
Aman        990 10         |  0.99 0.01  |  M
Matisse     22 23           |  0.49 0.51  |  N/A
Raquel      37 45772        |  0.0 1.0  |  F
Amara       18 11973        |  0.0 1.0  |  F
Kathleen    296 704489       |  0.0 1.0  |  F
Indigo      107 815          |  0.12 0.88  |  F
Jaycee      216 7564         |  0.03 0.97  |  F
Adell       150 3253         |  0.04 0.96  |  F
Shelley     115 69879        |  0.0 1.0  |  F
Kinnick     327 5          |  0.98 0.02  |  M
Corie       21 1323         |  0.02 0.98  |  F
Darby       409 4433         |  0.08 0.92  |  F
Seng        93 20          |  0.82 0.18  |  M
Amere       262 5          |  0.98 0.02  |  M
Lacey       43 49272        |  0.0 1.0  |  F
Alec        48316 17        |  1.0 0.0  |  M
Lemmie      26 5           |  0.84 0.16  |  M
Lucas       250302 44       |  1.0 0.0  |  M
Hailey      27 159584    

Tracy       56511 247858       |  0.19 0.81  |  F
Susana      25 17284        |  0.0 1.0  |  F
Lavell      902 10          |  0.99 0.01  |  M
Mandy       6 36914        |  0.0 1.0  |  F
Caden       59831 202        |  1.0 0.0  |  M
Everett     98280 39        |  1.0 0.0  |  M
Caleb       280881 53       |  1.0 0.0  |  M
Ossie       201 1384         |  0.13 0.87  |  F
Joanne      18 208091       |  0.0 1.0  |  F
Charley     12757 5951        |  0.68 0.32  |  N/A
Whitley     32 4352         |  0.01 0.99  |  F
Hallie      120 22798        |  0.01 0.99  |  F
James       4997327 18257      |  1.0 0.0  |  M
Kaegan      21 26           |  0.45 0.55  |  N/A
Khamani     249 43          |  0.85 0.15  |  M
Maxie       2136 1827         |  0.54 0.46  |  N/A
Rene        44691 17522        |  0.72 0.28  |  N/A
Consuelo    39 13923        |  0.0 1.0  |  F
Matilde     148 1612         |  0.08 0.92  |  F
Kamdyn      1728 686         |  0.72 0.28  |  N/A
Jaelyn      344 11642        |  0.03 0.97  |  F
T

In [19]:
len(set(f_names) - set(femme))

17810

In [43]:
femme

['alexa',
 'brooklynn',
 'larue',
 'kaylan',
 'april',
 'monica',
 'kathryn',
 'holly',
 'angelita',
 'wren',
 'tamara',
 'gayle',
 'ozie',
 'audra',
 'harbor',
 'paxtyn',
 'karel',
 'tammy',
 'nikia',
 'marley',
 'sierra',
 'halley',
 'remi',
 'virgie',
 'ana',
 'denali',
 'hildred',
 'ava',
 'vicky',
 'deandrea',
 'dessie',
 'addie',
 'rain',
 'ainsley',
 'carolyn',
 'san',
 'latasha',
 'rachel',
 'zia',
 'hester',
 'hamdi',
 'marcy',
 'kassandra',
 'roni',
 'collins',
 'stacey',
 'alesha',
 'gabriele',
 'jill',
 'jersey',
 'myrle',
 'eunice',
 'kary',
 'gay',
 'bellamy',
 'marilyn',
 'christin',
 'pressley',
 'emily',
 'felice',
 'mavis',
 'my',
 'janelle',
 'kelli',
 'darlene',
 'latisha',
 'kamalani',
 'sequoyah',
 'kiana',
 'ellie',
 'bertha',
 'serenity',
 'presley',
 'ariel',
 'dominga',
 'caprice',
 'jaye',
 'tiana',
 'jodie',
 'briana',
 'venice',
 'kaitlin',
 'danell',
 'jacy',
 'adison',
 'fay',
 'cher',
 'caitlin',
 'lan',
 'giselle',
 'catlin',
 'beatriz',
 'thelma',
 'ky

In [44]:
len(f_names + femme)
f_names = f_names + femme

In [45]:
len(f_names)

20314

In [46]:
masc

['cartier',
 'jaiden',
 'gene',
 'najee',
 'mark',
 'daryan',
 'hugh',
 'blake',
 'michel',
 'andres',
 'preston',
 'lian',
 'levon',
 'trace',
 'carmine',
 'val',
 'kainoa',
 'conner',
 'arley',
 'aydan',
 'brion',
 'leshawn',
 'kazi',
 'marcus',
 'braylen',
 'bernell',
 'murrell',
 'amarii',
 'coby',
 'devan',
 'loy',
 'niko',
 'carl',
 'will',
 'yan',
 'jamal',
 'august',
 'rodney',
 'layne',
 'quin',
 'lior',
 'brennen',
 'calvin',
 'isaac',
 'jonte',
 'rayshawn',
 'codie',
 'kayden',
 'giovanni',
 'levern',
 'sawyer',
 'nathan',
 'andrew',
 'trystin',
 'davon',
 'quinton',
 'meyer',
 'brentley',
 'terrell',
 'sylvester',
 'clair',
 'von',
 'max',
 'graysen',
 'rashaun',
 'sol',
 'jaydin',
 'donnell',
 'randall',
 'jaeden',
 'dillon',
 'brantley',
 'kaoru',
 'memphis',
 'gustavo',
 'ezra',
 'torrey',
 'conley',
 'daylen',
 'chai',
 'benjamin',
 'dan',
 'shia',
 'milton',
 'ari',
 'arlis',
 'leyton',
 'craig',
 'chaz',
 'cedar',
 'enrique',
 'deandre',
 'kyle',
 'leo',
 'michael',
 

In [47]:
m_names = m_names + masc

In [48]:
andro_ = set(andro_names) - set(masc)
andro_ = set(andro_) - set(femme)

In [49]:
len(andro_)

518

In [50]:
andro_

{'abrar',
 'abriel',
 'adama',
 'addis',
 'adi',
 'afnan',
 'aiman',
 'aimar',
 'albie',
 'aldean',
 'alexiz',
 'alexx',
 'alik',
 'allyn',
 'altair',
 'alva',
 'amandeep',
 'amaree',
 'amari',
 'amel',
 'amen',
 'amil',
 'amrit',
 'an',
 'anay',
 'andra',
 'andree',
 'angel',
 'anmol',
 'ara',
 'ardell',
 'arden',
 'arie',
 'aries',
 'arin',
 'aris',
 'ariyan',
 'arlee',
 'arlin',
 'arlyn',
 'armani',
 'arnell',
 'arshdeep',
 'arshia',
 'artha',
 'artie',
 'arvie',
 'ary',
 'ascension',
 'ashten',
 'ashtin',
 'asuncion',
 'audie',
 'austyn',
 'aven',
 'avery',
 'ayomide',
 'azari',
 'azariah',
 'aziah',
 'baby',
 'bao',
 'barrie',
 'bayler',
 'bentlie',
 'bergen',
 'blair',
 'bora',
 'bowie',
 'braylyn',
 'breslin',
 'briar',
 'brighton',
 'brit',
 'britt',
 'brittan',
 'britten',
 'burnice',
 'caelin',
 'camari',
 'camdyn',
 'cameran',
 'campbell',
 'carey',
 'carlin',
 'carmon',
 'carrol',
 'casey',
 'cashmere',
 'chan',
 'channing',
 'charleston',
 'charley',
 'charly',
 'chayanne'

In [55]:
#creating the first name column
for i in entrepreneur_df.index:
    entrepreneur_df.at[i, 'f_name'] = entrepreneur_df.at[i, 'name'].split(' ')[0]

In [56]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",,,,,Scott
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",,,,,Robin
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",,,,,Ric
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",,,,,Suzy
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",,,,,Brian


In [58]:
entrepreneur_df.shape

(51884, 15)

In [57]:
null = entrepreneur_df[pd.isnull(entrepreneur_df['gender'])]
null.shape

(25477, 15)

In [97]:
'Alexa' in list(f_names)

True

In [63]:
len(andro_)

518

In [None]:
#our bread and butter, the part that does it all...

In [62]:
f_count = 0
m_count = 0
a_count = 0
i_count = 0

for i in null.index:
    name = null.at[i, 'f_name'].lower()
    
    if name in f_names:
        entrepreneur_df.at[i, 'gender'] = 'f'
        f_count += 1
        
    elif name in m_names:
        entrepreneur_df.at[i, 'gender'] = 'm'
        m_count += 1
        
    elif name in andro_:
        entrepreneur_df.at[i, 'gender'] = 'andro'
        a_count += 1
        
    else:
        entrepreneur_df.at[i, 'gender'] = 'i donno'
        i_count += 1
        
print(f_count, m_count, a_count, i_count)

2386 20146 0 2945


In [65]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",m,,,,Scott
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",f,,,,Robin
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Ric
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",f,,,,Suzy
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",m,,,,Brian


In [66]:
2386/20146

0.11843542142360766

In [70]:
d = gender.Detector()

In [71]:
# Create column 'female'
entrepreneur_df['female'] = 0

# Collect indices of female and androgynous names, and mark them as female
female_names = []
andy_names = []
unk_names = []

count_female = 0
count_andy = 0
count_unk = 0

for i in entrepreneur_df.index:
    name = entrepreneur_df.at[i, 'f_name']
    if d.get_gender(name) == 'female':
        female_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_female += 1
    elif d.get_gender(name) == 'mostly_female':
        female_names.append([i, name])
        entrepreneur_df.at[i, 'female'] = 1
        count_female += 1
    elif d.get_gender(name) == 'andy':
        andy_names.append([i, name])
        count_andy += 1
    elif d.get_gender(name) == 'unknown':
        #if name in additional_female_names:
#             female_names.append([i, name])
#             df.at[i, 'female'] = 1
#             count_female += 1
#         #elif name in additional_male_names:
# #             pass
#         else:
#             unk_names.append([i, name])
            count_unk += 1

print(count_female, 'female names.')
print(count_andy, 'androgynous names.')
print(count_unk, 'unknown names.')

5439 female names.
693 androgynous names.
5040 unknown names.


In [72]:
entrepreneur_df

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
0,Scott Brown,ColdSpark,226a627bc92415995985cbd94743276b,coldspark,6500000,1000.0,0,False,"Broomfield, CO","Denver-Aurora-Lakewood, CO",m,,,,Scott,0
1,Robin Horwitz,Convo Communications,7d0675cefac592615e1cbb6c29fd403a,convo-communications,500000,100.0,0,False,"Austin, TX","Austin-Round Rock, TX",f,,,,Robin,0
2,Jiren Parikh,SnapOne,ed5f8110a213395ececfe94660c0f602,snapone-inc,0,100.0,1,False,"Princeton, NJ","Trenton-Ewing, NJ",Male,,,,Jiren,0
3,Matthew Slipper,Symphony Communication Services,5f89826c5031a1932f27525b505b0a7f,symphony-3,461000000,500.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Matthew,0
4,Ric Zhou,Kika Tech,437121710de6c1d5e8f8ebe555749fa8,kika-tech,63000000,500.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",m,,,,Ric,0
5,Kumaran Thillainadarajah,Smart Skin Technologies,5974376ec194ff0e4322caded8d049d5,smart-skin-technologies,11545000,50.0,2,False,"New Brunswick, NJ","New York-Northern New Jersey-Long Island, NY-N...",Male,,,,Kumaran,0
6,Sunil Agrawal,Armor5,3fca9b4b44b3ed61719f56c19af7387d,armor5,2000000,50.0,7,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",Male,,,,Sunil,0
7,Suzy Batiz,Poo~Pourri,3ac4350d0bd841a98beb24341e43a63f,poo-pourri,0,50.0,1,False,"Addison, TX","Dallas-Fort Worth-Arlington, TX",f,,,,Suzy,1
8,Brian Hastings,Gearbox Express,d4ed79c1d290badf8165832a8eed0d41,gearbox-express,2465000,50.0,0,False,"Mukwonago, WI","Milwaukee-Waukesha-West Allis, WI",Male,,,,Brian,0
9,Brian Petersen,Podimetrics,8cded10bbd3be6f33dcf72367e5fb604,podimetrics,15900000,50.0,5,False,"Somerville, MA","Boston-Cambridge-Newton, MA-NH",m,,,,Brian,0


In [75]:
entrepreneur_df[entrepreneur_df['gender'] == 'f'].shape

(2386, 16)

In [76]:
entrepreneur_df[entrepreneur_df['female'] == 1].shape

(5439, 16)

In [79]:
entrepreneur_df[(entrepreneur_df['gender'] == 'f') & (entrepreneur_df['female'] == 1)].shape

(1984, 16)

In [80]:
#places where the SS data covered spots that the gender_guesser missed
entrepreneur_df[(entrepreneur_df['gender'] == 'f') & (entrepreneur_df['female'] == 0)].shape

(402, 16)

In [81]:
#places where the SS data assigned male that the gender_guesser assigned female
entrepreneur_df[(entrepreneur_df['gender'] == 'm') & (entrepreneur_df['female'] == 1)].shape

(220, 16)

In [82]:
entrepreneur_df[(entrepreneur_df['gender'] == 'Female') & (entrepreneur_df['female'] == 1)].shape

(2656, 16)

In [83]:
entrepreneur_df[(entrepreneur_df['gender'] == 'Female')].shape

(3142, 16)

In [85]:
accuracy = 2656/3142
accuracy

0.8453214513049013

In [89]:
entrepreneur_df[(entrepreneur_df['gender'] == 'i donno') & (entrepreneur_df['female'] == 1)].shape

(132, 16)

In [90]:
andro_id = entrepreneur_df[(entrepreneur_df['gender'] == 'i donno') & (entrepreneur_df['female'] == 1)]

In [91]:
andro_id

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
437,Aviad Cahana,xF Technologies Inc.,b1252d64cad26da0aab88a0131040c6c,xf-technologies-inc,12500000,50.0,4,False,"Albuquerque, NM","Albuquerque, NM",i donno,,,,Aviad,1
1275,Peppi Prasit,Amira Pharmaceuticals,ef44a540055ae94c30e476ca0f187cac,amira-pharmaceuticals,32000000,50.0,0,False,"San Diego, CA","San Diego-Carlsbad-San Marcos, CA",i donno,,,,Peppi,1
1382,Indu Parikh,BioMarck Pharmaceuticals,a7a6069d3c963848c5e3cce520e710cf,biomarck-pharmaceuticals,15517766,10.0,0,False,"Durham, NC","Durham-Chapel Hill, NC",i donno,,,,Indu,1
2689,Kat Vorotova,Try The World,73a6b76d79f4c7d87bcd2d6d0b2882a5,try-the-world,6120106,50.0,0,False,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",i donno,,,,Kat,1
2708,Chitra Kanagaraj,PikMyKid,32cb603d1a5b01e95d684bce48d4f2f9,pikmykid,1750000,50.0,0,False,"Tampa, FL","Tampa-St Petersburg-Clearwater, FL",i donno,,,,Chitra,1
2855,Rotem Shor,Medisafe,f641d8ae98e5011e5c22c669baa29b46,medisafe-project,21500000,100.0,0,False,"Boston, MA","Boston-Cambridge-Newton, MA-NH",i donno,,,,Rotem,1
3430,Suman Kanuganti,Aira Tech Corp,990830059388f60c67a82949c0cc0995,aira,35340000,100.0,1,False,"La Jolla, CA","San Diego-Carlsbad-San Marcos, CA",i donno,,,,Suman,1
3453,Vesa Kupari,Taction Enterprises,68cd30cad9d8ae1152f27ca03e52a8d0,taction-enterprises,95344,10.0,1,False,"Beverly Hills, CA","Los Angeles-Long Beach-Santa Ana, CA",i donno,,,,Vesa,1
4062,Rotem Amar,Webpals Mobile,d84b933c5219e50bcf36e2f50cc40ded,dau-up,0,100.0,3,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Rotem,1
5195,Pelin Kenez,Zeplin,389a85f37a8758b13050d3f1939a9e91,zeplin,1359541,10.0,1,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Pelin,1


In [87]:
entrepreneur_df[entrepreneur_df['gender'] == 'i donno']

Unnamed: 0,name,company_name,crunchbase_uuid,crunchbase_permalink,funding,jobs_created,patents,ipo,city_and_state,metro,gender,lgbtq,race_ethnicity,startout_id,f_name,female
18,Chin Beckmann,DSP Concepts,deb1ab07d9a538dcdeeaf8f32f018d34,dsp-concepts,10000000,50.0,0,False,"Santa Clara, CA","San Jose-Sunnyvale-Santa Clara, CA",i donno,,,,Chin,0
59,Charles-Albert Gorra,Trendlee,3d69dea93303ddecc5fb4213e92d9219,trendlee,29775093,50.0,0,False,"New York, NY","New York-Newark-Jersey City, NY-NJ-PA",i donno,,,,Charles-Albert,0
68,Tingfan Wu,Umbo Computer Vision,4070b2d86ee801dc443c0c36883f1e10,umbo-computer-vision,17600000,50.0,0,False,"San Francisco, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Tingfan,0
106,Wooshik Jung,"Stratio, Inc.",cdc2715a0fb79c82079f247ef732749e,stratio,4295000,50.0,0,False,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",i donno,,,,Wooshik,0
160,Ofer Klein,kwik,5455829eba3e88aa2a2c185045d70e27,kwik,3120000,50.0,1,False,"Palo Alto, CA","San Jose-Sunnyvale-Santa Clara, CA",i donno,,,,Ofer,0
161,Umesh Maheshwari,Nimble Storage,856fe87b5f5da5ce99814eb1a0f9a080,nimble-storage,98751230,5000.0,36,True,"San Jose, CA","San Jose-Sunnyvale-Santa Clara, CA",i donno,,,,Umesh,0
193,Prasanna Raghavendra,CloudMunch,4616010dedb90e16451b2139727c86df,cloudmunch,3400000,50.0,0,False,"Bellevue, WA","Seattle-Tacoma-Bellevue, WA",i donno,,,,Prasanna,0
225,Chaitan S. Khosla,Sitari Pharmaceuticals,97dc4266c35276394702635f5d357725,sitari-pharmaceuticals,10000000,,0,False,"San Diego, CA","San Diego-Carlsbad-San Marcos, CA",i donno,,,,Chaitan,0
226,Laure-Cécile Lafond-Fenonjoie,WhatsMode,9512e7107c8645b8b67186a1d33368cd,whatsmode,5000000,1000.0,0,False,"West Hollywood, CA","Los Angeles-Long Beach-Santa Ana, CA",i donno,,,,Laure-Cécile,0
234,Farzad (Zod) Nazem,The HAND Foundation,7388caf8f093247326baebcbf47df3ad,the-hand-foundation,0,10.0,1,False,"Redwood City, CA","San Francisco-Oakland-Hayward, CA",i donno,,,,Farzad,0


In [None]:
print(df[ df['gender'] == 'i donno'].shape)
df[ df['gender'] == 'i donno']

In [None]:
import gender_guesser.detector as gender

In [None]:
d = gender.Detector()

In [None]:
d.get_gender('Chantal')

In [None]:
df[df.f_name.isnull() == True]

In [None]:
for i in df.index:
    first = df.at[i, 'f_name']
    if 

In [None]:
name_gender = {}

for i in social_security_df.index:
    social_security_name = social_security_df.at[i, 'name']
    social_security_gender = social_security_df.at[i, 'name']
    name_gender[social]
    for j in social_security_df.index:
        
        if name == social_security_name:
            print('true')

In [None]:
project = 'peii-242723'
dataset_id = 'entrepreneur'
table_id = 'Combined_tables_39k'
bucket_name = 'muse-data'

# Setup Storage vars
storage_client = storage.Client(project=project)
bucket = storage_client.get_bucket('muse-data')

# Setup BigQuery vars
bq_client = bigquery.Client()
dataset_ref = bq_client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

#destination
destination_uri = "gs://{}/{}".format(bucket_name, "interim_entrepreneur_data_gender.json")

In [None]:
#set job configuration to extract job as JSON files
job_config = bigquery.ExtractJobConfig()
job_config.destination_format = 'NEWLINE_DELIMITED_JSON'
job_config.write_disposition = 'WRITE_TRUNCATE'

In [None]:
extract_job = bq_client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location="US",
    job_config=job_config
)  # API request
extract_job.result()  # Waits for job to complete.

print(
    "Exported {}:{}.{} to {}".format(project, dataset_id, table_id, destination_uri)
)

In [None]:
df = pd.read_json(destination_uri, lines=True)

In [None]:
df.shape

In [None]:
df

In [None]:
df = df[df.columns[df.isnull().all()]]
df

In [None]:
df['learner'].drop_duplicates()

2734 in 'Learners' SQL

In [None]:
df['last_modified'] = pd.to_datetime(df['last_modified'])
least_recent_date = df['last_modified'].min()
recent_date = df['last_modified'].max()

In [None]:
recent_date

In [None]:
latest = df.loc[df.groupby('name').last_modified.idxmax()]

In [None]:
latest.shape

In [None]:
lastest = df.sort_values('last_modified').groupby('name').tail(1)

In [None]:
lastest.shape