In [1]:
import pandas as pd
import os

In [2]:
# Path to the tab-separated input file, relative to the notebook's working directory.
filename = '../data/genni-ethnea-authority2009.tsv'

# Read the TSV file into a pandas DataFrame.
# on_bad_lines='warn' makes pandas skip any row with an unexpected number of
# fields and report it, instead of aborting the whole load.
df = pd.read_csv(filename, delimiter='\t', on_bad_lines='warn')

b'Skipping line 1864435: expected 10 fields, saw 13\n'


In [3]:
# Preview the loaded frame (pandas truncates the display to the first and last rows).
df

Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M
2,7867892_1,Leen Hart,ENG,0.980865,Hart,Leen,DUTCH,-,male,F
3,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F
4,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F
...,...,...,...,...,...,...,...,...,...,...
2799447,849591_2,Haddad,ARA,0.927991,Haddad,X,ARAB-FRENCH,-,andy,-
2799448,8683651_5,Haddad,ARA,0.927991,Haddad,X,ARAB-FRENCH,-,andy,-
2799449,8714244_2,monia Haddad,ARA,0.988955,Haddad,monia,ARAB,F,female,F
2799450,9289496_7,Haddad,ARA,0.927991,Haddad,X,ARAB-FRENCH,-,andy,-


In [12]:
# Clean the `firstname` column of `df`, then keep one row per distinct first name.

# Normalise casing: first letter upper-case, the rest lower-case ("monia" -> "Monia").
df = df.assign(firstname=df['firstname'].str.capitalize())

# Keep first names that are 2 to 13 characters long.
# Rows with a missing first name have a NaN length, fail both comparisons and
# are dropped here - this must happen before the regex filter below, which
# cannot negate NaN results.
name_length = df['firstname'].str.len()
df = df.loc[(name_length > 1) & (name_length < 14)]

# Drop first names that contain whitespace or a hyphen.
has_separator = df['firstname'].str.contains(r'[\s-]')
df = df.loc[~has_separator]

# Drop rows without gender info (Genni is "-").
df = df.loc[df['Genni'] != "-"]

# One row per distinct first name; the first occurrence wins and the index is renumbered.
unique_names_df = df.drop_duplicates(subset='firstname', keep='first', ignore_index=True)
unique_names_df


Unnamed: 0,auid,name,EthnicSeer,prop,lastname,firstname,Ethnea,Genni,SexMac,SSNgender
0,9731334_2,Cameron 'Dale' Bass,ITA,0.653567,'Dale' Bass,Cameron,ENGLISH,M,mostly_male,-
1,2155715_1,Bert Hart,ENG,0.772359,Hart,Bert,DUTCH,M,male,M
2,14609221_2,Esther Nolte- Hoen,GER,0.665081,Nolte- Hoen,Esther,GERMAN,F,female,F
3,8101337_1,Ellen 't Hoen,CHI,0.665526,'t Hoen,Ellen,DUTCH,F,female,F
4,9804785_2,Peter 't Hoen,GER,0.330864,'t Hoen,Peter,DUTCH,M,male,M
...,...,...,...,...,...,...,...,...,...,...
35887,18365203_1,Zaiyi Liu,CHI,0.990008,Liu,Zaiyi,CHINESE,M,andy,-
35888,17531377_1,Milady Renata da Silva,ITA,0.627304,Renata da Silva,Milady,HISPANIC,F,andy,F
35889,1887140_4,mariangelica da Silva,ITA,0.292196,da Silva,Mariangelica,HISPANIC,F,andy,-
35890,13975819_1,norbertberto SA,ITA,0.820337,SA,Norbertberto,HISPANIC,M,andy,-


In [13]:
# Count how many unique first names carry each Ethnea label (most frequent first).
ethnea_counts = unique_names_df['Ethnea'].value_counts()
# to_string() prints every label; the default Series repr would truncate the long tail.
print(ethnea_counts.to_string())

ENGLISH                4926
HISPANIC               3752
INDIAN                 3186
ARAB                   2757
JAPANESE               2028
CHINESE                1888
SLAV                   1802
NORDIC                 1697
GERMAN                 1235
FRENCH                 1048
TURKISH                 977
DUTCH                   941
ITALIAN                 907
GREEK                   623
ISRAELI                 619
AFRICAN                 617
KOREAN                  555
THAI                    443
ENGLISH-HISPANIC        297
BALTIC                  261
UNKNOWN                 186
HUNGARIAN               131
ENGLISH-FRENCH          121
GERMAN-ENGLISH          119
ROMANIAN                118
ENGLISH-INDIAN          114
FRENCH-ENGLISH          113
ENGLISH-NORDIC          100
HISPANIC-ENGLISH         97
ENGLISH-GERMAN           94
ENGLISH-ARAB             84
ENGLISH-CHINESE          84
ENGLISH-ITALIAN          79
GERMAN-HISPANIC          78
INDIAN-ENGLISH           73
ARAB-INDIAN         

In [14]:
# Check the balance of Genni gender labels among the unique first names.
unique_names_df['Genni'].value_counts()

F    17966
M    17926
Name: Genni, dtype: int64

In [18]:
import requests
from bs4 import BeautifulSoup
import time

def scrape_ethnicity_table(first_name, last_name):
    """Query the Ethnea search page for a name and record its highest-probF ethnicity.

    The first HTML table in the response is scanned row by row (the first row
    is treated as the header and skipped). For each row the first cell is taken
    as the ethnicity and the fifth cell is parsed as probF; the ethnicity with
    the largest probF strictly greater than 0 wins.

    The result is written into the module-level ``unique_names_df`` on every
    row whose ``firstname`` and ``lastname`` equal the arguments, in the
    columns ``Highest_probF_ethnicity`` and ``Highest_probF_value``. If no row
    has a positive probF, the stored values are ``None`` and ``0.0``. If the
    response contains no table at all, ``unique_names_df`` is left untouched.

    Args:
        first_name: First name sent as the ``Fname`` query parameter.
        last_name: Last name sent as the ``Lname`` query parameter.

    Returns:
        None. The outcome is stored in ``unique_names_df`` as a side effect.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    url = "http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py"
    # Pass the names via `params` so requests URL-encodes them; interpolating
    # them into the URL breaks the query for names containing characters such
    # as '&', '#', '+' or '%'.
    # The timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(
        url,
        params={"Fname": first_name, "Lname": last_name},
        timeout=30,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        # No result table in the page: nothing to record for this name.
        return

    # Find the ethnicity with the highest probF
    max_probF_ethnicity = None
    max_probF_value = 0.0
    for table_row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cells = [cell.text for cell in table_row.find_all('td')]
        if len(cells) < 5:
            # Not a data row (e.g. header-only or spacer row).
            continue
        try:
            probF = float(cells[4])
        except ValueError:
            # Non-numeric probF cell; ignore this row rather than abort.
            continue
        if probF > max_probF_value:
            max_probF_ethnicity = cells[0]
            max_probF_value = probF

    # Update the DataFrame with the highest probF ethnicity and probF value
    name_mask = (unique_names_df['firstname'] == first_name) & (unique_names_df['lastname'] == last_name)
    unique_names_df.loc[name_mask, 'Highest_probF_ethnicity'] = max_probF_ethnicity
    unique_names_df.loc[name_mask, 'Highest_probF_value'] = max_probF_value

# Number of names scraped between two saves to disk.
chunk_size = 1000

# Iterate over the DataFrame in chunks so partial results are written out
# regularly during the long-running scrape (one request per second).
for i in range(0, len(unique_names_df), chunk_size):
    chunk_df = unique_names_df.iloc[i:i+chunk_size]  # Get the current chunk of rows

    # Perform the scraping for each row in the chunk
    for index, row in chunk_df.iterrows():
        first_name = row['firstname']
        last_name = row['lastname']
        try:
            scrape_ethnicity_table(first_name, last_name)
        except (requests.RequestException, AttributeError, IndexError, ValueError) as exc:
            # A single failed lookup (network error, page without the expected
            # table, malformed row) must not abort the whole run. Report it and
            # move on; the name simply keeps empty result columns.
            print(f"Skipping {first_name!r} {last_name!r}: {exc!r}")
        time.sleep(1)  # Delay for 1 second before the next request

    # Generate a unique filename for each chunk
    filename = f"../data/1_first_names_{i//chunk_size}.csv"

    # Save the chunk to a CSV file. Slice unique_names_df again instead of
    # reusing chunk_df: chunk_df was taken before the scraper wrote its result
    # columns, so it may not contain them.
    unique_names_df.iloc[i:i+chunk_size].to_csv(filename, index=False)


In [28]:
import os
import pandas as pd

# Step 1: Define the folder path where the CSV files are located
folder_path = '../data/'

# Step 2: Collect the per-chunk CSV files, named "1_first_names_<n>.csv".
# Requiring <n> to be purely numeric keeps the merged output written below
# ("1_first_names_raw_data.csv", which shares the prefix and the suffix) from
# being read back in as an input when this cell is re-run.
chunk_prefix = '1_first_names_'
chunk_suffix = '.csv'
csv_files = [
    file for file in os.listdir(folder_path)
    if file.startswith(chunk_prefix)
    and file.endswith(chunk_suffix)
    and file[len(chunk_prefix):-len(chunk_suffix)].isdecimal()
]
# Sort by chunk number rather than alphabetically: a plain string sort puts
# chunk 10 before chunk 2 and scrambles the original row order.
csv_files.sort(key=lambda name: int(name[len(chunk_prefix):-len(chunk_suffix)]))

if not csv_files:
    raise ValueError(f"No files named {chunk_prefix}<n>{chunk_suffix} found in {folder_path}")

# Step 3: Merge all CSV files into a single DataFrame.
# The comprehension avoids rebinding the notebook-level name `df`, which the
# earlier cleaning cell reads and reassigns.
df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

merged_df = pd.concat(df_list, ignore_index=True).drop_duplicates()

# Step 4: Save the merged DataFrame to a new CSV file
merged_csv_path = '../data/1_first_names_raw_data.csv'
merged_df.to_csv(merged_csv_path, index=False)

print("Merged CSV file has been created successfully!")


Merged CSV file has been created successfully!
