In [None]:
import pandas as pd
import os

In [None]:
# Path to the tab-separated Genni/Ethnea authority file, relative to this notebook.
filename = '../data/genni-ethnea-authority2009.tsv'

# Load the whole table; malformed lines are reported as warnings instead of
# aborting the read.
df = pd.read_csv(filename, sep='\t', on_bad_lines='warn')

In [None]:
# Preview the authority table exactly as it was loaded, before any cleaning.
df

In [None]:
# Normalise casing: first letter upper-case, the rest lower-case.
df = df.assign(firstname=df['firstname'].str.capitalize())

# Keep only first names that are 2 to 13 characters long ...
name_length = df['firstname'].str.len()
df = df[(name_length > 1) & (name_length < 14)]

# ... and that contain neither whitespace nor a hyphen.
has_separator = df['firstname'].str.contains(r'[\s-]')
df = df[~has_separator]

# Drop rows whose Genni column holds the "-" placeholder (no gender info).
df = df[df['Genni'] != "-"]

# One row per distinct first name: the first occurrence wins and the index is
# renumbered from 0.
unique_names_df = df.drop_duplicates(subset='firstname', keep='first', ignore_index=True)
unique_names_df


In [None]:
import requests
from bs4 import BeautifulSoup
import time

def scrape_ethnicity_table(first_name, last_name):
    """Look up one name on the Ethnea web service and record its highest-probF row.

    Requests the search page for the given first/last name, reads the first
    ``<table>`` in the response and scans its data rows (every ``<tr>`` after
    the first one). In each row the first cell is taken as the ethnicity label
    and the fifth cell as probF. The label and value of the row with the
    largest probF are written to the ``Highest_probF_ethnicity`` and
    ``Highest_probF_value`` columns of the notebook-level ``unique_names_df``
    for every row whose ``firstname`` and ``lastname`` both match.

    Args:
        first_name: Value sent as the ``Fname`` query parameter.
        last_name: Value sent as the ``Lname`` query parameter.

    Returns:
        None. Results are stored in ``unique_names_df``.

    If the request fails or the page contains no table, a message is printed
    and ``unique_names_df`` is left untouched for this name, so a single bad
    lookup does not abort a long scraping run. If a table is present but no
    row has a probF above 0, ``None`` and ``0.0`` are recorded.
    """
    url = "http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py"

    try:
        # `params` lets requests percent-encode the names, so characters such
        # as "&", "#", "+" or spaces cannot corrupt the query string.
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(
            url,
            params={"Fname": first_name, "Lname": last_name},
            timeout=30,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {first_name} {last_name}: {exc}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        print(f"No result table found for {first_name} {last_name}")
        return

    # Extract the table data, skipping the first (header) row
    rows = table.find_all('tr')
    data = [[cell.text for cell in row.find_all('td')] for row in rows[1:]]

    # Find the ethnicity with the highest probF
    max_probF_ethnicity = None
    max_probF_value = 0.0
    for row in data:
        if len(row) < 5:
            # Not a data row (e.g. a spacer or a row with merged cells).
            continue
        try:
            probF = float(row[4])
        except ValueError:
            # Non-numeric probF cell; ignore this row.
            continue
        if probF > max_probF_value:
            max_probF_ethnicity = row[0]
            max_probF_value = probF

    # Update the DataFrame with the highest probF ethnicity and probF value
    is_this_name = (
        (unique_names_df['firstname'] == first_name)
        & (unique_names_df['lastname'] == last_name)
    )
    unique_names_df.loc[is_this_name, 'Highest_probF_ethnicity'] = max_probF_ethnicity
    unique_names_df.loc[is_this_name, 'Highest_probF_value'] = max_probF_value

# Number of names scraped between two checkpoint files.
chunk_size = 1000

# Walk through the unique names in consecutive slices of `chunk_size` rows and
# write each slice to its own numbered CSV once it has been scraped.
for chunk_number, start in enumerate(range(0, len(unique_names_df), chunk_size)):
    stop = start + chunk_size
    chunk_df = unique_names_df.iloc[start:stop]

    for first_name, last_name in zip(chunk_df['firstname'], chunk_df['lastname']):
        scrape_ethnicity_table(first_name, last_name)
        time.sleep(1)  # Wait one second between consecutive requests

    filename = f"../data/1_first_names_{chunk_number}.csv"

    # Slice `unique_names_df` again rather than reusing `chunk_df`: the scraper
    # writes its results into `unique_names_df`.
    unique_names_df.iloc[start:stop].to_csv(filename, index=False)


In [None]:
import os
import pandas as pd

folder_path = '../data/'
chunk_prefix = '1_first_names_'
chunk_suffix = '.csv'


def _chunk_id(file_name):
    """Return the part of ``file_name`` between the chunk prefix and suffix."""
    return file_name[len(chunk_prefix):-len(chunk_suffix)]


# Only pick up the numbered chunk files (1_first_names_<N>.csv). Requiring a
# numeric id keeps this cell's own output, 1_first_names_raw_data.csv, from
# being read back in when the cell is run again.
csv_files = [
    file for file in os.listdir(folder_path)
    if file.startswith(chunk_prefix)
    and file.endswith(chunk_suffix)
    and _chunk_id(file).isdecimal()
]
# Sort by chunk number; a plain alphabetical sort would put chunk 10 before chunk 2.
csv_files.sort(key=lambda file: int(_chunk_id(file)))

if not csv_files:
    raise FileNotFoundError(
        f"No files named {chunk_prefix}<N>{chunk_suffix} found in {folder_path}"
    )

# Read every chunk into its own frame (without reusing the notebook-level `df`).
df_list = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Stack the chunks and drop rows that are exact duplicates of an earlier row.
merged_df = pd.concat(df_list, ignore_index=True).drop_duplicates()

merged_csv_path = '../data/1_first_names_raw_data.csv'
merged_df.to_csv(merged_csv_path, index=False)

print("Merged CSV file has been created")
