In [5]:
import pandas as pd
df = pd.read_csv("mercury_craters.csv")
df['cleaned_links'] = df['Link'].str.replace("https://en.wikipedia.org", "", regex=False)
df.to_csv("linkstoUSsites.csv", index=False)  # Save cleaned file
print ("Done")

Done


## Scraping US site

In [17]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re

# Read the CSV file
csv_filename = "linkstoUSsites.csv"
df = pd.read_csv(csv_filename)

# Normalize column names
df.columns = df.columns.str.strip().str.lower()

# Check if 'cleaned_links' column exists
if 'cleaned_links' not in df.columns:
    print("Error: 'cleaned_links' column not found in CSV. Available columns:", df.columns)
    exit()

# Create a new column for extracted ethnicity data
df["ethnicity"] = ""

# Loop through links and scrape data
for index, row in df.iterrows():
    url = row["cleaned_links"]
    if pd.isna(url) or not isinstance(url, str) or not url.startswith("http"):
        continue  # Skip invalid links

    print(f"Processing: {url}")

    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            ### 1️⃣ Extract Ethnicity from the 2nd Table ###
            tables = soup.find_all("table", class_="usa-table")

            if len(tables) >= 2:  # Ensure there are at least two tables
                table = tables[1]  # Select the second table
                rows = table.find_all("tr")

                # Ensure there are at least 4 rows before extracting ethnicity
                if len(rows) >= 4:
                    ethnicity_value = rows[3].find_all("td")[0].get_text(strip=True)  # Extract from 4th row, first <td>
                else:
                    ethnicity_value = "No Data"
            else:
                ethnicity_value = "No Table"

            div_content = soup.find("div", class_="desktop:grid-col-7")
            if div_content:
                div_text = div_content.get_text(" ", strip=True) 
                
                # Use regex to find "Ethnicity" and extract the country name
                match = re.search(r"Ethnicity\s*[:\-\–]?\s*([\w]+\s*[\w]*\s*[\w]*)", div_text)
                if match:
                    ethnicity_value = match.group(1)  # Extract only the country name
                
            print(f"Extracted Ethnicity: {ethnicity_value}")

        else:
            ethnicity_value = "Error"

    except Exception as e:
        print(f"Error processing {url}: {e}")
        ethnicity_value = "Error"

    # Save extracted ethnicity into the DataFrame
    df.at[index, "ethnicity"] = ethnicity_value

    time.sleep(2)  # Delay to avoid overwhelming the server

# Save updated CSV
output_filename = "Ethnicity_Data.csv"
df.to_csv(output_filename, index=False)
print(f"Scraping complete. Data saved to '{output_filename}'")


Processing: https://planetarynames.wr.usgs.gov/Feature/14574
Extracted Ethnicity: Bangladesh Origin Zainul
Processing: https://planetarynames.wr.usgs.gov/Feature/21
Extracted Ethnicity: Syria Origin Arab
Processing: https://planetarynames.wr.usgs.gov/Feature/76
Extracted Ethnicity: Sierra Leone Origin
Processing: https://planetarynames.wr.usgs.gov/Feature/99
Extracted Ethnicity: Sudan Origin Abu
Processing: https://planetarynames.wr.usgs.gov/Feature/14953
Extracted Ethnicity: United States Origin
Processing: https://planetarynames.wr.usgs.gov/Feature/14954
Extracted Ethnicity: Russia Origin Sergey
Processing: https://planetarynames.wr.usgs.gov/Feature/15386
Extracted Ethnicity: Japan Origin Ryunosuke
Processing: https://planetarynames.wr.usgs.gov/Feature/144
Extracted Ethnicity: Arabian Origin Arab
Processing: https://planetarynames.wr.usgs.gov/Feature/176
Extracted Ethnicity: Brazil Origin Jose
Processing: https://planetarynames.wr.usgs.gov/Feature/147
Extracted Ethnicity: Arabian Ori