In [7]:
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
import time

async def main():
    # Load CSV
    df = pd.read_csv("lei_info/bloomberg_scraped_structured.csv")

    # Ensure 'Legal Name' column exists
    if 'Legal Name' not in df.columns:
        raise ValueError("Missing 'Legal Name' column in CSV.")

    # Find rows missing legal names but with valid URL
    missing_df = df[df['Legal Name'].isna() & df['url'].notna()].copy()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        for idx, row in missing_df.iterrows():
            url = row['url']
            try:
                print(f"Scraping: {url}")
                await page.goto(url, timeout=60000)
                await page.wait_for_selector('[data-testid="h1-legalname"]', timeout=15000)
                legal_name_elem = await page.query_selector('[data-testid="h1-legalname"]')
                legal_name = await legal_name_elem.inner_text()
                df.loc[idx, 'Legal Name'] = legal_name
            except Exception as e:
                print(f"❌ Failed to scrape {url}: {e}")
                continue
            await asyncio.sleep(1)

        await browser.close()

    # Save updated file
    df.to_csv("lei_info/bloomberg_scraped_structured_filled.csv", index=False)
    print("✅ Updated CSV saved!")

# If you're in Jupyter or an async environment, run this:
await main()

# Otherwise, in a regular Python script use:
# asyncio.run(main())


Scraping: https://lei.bloomberg.com/leis/view/549300OYN1WD4APNOU82
Scraping: https://lei.bloomberg.com/leis/view/5493004UNJQCCNMHSR95
Scraping: https://lei.bloomberg.com/leis/view/894500SYP6ZV71JDZH10
Scraping: https://lei.bloomberg.com/leis/view/549300BDZCCADL2HG292
Scraping: https://lei.bloomberg.com/leis/view/5493007FRLGNQEXCGS66
Scraping: https://lei.bloomberg.com/leis/view/549300VHITS0GXJRXV39
Scraping: https://lei.bloomberg.com/leis/view/984500FC8569569B5B24
Scraping: https://lei.bloomberg.com/leis/view/549300PDELOGUKYYDA02
Scraping: https://lei.bloomberg.com/leis/view/549300D5ZV9HW2DYO911
Scraping: https://lei.bloomberg.com/leis/view/984500T0AY1EOD7AFB91
Scraping: https://lei.bloomberg.com/leis/view/54930051L9BKHSHDSD90
Scraping: https://lei.bloomberg.com/leis/view/5493006P5IHSK77SZ855
Scraping: https://lei.bloomberg.com/leis/view/549300D2IR01BV6NPA49
Scraping: https://lei.bloomberg.com/leis/view/549300B934MYWT57IX38
Scraping: https://lei.bloomberg.com/leis/view/1TMVIO1SD0RLIPEI

In [27]:
import pandas as pd

# Load the full data
df = pd.read_csv("lei_info/bloomberg_scraped_structured_interim.csv")

# Clean column names
df.columns = df.columns.str.strip()

# Export only rows with missing Legal Name (and valid URL, if exists)
if "Legal Name" not in df.columns:
    raise ValueError("Missing 'Legal Name' column")

missing = df[df["Legal Name"].isna()]
missing.to_csv("lei_info/rows_missing_legal_name.csv", index=False)
print(f"🔎 Exported {len(missing)} rows with missing legal names.")


🔎 Exported 4909 rows with missing legal names.


In [28]:
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
import time

async def main():
    # Load rows with missing legal name only
    df = pd.read_csv("lei_info/rows_missing_legal_name.csv")

    if "url" not in df.columns or "lei_number" not in df.columns:
        raise ValueError("Missing required columns ('url', 'lei_number')")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        for idx, row in df.iterrows():
            url = row.get('url')
            lei = row.get('lei_number')
            if not isinstance(url, str) or not url.startswith("http"):
                print(f"⚠️ Skipping invalid URL for {lei}")
                continue
            try:
                print(f"Scraping: {url}")
                await page.goto(url, timeout=60000)
                await page.wait_for_selector('[data-testid="h1-legalname"]', timeout=15000)
                elem = await page.query_selector('[data-testid="h1-legalname"]')
                if elem:
                    df.loc[idx, "Legal Name"] = await elem.inner_text()
            except Exception as e:
                print(f"❌ Failed to scrape {lei}: {e}")
            await asyncio.sleep(1)

        await browser.close()

    # Save filled file
    df.to_csv("lei_info/rows_missing_legal_name_filled.csv", index=False)
    print("✅ Finished: rows_missing_legal_name_filled.csv")

# Run in Jupyter or async:
await main()
# Or in script:
# asyncio.run(main())


Scraping: https://lei.bloomberg.com/leis/view/549300DD4R4SYK5RAQ92


  df.loc[idx, "Legal Name"] = await elem.inner_text()


Scraping: https://lei.bloomberg.com/leis/view/2549002H3CEW0748X068
Scraping: https://lei.bloomberg.com/leis/view/254900II3J6G6WWO1O83
Scraping: https://lei.bloomberg.com/leis/view/RVDPPPGHCGZ40J4VQ731
Scraping: https://lei.bloomberg.com/leis/view/549300RP4LE08QWSLQ50
Scraping: https://lei.bloomberg.com/leis/view/549300214PKB2Y1ZWH75
Scraping: https://lei.bloomberg.com/leis/view/254900QQIJN4KR6ZBW98
Scraping: https://lei.bloomberg.com/leis/view/B4TYDEB6GKMZO031MB27
Scraping: https://lei.bloomberg.com/leis/view/549300MSETJUOU1OY757
Scraping: https://lei.bloomberg.com/leis/view/549300V2YLC1I721HE07
Scraping: https://lei.bloomberg.com/leis/view/549300ZW58TM1KU8CK35
Scraping: https://lei.bloomberg.com/leis/view/7H6GLXDRUGQFU57RNE97
Scraping: https://lei.bloomberg.com/leis/view/5493005ES4J6H2VR7264
Scraping: https://lei.bloomberg.com/leis/view/984500U2F9A83N391A91
Scraping: https://lei.bloomberg.com/leis/view/549300038578ZJ284Y20
Scraping: https://lei.bloomberg.com/leis/view/549300DUTN4RUBI8

CancelledError: 

In [30]:
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
import time

INTERIM_SAVE_EVERY = 20
INPUT_PATH = "bloomberg_entity_classified.csv"
INTERIM_PATH = "bloomberg_entity_classified_interim.csv"
FINAL_PATH = "bloomberg_entity_classified_filled.csv"

async def main():
    df = pd.read_csv(INPUT_PATH)

    # Ensure required columns exist
    if "url" not in df.columns or "lei_number" not in df.columns:
        raise ValueError("Missing 'url' or 'lei_number' column")
    if "Legal Name" not in df.columns:
        df["Legal Name"] = None

    # Find only rows missing Legal Name
    missing_mask = df["Legal Name"].isna()
    rows_to_scrape = df[missing_mask & df["url"].notna()].copy()

    print(f"🔍 Found {len(rows_to_scrape)} rows to scrape.")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        for i, (idx, row) in enumerate(rows_to_scrape.iterrows()):
            url = row["url"]
            lei = row["lei_number"]

            try:
                print(f"[{i+1}] Scraping: {url}")
                await page.goto(url, timeout=60000)
                await page.wait_for_selector('[data-testid="h1-legalname"]', timeout=15000)
                elem = await page.query_selector('[data-testid="h1-legalname"]')
                if elem:
                    legal_name = await elem.inner_text()
                    df.at[idx, "Legal Name"] = legal_name
            except Exception as e:
                print(f"❌ Failed for {lei}: {e}")
            await asyncio.sleep(1)

            if (i + 1) % INTERIM_SAVE_EVERY == 0:
                df.to_csv(INTERIM_PATH, index=False)
                print(f"💾 Interim save after {i+1} rows.")

        await browser.close()

    df.to_csv(FINAL_PATH, index=False)
    print(f"✅ Final saved to {FINAL_PATH}")

# For Jupyter
await main()

# In script: asyncio.run(main())


🔍 Found 4909 rows to scrape.
[1] Scraping: https://lei.bloomberg.com/leis/view/549300DD4R4SYK5RAQ92
[2] Scraping: https://lei.bloomberg.com/leis/view/2549002H3CEW0748X068
[3] Scraping: https://lei.bloomberg.com/leis/view/254900II3J6G6WWO1O83
[4] Scraping: https://lei.bloomberg.com/leis/view/RVDPPPGHCGZ40J4VQ731
[5] Scraping: https://lei.bloomberg.com/leis/view/549300RP4LE08QWSLQ50
[6] Scraping: https://lei.bloomberg.com/leis/view/549300214PKB2Y1ZWH75
[7] Scraping: https://lei.bloomberg.com/leis/view/254900QQIJN4KR6ZBW98
[8] Scraping: https://lei.bloomberg.com/leis/view/B4TYDEB6GKMZO031MB27
[9] Scraping: https://lei.bloomberg.com/leis/view/549300MSETJUOU1OY757
[10] Scraping: https://lei.bloomberg.com/leis/view/549300V2YLC1I721HE07
[11] Scraping: https://lei.bloomberg.com/leis/view/549300ZW58TM1KU8CK35
[12] Scraping: https://lei.bloomberg.com/leis/view/7H6GLXDRUGQFU57RNE97
[13] Scraping: https://lei.bloomberg.com/leis/view/5493005ES4J6H2VR7264
[14] Scraping: https://lei.bloomberg.com/lei