In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "Harris_CivilHearingsFilings_Jan22Dec25_CLEAN.csv"
output_file = "Harris_CivilHearingsFilings_Jan22Dec25_GRANULARv2.csv"

chunk_size = 1000000
sleep_seconds = 0.001

df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type"],
    dtype={"Case Number": "string", "court_number": "string", "Case Type": "string"},
    low_memory=False
)

df_cases = df_cases.dropna(subset=["Case Number"])
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()

cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {
        "Case Number": case_number,
        "court_number": court_number,
        "Case Type": case_type,
        "Plaintiff": "",
        "Defendant": "",
        "Receiver": ""
    }

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")

        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None

        if not (role_span and name_span):
            continue

        role_value = role_span.get_text(strip=True)
        name_value = name_span.get_text(strip=True)

        if role_value == "Plaintiff" and not result["Plaintiff"]:
            result["Plaintiff"] = name_value
        elif role_value == "Defendant" and not result["Defendant"]:
            result["Defendant"] = name_value
        elif role_value == "Receiver" and not result["Receiver"]:
            result["Receiver"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")

        result[f"Hearing {i} Description"] = (
            desc.find_next_sibling("span").get_text(strip=True)
            if desc and desc.find_next_sibling("span") else ""
        )
        result[f"Hearing {i} Date"] = (
            date.find_next_sibling("span").get_text(strip=True)
            if date and date.find_next_sibling("span") else ""
        )

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")

        result[f"Event {i} Description"] = (
            ev.find_next_sibling("span").get_text(strip=True)
            if ev and ev.find_next_sibling("span") else ""
        )
        result[f"Event {i} Date"] = (
            dt.find_next_sibling("span").get_text(strip=True)
            if dt and dt.find_next_sibling("span") else ""
        )

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []
    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique cases into '{output_file}'")


Chunk 1:   0%|                                                                 | 44/636793 [00:34<136:42:48,  1.29it/s]


KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "Harris_CivilHearingsFilings_Jan22Dec25_CLEAN.csv"
output_file = "Harris_HearingsFilings_Jan22Dec25_GRANULARv2_DEBTCLAIM.csv"

chunk_size = 1_000_000
sleep_seconds = 0.001

# -----------------------------
# Load + filter FIRST (Debt Claim only)
# -----------------------------
df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type"],
    dtype={"Case Number": "string", "court_number": "string", "Case Type": "string"},
    low_memory=False
)

df_cases["Case Type"] = df_cases["Case Type"].astype("string").str.strip()
df_cases = df_cases.loc[df_cases["Case Type"].eq("Debt Claim")].copy()

df_cases = df_cases.dropna(subset=["Case Number"])
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()

cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

# Fresh output each run
if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {
        "Case Number": case_number,
        "court_number": court_number,
        "Case Type": case_type,
        "Plaintiff": "",
        "Defendant": "",
        "Receiver": ""
    }

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")

        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None

        if not (role_span and name_span):
            continue

        role_value = role_span.get_text(strip=True)
        name_value = name_span.get_text(strip=True)

        if role_value == "Plaintiff" and not result["Plaintiff"]:
            result["Plaintiff"] = name_value
        elif role_value == "Defendant" and not result["Defendant"]:
            result["Defendant"] = name_value
        elif role_value == "Receiver" and not result["Receiver"]:
            result["Receiver"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")

        result[f"Hearing {i} Description"] = (
            desc.find_next_sibling("span").get_text(strip=True)
            if desc and desc.find_next_sibling("span") else ""
        )
        result[f"Hearing {i} Date"] = (
            date.find_next_sibling("span").get_text(strip=True)
            if date and date.find_next_sibling("span") else ""
        )

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")

        result[f"Event {i} Description"] = (
            ev.find_next_sibling("span").get_text(strip=True)
            if ev and ev.find_next_sibling("span") else ""
        )
        result[f"Event {i} Date"] = (
            dt.find_next_sibling("span").get_text(strip=True)
            if dt and dt.find_next_sibling("span") else ""
        )

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique Debt Claim cases into '{output_file}'")


Chunk 1:  24%|██████████████▍                                             | 68728/284846 [17:34:19<45:18:22,  1.33it/s]

In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "Harris_CivilHearingsFilings_Jan22Dec25_CLEAN.csv"
output_file = "Harris_CivilHearingsFilings_Jan22Dec25_GRANULARv2_DEBTCLAIM_2023.csv"

chunk_size = 1_000_000
sleep_seconds = 0.001

# -----------------------------
# Load + filter FIRST (Debt Claim + Case File Date in 2023 only)
# -----------------------------
df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type", "Case File Date"],
    dtype={
        "Case Number": "string",
        "court_number": "string",
        "Case Type": "string",
        "Case File Date": "string",
    },
    low_memory=False
)

# Clean strings
df_cases["Case Type"] = df_cases["Case Type"].astype("string").str.strip()
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()

# Parse Case File Date and filter to calendar year 2023
df_cases["Case File Date"] = pd.to_datetime(df_cases["Case File Date"], errors="coerce")
df_cases = df_cases.dropna(subset=["Case Number", "Case File Date"])

df_cases = df_cases.loc[
    (df_cases["Case Type"].eq("Debt Claim")) &
    (df_cases["Case File Date"].dt.year.eq(2023))
].copy()

# De-dupe after filtering
cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

# Fresh output each run
if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {
        "Case Number": case_number,
        "court_number": court_number,
        "Case Type": case_type,
        "Plaintiff": "",
        "Defendant": "",
        "Receiver": ""
    }

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")

        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None
        if not (role_span and name_span):
            continue

        role_value = role_span.get_text(strip=True)
        name_value = name_span.get_text(strip=True)

        if role_value == "Plaintiff" and not result["Plaintiff"]:
            result["Plaintiff"] = name_value
        elif role_value == "Defendant" and not result["Defendant"]:
            result["Defendant"] = name_value
        elif role_value == "Receiver" and not result["Receiver"]:
            result["Receiver"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")

        result[f"Hearing {i} Description"] = (
            desc.find_next_sibling("span").get_text(strip=True)
            if desc and desc.find_next_sibling("span") else ""
        )
        result[f"Hearing {i} Date"] = (
            date.find_next_sibling("span").get_text(strip=True)
            if date and date.find_next_sibling("span") else ""
        )

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")

        result[f"Event {i} Description"] = (
            ev.find_next_sibling("span").get_text(strip=True)
            if ev and ev.find_next_sibling("span") else ""
        )
        result[f"Event {i} Date"] = (
            dt.find_next_sibling("span").get_text(strip=True)
            if dt and dt.find_next_sibling("span") else ""
        )

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique Debt Claim cases filed in 2023 into '{output_file}'")


Chunk 1:   2%|█▌                                                               | 1365/55648 [18:28<12:14:27,  1.23it/s]


KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "JP12_CivilHearingsFilings_Jan15Dec25_CLEAN.csv"
output_file = "JP12_CivilHearingsFilings_Jan15Dec25_GRANULAR.csv"

chunk_size = 1000000
sleep_seconds = 0.5

df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type"],
    dtype={"Case Number": "string", "court_number": "string", "Case Type": "string"},
    low_memory=False
)

df_cases = df_cases.dropna(subset=["Case Number"])
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()

cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {
        "Case Number": case_number,
        "court_number": court_number,
        "Case Type": case_type,
        "Plaintiff": "",
        "Defendant": "",
        "Receiver": ""
    }

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")

        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None

        if not (role_span and name_span):
            continue

        role_value = role_span.get_text(strip=True)
        name_value = name_span.get_text(strip=True)

        if role_value == "Plaintiff" and not result["Plaintiff"]:
            result["Plaintiff"] = name_value
        elif role_value == "Defendant" and not result["Defendant"]:
            result["Defendant"] = name_value
        elif role_value == "Receiver" and not result["Receiver"]:
            result["Receiver"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")

        result[f"Hearing {i} Description"] = (
            desc.find_next_sibling("span").get_text(strip=True)
            if desc and desc.find_next_sibling("span") else ""
        )
        result[f"Hearing {i} Date"] = (
            date.find_next_sibling("span").get_text(strip=True)
            if date and date.find_next_sibling("span") else ""
        )

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")

        result[f"Event {i} Description"] = (
            ev.find_next_sibling("span").get_text(strip=True)
            if ev and ev.find_next_sibling("span") else ""
        )
        result[f"Event {i} Date"] = (
            dt.find_next_sibling("span").get_text(strip=True)
            if dt and dt.find_next_sibling("span") else ""
        )

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []
    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique cases into '{output_file}'")


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "JP12_CivilHearingsFilings_Jan22Dec25_CLEAN.csv"
output_file = "JP12_CivilHearingsFilings_Jan22Dec25_GRANULAR.csv"

chunk_size = 40000
sleep_seconds = 0.5

df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type"],
    dtype={"Case Number": "string", "court_number": "string", "Case Type": "string"},
    low_memory=False
)

df_cases = df_cases.dropna(subset=["Case Number"])
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()

cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {"Case Number": case_number, "court_number": court_number, "Case Type": case_type}

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")
        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None
        if role_span and name_span:
            role_value = role_span.get_text(strip=True)
            name_value = name_span.get_text(strip=True)
            if role_value == "Plaintiff":
                result["Plaintiff"] = name_value
            elif role_value == "Defendant":
                result["Defendant"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")
        hres = block.find("span", string=lambda s: s and s.strip() == "Hearing Result:")
        result[f"Hearing {i} Description"] = desc.find_next_sibling("span").get_text(strip=True) if desc and desc.find_next_sibling("span") else ""
        result[f"Hearing {i} Date"] = date.find_next_sibling("span").get_text(strip=True) if date and date.find_next_sibling("span") else ""
        result[f"Hearing {i} Result"] = hres.find_next_sibling("span").get_text(strip=True) if hres and hres.find_next_sibling("span") else ""

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")
        result[f"Event {i} Description"] = ev.find_next_sibling("span").get_text(strip=True) if ev and ev.find_next_sibling("span") else ""
        result[f"Event {i} Date"] = dt.find_next_sibling("span").get_text(strip=True) if dt and dt.find_next_sibling("span") else ""

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []
    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique cases into '{output_file}'")


Chunk 1: 100%|████████████████████████████████████████████████████████████████| 33469/33469 [23:39:00<00:00,  2.54s/it]


✅ Scraped 33469 unique cases into 'JP12_CivilHearingsFilings_Jan22Dec25_GRANULAR.csv'


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

# Load your dataset and sample 100
df_cases = pd.read_csv('JP12_CivilHearings_Jan0122_Dec2225_CLEAN.csv', usecols=['Case Number', 'court_number','Case Type'])
sampled_cases = df_cases.dropna(subset=['Case Number']).sample(n=100, random_state=42)

session = requests.Session(),
output_file = 'JP12_CivilHearings_Jan0122_Dec2225_GRANULAR_EVENTS.csv'

# Remove old file if re-running
if os.path.exists(output_file):
    os.remove(output_file).csv

def scrape_eviction_case(case_number, court_number):
    url = f'https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}'
    for attempt in range(3):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as e:
            if attempt == 2:
                return {'Case Number': case_number, 'court_number': court_number, 'Error': str(e)}
            time.sleep(1)

    soup = BeautifulSoup(response.text, 'html.parser')
    result = {'Case Number': case_number, 'court_number': court_number}

    def extract_field(label):
        tag = soup.find('span', string=label)
        if tag and tag.find_next_sibling('span'):
            return tag.find_next_sibling('span').text.strip()
        return ''

    result['Filed Date'] = extract_field('Filed Date:')
    result['Case Status'] = extract_field('Case Status:')
    result['Nature of Claim'] = extract_field('Nature of Claim:')
    result['Disposition Desc'] = extract_field('Disposition:')
    result['Disposition Date'] = extract_field('Disposition Date:')
    result['Judgment Date'] = extract_field('Judgment Date:')
    result['Claim Amount'] = extract_field('Claim Amount:')

    # Parties
    party_blocks = soup.select('#partyInfo div.even, #partyInfo div.odd')
    for block in party_blocks:
        role = block.find('span', string='Party Type:')
        name = block.find('span', string='Party Name:')
        if role and name:
            role_value = role.find_next_sibling('span').text.strip()
            name_value = name.find_next_sibling('span').text.strip()
            if role_value == 'Plaintiff':
                result['Plaintiff'] = name_value
            elif role_value == 'Defendant':
                result['Defendant'] = name_value

    if not result.get('Filed Date') or not result.get('Plaintiff'):
        result['Error'] = 'Missing core data'
        return result

    # Hearings (up to 10)
    hearing_blocks = soup.select('#eventInfo div.even, #eventInfo div.odd')[:10]
    for i, block in enumerate(hearing_blocks):
        desc = block.find('span', string='Hearing Description:')
        date = block.find('span', string='Hearing Date/Time:')
        result_text = block.find_all('span')[-3].text.strip() if len(block.find_all('span')) > 3 else ''
        result[f'Hearing {i+1} Description'] = desc.find_next_sibling('span').text.strip() if desc else ''
        result[f'Hearing {i+1} Date'] = date.find_next_sibling('span').text.strip() if date else ''
        result[f'Hearing {i+1} Result'] = result_text

    # Events (up to 25)
    event_blocks = soup.select('#filingInfo div.even, #filingInfo div.odd')[:25]
    for i, block in enumerate(event_blocks):
        desc = block.find('span', string=lambda s: s and 'Event' in s)
        date = block.find('span', string='Date Added:')
        result[f'Event {i+1} Description'] = desc.find_next_sibling('span').text.strip() if desc else ''
        result[f'Event {i+1} Date'] = date.find_next_sibling('span').text.strip() if date else ''

    return result

# Scrape in chunks of 25
chunk_size = 25
for i in range(0, len(sampled_cases), chunk_size):
    chunk = sampled_cases.iloc[i:i+chunk_size]
    chunk_results = []
    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Processing chunk {i//chunk_size + 1}"):
        case_number = str(row['Case Number'])
        court_number = row['court_number']
        data = scrape_eviction_case(case_number, court_number)
        chunk_results.append(data)
        time.sleep(0.5)
    
    df_chunk = pd.DataFrame(chunk_results)
    df_chunk.to_csv(output_file, mode='a', index=False, header=not os.path.exists(output_file))

print(f"✅ All chunks scraped and saved to '{output_file}'")


AttributeError: 'NoneType' object has no attribute 'csv'

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import os

input_file = "JP12_CivilHearingsFilings_Jan15Dec25_CLEAN.csv"
output_file = "JP12_Debt_Jan18Dec25_GRANULAR.csv"

chunk_size = 1000000
sleep_seconds = 0.5

# Read only what you need (added "Case File Date")
df_cases = pd.read_csv(
    input_file,
    usecols=["Case Number", "court_number", "Case Type", "Case File Date"],
    dtype={"Case Number": "string", "court_number": "string", "Case Type": "string", "Case File Date": "string"},
    low_memory=False
)

# Basic cleaning
df_cases = df_cases.dropna(subset=["Case Number"])
df_cases["Case Number"] = df_cases["Case Number"].astype("string").str.strip()
df_cases["Case Type"] = df_cases["Case Type"].astype("string").str.strip()

# --- FILTERS ---
# 1) Case Type must be Debt Claim
df_cases = df_cases[df_cases["Case Type"] == "Debt Claim"].copy()

# 2) Case File Date must be Jan 2017 or later
# Robust parse: handles MM/DD/YYYY (and similar) safely; unparseable -> NaT -> dropped
df_cases["Case File Date"] = pd.to_datetime(df_cases["Case File Date"], errors="coerce")
df_cases = df_cases[df_cases["Case File Date"] >= pd.Timestamp("2017-01-01")].copy()

# Optional: drop rows where Case File Date couldn't be parsed
df_cases = df_cases.dropna(subset=["Case File Date"])

cases_to_scrape = df_cases.drop_duplicates(subset=["Case Number"]).reset_index(drop=True)

if os.path.exists(output_file):
    os.remove(output_file)

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def scrape_case(case_number, court_number, case_type):
    case_number = str(case_number).strip()
    url = f"https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}"

    last_err = None
    html = None
    for _ in range(3):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            html = r.text
            break
        except requests.RequestException as e:
            last_err = e
            time.sleep(1)

    result = {
        "Case Number": case_number,
        "court_number": court_number,
        "Case Type": case_type,
        "Plaintiff": "",
        "Defendant": "",
        "Receiver": ""
    }

    if html is None:
        result["Error"] = str(last_err)
        return result

    soup = BeautifulSoup(html, "html.parser")

    def extract_field(label):
        tag = soup.find("span", string=lambda s: s and s.strip() == label)
        if tag:
            sib = tag.find_next_sibling("span")
            if sib:
                return sib.get_text(strip=True)
        return ""

    result["Filed Date"] = extract_field("Filed Date:")
    result["Case Status"] = extract_field("Case Status:")
    result["Nature of Claim"] = extract_field("Nature of Claim:")
    result["Disposition Desc"] = extract_field("Disposition:")
    result["Disposition Date"] = extract_field("Disposition Date:")
    result["Judgment Date"] = extract_field("Judgment Date:")
    result["Claim Amount"] = extract_field("Claim Amount:")

    party_blocks = soup.select("#partyInfo div.even, #partyInfo div.odd")
    for block in party_blocks:
        role = block.find("span", string=lambda s: s and s.strip() == "Party Type:")
        name = block.find("span", string=lambda s: s and s.strip() == "Party Name:")

        role_span = role.find_next_sibling("span") if role else None
        name_span = name.find_next_sibling("span") if name else None

        if not (role_span and name_span):
            continue

        role_value = role_span.get_text(strip=True)
        name_value = name_span.get_text(strip=True)

        if role_value == "Plaintiff" and not result["Plaintiff"]:
            result["Plaintiff"] = name_value
        elif role_value == "Defendant" and not result["Defendant"]:
            result["Defendant"] = name_value
        elif role_value == "Receiver" and not result["Receiver"]:
            result["Receiver"] = name_value

    hearing_blocks = soup.select("#eventInfo div.even, #eventInfo div.odd")[:10]
    for i, block in enumerate(hearing_blocks, start=1):
        desc = block.find("span", string=lambda s: s and s.strip() == "Hearing Description:")
        date = block.find("span", string=lambda s: s and s.strip() == "Hearing Date/Time:")

        result[f"Hearing {i} Description"] = (
            desc.find_next_sibling("span").get_text(strip=True)
            if desc and desc.find_next_sibling("span") else ""
        )
        result[f"Hearing {i} Date"] = (
            date.find_next_sibling("span").get_text(strip=True)
            if date and date.find_next_sibling("span") else ""
        )

    event_blocks = soup.select("#filingInfo div.even, #filingInfo div.odd")[:25]
    for i, block in enumerate(event_blocks, start=1):
        ev = block.find("span", string=lambda s: s and s.strip().startswith("Event"))
        dt = block.find("span", string=lambda s: s and s.strip() == "Date Added:")

        result[f"Event {i} Description"] = (
            ev.find_next_sibling("span").get_text(strip=True)
            if ev and ev.find_next_sibling("span") else ""
        )
        result[f"Event {i} Date"] = (
            dt.find_next_sibling("span").get_text(strip=True)
            if dt and dt.find_next_sibling("span") else ""
        )

    if not result.get("Filed Date") or not result.get("Plaintiff"):
        result["Error"] = result.get("Error", "Missing core data")

    return result

total = len(cases_to_scrape)
for start in range(0, total, chunk_size):
    chunk = cases_to_scrape.iloc[start:start + chunk_size]
    rows = []
    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {start // chunk_size + 1}"):
        rows.append(scrape_case(row["Case Number"], row["court_number"], row["Case Type"]))
        time.sleep(sleep_seconds)

    df_out = pd.DataFrame(rows)
    write_header = not os.path.exists(output_file)
    df_out.to_csv(output_file, mode="a", index=False, header=write_header)

print(f"✅ Scraped {total} unique cases into '{output_file}'")


Chunk 1: 100%|████████████████████████████████████████████████████████████████| 39243/39243 [16:21:55<00:00,  1.50s/it]


✅ Scraped 39243 unique cases into 'JP12_Debt_Jan18Dec25_GRANULAR.csv'
