In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time

def scrape_criminal_case(case_number):
    url = f'https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {'Case Number': case_number, 'Error': str(e)}

    soup = BeautifulSoup(response.text, 'html.parser')
    result = {'Case Number': case_number}

    # --- Core Info Fields ---
    label_map = {
        'Style of Case:': 'Style of Case',
        'Citation Number:': 'Citation Number',
        'Offense Date:': 'Offense Date',
        'Arresting Agency:': 'Arresting Agency',
        'Arresting Officer:': 'Arresting Officer',
        'Offense Charged:': 'Offense Charged',
        'Plea Entered:': 'Plea Entered',
        'Plea Date:': 'Plea Date',
        'Filed Date:': 'Filed Date',
        'Case Status:': 'Case Status',
        'Disposition:': 'Disposition',
        'Disposition Date:': 'Disposition Date',
        'Judgment Date:': 'Judgment Date',
    }

    spans = soup.find_all("span")
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        value = spans[i + 1].text.strip()
        if label in label_map:
            result[label_map[label]] = value
        elif label == "Party Type:" and value == "Defendant":
            result["Defendant"] = spans[i - 1].text.strip()
        elif label == "Party Type:" and value == "Officer":
            result["Officer"] = spans[i - 1].text.strip()

    # --- Hearings (up to 10) ---
    hearing_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        if label == "Hearing Description:" and hearing_count < 10:
            desc = spans[i + 1].text.strip()
            date = outcome = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip() == "Hearing Date/Time:":
                date = spans[i + 3].text.strip()
            if i + 5 < len(spans) and spans[i + 4].text.strip() == "Hearing Result/Cancellation:":
                outcome = spans[i + 5].text.strip()

            result[f'Hearing Description {hearing_count+1}'] = desc
            result[f'Hearing Date/Time {hearing_count+1}'] = date
            result[f'Hearing Result/Cancellation {hearing_count+1}'] = outcome
            hearing_count += 1

    for i in range(hearing_count + 1, 11):
        result[f'Hearing Description {i}'] = ''
        result[f'Hearing Date/Time {i}'] = ''
        result[f'Hearing Result/Cancellation {i}'] = ''

    # --- Events (up to 20) ---
    event_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip().replace('\xa0', '').replace('\t', '')
        if label.startswith("Event Description") and event_count < 20:
            desc = spans[i + 1].text.strip()
            date = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip().startswith("Date Added"):
                date = spans[i + 3].text.strip()

            result[f'Event Description {event_count+1}'] = desc
            result[f'Event Date Added {event_count+1}'] = date
            event_count += 1

    for i in range(event_count + 1, 21):
        result[f'Event Description {i}'] = ''
        result[f'Event Date Added {i}'] = ''

    return result

# --- List of 3 Sample Cases ---
sample_case_numbers = ['232200082938', '231200095502', '191100196678']

# --- Scrape All with Progress Bar ---
results = []
for case in tqdm(sample_case_numbers, desc="Scraping Cases"):
    results.append(scrape_criminal_case(case))
    time.sleep(0.5)  # Rate limit to avoid overwhelming the site

# --- Save to CSV ---
df = pd.DataFrame(results)
df.to_csv('Parsed_Criminal_Case_Samples.csv', index=False)
print("✅ Saved to Parsed_Criminal_Case_Samples.csv")


Scraping Cases: 100%|████████████████████████| 3/3 [00:02<00:00,  1.20it/s]

✅ Saved to Parsed_Criminal_Case_Samples.csv





In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

def scrape_criminal_case(case_number):
    url = f'https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {'Case Number': case_number, 'Error': str(e)}

    soup = BeautifulSoup(response.text, 'html.parser')
    result = {'Case Number': case_number}

    label_map = {
        'Style of Case:': 'Style of Case',
        'Citation Number:': 'Citation Number',
        'Offense Date:': 'Offense Date',
        'Arresting Agency:': 'Arresting Agency',
        'Arresting Officer:': 'Arresting Officer',
        'Offense Charged:': 'Offense Charged',
        'Plea Entered:': 'Plea Entered',
        'Plea Date:': 'Plea Date',
        'Filed Date:': 'Filed Date',
        'Case Status:': 'Case Status',
        'Disposition:': 'Disposition',
        'Disposition Date:': 'Disposition Date',
        'Judgment Date:': 'Judgment Date',
    }

    spans = soup.find_all("span")
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        value = spans[i + 1].text.strip()
        if label in label_map:
            result[label_map[label]] = value
        elif label == "Party Type:" and value == "Defendant":
            result["Defendant"] = spans[i - 1].text.strip()
        elif label == "Party Type:" and value == "Officer":
            result["Officer"] = spans[i - 1].text.strip()

    hearing_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        if label == "Hearing Description:" and hearing_count < 10:
            desc = spans[i + 1].text.strip()
            date = outcome = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip() == "Hearing Date/Time:":
                date = spans[i + 3].text.strip()
            if i + 5 < len(spans) and spans[i + 4].text.strip() == "Hearing Result/Cancellation:":
                outcome = spans[i + 5].text.strip()

            result[f'Hearing Description {hearing_count+1}'] = desc
            result[f'Hearing Date/Time {hearing_count+1}'] = date
            result[f'Hearing Result/Cancellation {hearing_count+1}'] = outcome
            hearing_count += 1

    for i in range(hearing_count + 1, 11):
        result[f'Hearing Description {i}'] = ''
        result[f'Hearing Date/Time {i}'] = ''
        result[f'Hearing Result/Cancellation {i}'] = ''

    event_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip().replace('\xa0', '').replace('\t', '')
        if label.startswith("Event Description") and event_count < 20:
            desc = spans[i + 1].text.strip()
            date = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip().startswith("Date Added"):
                date = spans[i + 3].text.strip()

            result[f'Event Description {event_count+1}'] = desc
            result[f'Event Date Added {event_count+1}'] = date
            event_count += 1

    for i in range(event_count + 1, 21):
        result[f'Event Description {i}'] = ''
        result[f'Event Date Added {i}'] = ''

    return result

# --- Load your dataset and get 10 unique case numbers with JP Court ID ---
df = pd.read_csv('Harris_CrimCITHearings_2021Apr2025_CLEAN.csv', low_memory=False)
df_sample = df[['Case Number', 'JP Court ID']].dropna().drop_duplicates()
sampled_cases = df_sample.sample(n=10, random_state=42)

# --- Scrape each and add JP Court ID ---
results = []
for _, row in tqdm(sampled_cases.iterrows(), total=10, desc="Scraping Random 10 Cases"):
    case_number = str(row['Case Number']).strip()
    court_id = row['JP Court ID']
    result = scrape_criminal_case(case_number)
    result['JP Court ID'] = court_id
    results.append(result)
    time.sleep(0.5)

# --- Export results ---
pd.DataFrame(results).to_csv('Criminal_Case_Sample_10.csv', index=False)
print("✅ Saved to Criminal_Case_Sample_10.csv")


Scraping Random 10 Cases: 100%|████████████| 10/10 [00:10<00:00,  1.01s/it]

✅ Saved to Criminal_Case_Sample_10.csv





In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

# -------- Scraper Function -------- #
def scrape_criminal_case(case_number):
    url = f'https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {'Case Number': case_number, 'Error': str(e)}

    soup = BeautifulSoup(response.text, 'html.parser')
    result = {'Case Number': case_number}

    label_map = {
        'Style of Case:': 'Style of Case',
        'Citation Number:': 'Citation Number',
        'Offense Date:': 'Offense Date',
        'Arresting Agency:': 'Arresting Agency',
        'Arresting Officer:': 'Arresting Officer',
        'Offense Charged:': 'Offense Charged',
        'Plea Entered:': 'Plea Entered',
        'Plea Date:': 'Plea Date',
        'Filed Date:': 'Filed Date',
        'Case Status:': 'Case Status',
        'Disposition:': 'Disposition',
        'Disposition Date:': 'Disposition Date',
        'Judgment Date:': 'Judgment Date',
    }

    spans = soup.find_all("span")
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        value = spans[i + 1].text.strip()
        if label in label_map:
            result[label_map[label]] = value
        elif label == "Party Type:" and value == "Defendant":
            result["Defendant"] = spans[i - 1].text.strip()
        elif label == "Party Type:" and value == "Officer":
            result["Officer"] = spans[i - 1].text.strip()

    hearing_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        if label == "Hearing Description:" and hearing_count < 10:
            desc = spans[i + 1].text.strip()
            date = outcome = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip() == "Hearing Date/Time:":
                date = spans[i + 3].text.strip()
            if i + 5 < len(spans) and spans[i + 4].text.strip() == "Hearing Result/Cancellation:":
                outcome = spans[i + 5].text.strip()

            result[f'Hearing Description {hearing_count+1}'] = desc
            result[f'Hearing Date/Time {hearing_count+1}'] = date
            result[f'Hearing Result/Cancellation {hearing_count+1}'] = outcome
            hearing_count += 1

    for i in range(hearing_count + 1, 11):
        result[f'Hearing Description {i}'] = ''
        result[f'Hearing Date/Time {i}'] = ''
        result[f'Hearing Result/Cancellation {i}'] = ''

    event_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip().replace('\xa0', '').replace('\t', '')
        if label.startswith("Event Description") and event_count < 20:
            desc = spans[i + 1].text.strip()
            date = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip().startswith("Date Added"):
                date = spans[i + 3].text.strip()

            result[f'Event Description {event_count+1}'] = desc
            result[f'Event Date Added {event_count+1}'] = date
            event_count += 1

    for i in range(event_count + 1, 21):
        result[f'Event Description {i}'] = ''
        result[f'Event Date Added {i}'] = ''

    return result

# -------- Load & Filter Dataset -------- #
df = pd.read_csv('Harris_CrimCITHearings_2021Apr2025_CLEAN.csv', low_memory=False)
df['Filed Date'] = pd.to_datetime(df['Filed Date'], errors='coerce')
df_filtered = df[df['Filed Date'].dt.year.isin([2023, 2024, 2025])]
df_filtered = df_filtered[['Case Number', 'JP Court ID']].dropna().drop_duplicates()

# -------- Sample up to 10,000 per JP Court ID -------- #
final_sample = df_filtered.groupby('JP Court ID').apply(
    lambda g: g.sample(n=min(1, len(g)), random_state=42)
).reset_index(drop=True)

# -------- Scrape and Track Progress -------- #
results = []
for idx, row in tqdm(final_sample.iterrows(), total=len(final_sample), desc="Scraping All Courts"):
    case_number = str(row['Case Number']).strip()
    court_id = row['JP Court ID']
    data = scrape_criminal_case(case_number)
    data['JP Court ID'] = court_id
    results.append(data)
    time.sleep(0.5)  # polite delay

# -------- Save Output -------- #
pd.DataFrame(results).to_csv('Criminal_Case_Full_Sample.csv', index=False)
print("✅ Done! Saved to Criminal_Case_Full_Sample.csv")


  final_sample = df_filtered.groupby('JP Court ID').apply(
Scraping All Courts: 100%|█████████████████████████████████████████████████████████████| 16/16 [00:13<00:00,  1.15it/s]

✅ Done! Saved to Criminal_Case_Full_Sample.csv





In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

# -------- Scraper Function -------- #
def scrape_criminal_case(case_number):
    url = f'https://jpwebsite.harriscountytx.gov/CaseInfo/GetCaseInfo?case={case_number}'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return {'Case Number': case_number, 'Error': str(e)}

    soup = BeautifulSoup(response.text, 'html.parser')
    result = {'Case Number': case_number}

    label_map = {
        'Style of Case:': 'Style of Case',
        'Citation Number:': 'Citation Number',
        'Offense Date:': 'Offense Date',
        'Arresting Agency:': 'Arresting Agency',
        'Arresting Officer:': 'Arresting Officer',
        'Offense Charged:': 'Offense Charged',
        'Plea Entered:': 'Plea Entered',
        'Plea Date:': 'Plea Date',
        'Filed Date:': 'Filed Date',
        'Case Status:': 'Case Status',
        'Disposition:': 'Disposition',
        'Disposition Date:': 'Disposition Date',
        'Judgment Date:': 'Judgment Date',
    }

    spans = soup.find_all("span")
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        value = spans[i + 1].text.strip()
        if label in label_map:
            result[label_map[label]] = value
        elif label == "Party Type:" and value == "Defendant":
            result["Defendant"] = spans[i - 1].text.strip()
        elif label == "Party Type:" and value == "Officer":
            result["Officer"] = spans[i - 1].text.strip()

    hearing_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip()
        if label == "Hearing Description:" and hearing_count < 10:
            desc = spans[i + 1].text.strip()
            date = outcome = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip() == "Hearing Date/Time:":
                date = spans[i + 3].text.strip()
            if i + 5 < len(spans) and spans[i + 4].text.strip() == "Hearing Result/Cancellation:":
                outcome = spans[i + 5].text.strip()

            result[f'Hearing Description {hearing_count+1}'] = desc
            result[f'Hearing Date/Time {hearing_count+1}'] = date
            result[f'Hearing Result/Cancellation {hearing_count+1}'] = outcome
            hearing_count += 1

    for i in range(hearing_count + 1, 11):
        result[f'Hearing Description {i}'] = ''
        result[f'Hearing Date/Time {i}'] = ''
        result[f'Hearing Result/Cancellation {i}'] = ''

    event_count = 0
    for i in range(len(spans) - 1):
        label = spans[i].text.strip().replace('\xa0', '').replace('\t', '')
        if label.startswith("Event Description") and event_count < 20:
            desc = spans[i + 1].text.strip()
            date = ''
            if i + 3 < len(spans) and spans[i + 2].text.strip().startswith("Date Added"):
                date = spans[i + 3].text.strip()

            result[f'Event Description {event_count+1}'] = desc
            result[f'Event Date Added {event_count+1}'] = date
            event_count += 1

    for i in range(event_count + 1, 21):
        result[f'Event Description {i}'] = ''
        result[f'Event Date Added {i}'] = ''

    return result

# -------- Load & Filter Dataset -------- #
df = pd.read_csv('UnresolvedCases_Over10Years_JP12.csv', low_memory=False)
df['Case File Date'] = pd.to_datetime(df['Case File Date'], errors='coerce')
df_filtered = df[['Case Number', 'JP Court ID']].dropna().drop_duplicates()

# -------- Sample up to 10,000 per JP Court ID -------- #
final_sample = df_filtered.groupby('JP Court ID').apply(
    lambda g: g.sample(n=min(10000, len(g)), random_state=42)
).reset_index(drop=True)

# -------- Scrape and Track Progress -------- #
results = []
for idx, row in tqdm(final_sample.iterrows(), total=len(final_sample), desc="Scraping All Courts"):
    case_number = str(row['Case Number']).strip()
    court_id = row['JP Court ID']
    data = scrape_criminal_case(case_number)
    data['JP Court ID'] = court_id
    results.append(data)
    time.sleep(0.5)  # polite delay

# -------- Save Output -------- #
pd.DataFrame(results).to_csv('UnresolvedCases_Over10Years_JP12_Scraped.csv', index=False)
print("✅ Done!")


  final_sample = df_filtered.groupby('JP Court ID').apply(
Scraping All Courts: 100%|███████████████████████████████████████████████████████████| 180/180 [02:26<00:00,  1.23it/s]

✅ Done!



