In [None]:
def scrape_enforcement_actions(start_month, start_year, end_page=480):
    """Scrape HHS-OIG enforcement actions dated on or after a start month.

    Requests the paginated listing at https://oig.hhs.gov/fraud/enforcement/
    beginning with page 1, keeps every item dated on or after the first day
    of ``start_month``/``start_year``, and writes the title, date, category
    and link of each kept item to
    ``enforcement_actions_<start_year>_<start_month>_to_present.csv``.

    Crawling stops at the first page with no listing items, at the first page
    on which no item falls inside the date range, or after ``end_page``.

    Args:
        start_month: Month number (1-12) of the earliest date to keep.
        start_year: Year of the earliest date to keep; must be >= 2013.
        end_page: Last page number that may be requested.

    Returns:
        None. When ``start_year`` is below 2013 a message is printed and
        nothing is scraped or written.

    Raises:
        ValueError: If ``start_month`` is not a valid month (raised before any
            request is made), or if a listing date does not match
            ``'%B %d, %Y'``.

    Any exception raised by ``requests.get`` (including a timeout) or by
    ``response.raise_for_status()`` propagates to the caller.
    """
    # Ensure the year is >= 2013
    if start_year < 2013:
        print("Year must be >= 2013. Please provide a valid year.")
        return

    # Build the cutoff once, before any request: it is loop-invariant, and an
    # invalid month now fails immediately instead of after a page download.
    start_date = datetime(start_year, start_month, 1)

    # Parallel column lists; each kept item appends to all four at once
    titles, dates, links, categories = [], [], [], []

    base_url = 'https://oig.hhs.gov/fraud/enforcement/'
    page_number = 1

    while page_number <= end_page:
        # Form the URL for the specific page
        url = f"{base_url}?page={page_number}"
        # A timeout keeps a stalled connection from hanging the whole crawl
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all enforcement items on the current page
        enforcement_items = soup.find_all('li', class_='usa-card card--list pep-card--minimal mobile:grid-col-12')

        # If no enforcement items are found, stop the crawling process
        if not enforcement_items:
            print(f"No enforcement items found on page {page_number}. Stopping.")
            break

        # Track if any in-range item is found on this page
        valid_data_found = False

        for item in enforcement_items:
            title_tag = item.find('h2', class_='usa-card__heading')
            date_tag = item.find('span', class_='text-base-dark padding-right-105')

            # Items missing a title or a date cannot be recorded
            if not title_tag or not date_tag:
                continue

            date = date_tag.get_text(strip=True)

            # If the date is before the start date, skip the entry
            if datetime.strptime(date, '%B %d, %Y') < start_date:
                continue

            valid_data_found = True

            category = item.find('li', class_='display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1')
            anchor = item.find('a', href=True)

            # Resolve every field first, then append all four together so the
            # columns can never end up with different lengths.
            category_text = category.get_text(strip=True) if category else 'N/A'
            if anchor:
                href = anchor['href']
                full_link = f'https://oig.hhs.gov{href}'
            else:
                # Same fallback as a missing category, instead of a TypeError
                full_link = 'N/A'

            titles.append(title_tag.get_text(strip=True))
            dates.append(date)
            categories.append(category_text)
            links.append(full_link)

        # A page with nothing in range ends the crawl
        if not valid_data_found:
            print(f"No valid data found on page {page_number}. Stopping.")
            break

        # Add delay to avoid being blocked by the server
        time.sleep(1)

        page_number += 1

    # Creating a DataFrame with all the scraped data
    df = pd.DataFrame({
        'Title': titles,
        'Date': dates,
        'Category': categories,
        'Link': links
    })

    # Save the DataFrame to a single CSV file named with the given year and month
    filename = f"enforcement_actions_{start_year}_{start_month}_to_present.csv"
    df.to_csv(filename, index=False)

    print(f"Scraping complete. Data saved to {filename}. Total records: {len(df)}")

# Collect every enforcement action from January 2023 onward (at most 480 pages).
scrape_enforcement_actions(start_month=1, start_year=2023, end_page=480)

No valid data found on page 77. Stopping.
Scraping complete. Data saved to enforcement_actions_2023_1_to_present.csv. Total records: 1510


In [None]:
def scrape_enforcement_actions(start_month, start_year, end_page=480):
    """Scrape HHS-OIG enforcement actions dated on or after a start month.

    Requests the paginated listing at https://oig.hhs.gov/fraud/enforcement/
    beginning with page 1, keeps every item dated on or after the first day
    of ``start_month``/``start_year``, and writes the title, date, category
    and link of each kept item to
    ``enforcement_actions_<start_year>_<start_month>_to_present.csv``.

    Crawling stops at the first page with no listing items, at the first page
    on which no item falls inside the date range, or after ``end_page``.

    Args:
        start_month: Month number (1-12) of the earliest date to keep.
        start_year: Year of the earliest date to keep; must be >= 2013.
        end_page: Last page number that may be requested.

    Returns:
        None. When ``start_year`` is below 2013 a message is printed and
        nothing is scraped or written.

    Raises:
        ValueError: If ``start_month`` is not a valid month (raised before any
            request is made), or if a listing date does not match
            ``"%B %d, %Y"``.

    Any exception raised by ``requests.get`` (including a timeout) or by
    ``response.raise_for_status()`` propagates to the caller.
    """
    if start_year < 2013:
        print("Year must be >= 2013. Please provide a valid year.")
        return

    # Build the cutoff once, before any request: it is loop-invariant, and an
    # invalid month now fails immediately instead of after a page download.
    start_date = datetime(start_year, start_month, 1)

    # Parallel column lists; each kept item appends to all four at once.
    titles, dates, links, categories = [], [], [], []

    base_url = "https://oig.hhs.gov/fraud/enforcement/"
    page_number = 1

    while page_number <= end_page:
        url = f"{base_url}?page={page_number}"
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        enforcement_items = soup.find_all(
            "li", class_="usa-card card--list pep-card--minimal mobile:grid-col-12"
        )

        # An empty listing means there are no further pages to read.
        if not enforcement_items:
            print(f"No enforcement items found on page {page_number}. Stopping.")
            break

        # Set as soon as one item on this page falls inside the date range.
        valid_data_found = False

        for item in enforcement_items:
            title_tag = item.find("h2", class_="usa-card__heading")
            date_tag = item.find("span", class_="text-base-dark padding-right-105")

            # Items missing a title or a date cannot be recorded.
            if not title_tag or not date_tag:
                continue

            date = date_tag.get_text(strip=True)

            # Skip entries dated before the requested start month.
            if datetime.strptime(date, "%B %d, %Y") < start_date:
                continue

            valid_data_found = True

            category = item.find(
                "li",
                class_="display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1",
            )
            anchor = item.find("a", href=True)

            # Resolve every field first, then append all four together so the
            # columns can never end up with different lengths.
            category_text = category.get_text(strip=True) if category else "N/A"
            if anchor:
                href = anchor["href"]
                full_link = f"https://oig.hhs.gov{href}"
            else:
                # Same fallback as a missing category, instead of a TypeError.
                full_link = "N/A"

            titles.append(title_tag.get_text(strip=True))
            dates.append(date)
            categories.append(category_text)
            links.append(full_link)

        # A page with nothing in range ends the crawl.
        if not valid_data_found:
            print(f"No valid data found on page {page_number}. Stopping.")
            break

        # Pause between page requests.
        time.sleep(1)

        page_number += 1

    df = pd.DataFrame(
        {"Title": titles, "Date": dates, "Category": categories, "Link": links}
    )

    filename = f"enforcement_actions_{start_year}_{start_month}_to_present.csv"
    df.to_csv(filename, index=False)

    print(f"Scraping complete. Data saved to {filename}. Total records: {len(df)}")


# Collect every enforcement action from January 2023 onward (at most 480 pages).
scrape_enforcement_actions(start_month=1, start_year=2023, end_page=480)

No valid data found on page 77. Stopping.
Scraping complete. Data saved to enforcement_actions_2023_1_to_present.csv. Total records: 1510
