Connected to base (Python 3.11.7)

In [None]:
import pandas as pd
import altair as alt
import time

import warnings 
warnings.filterwarnings('ignore')
alt.renderers.enable("png")

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first page of the HHS-OIG enforcement listing into four parallel
# lists (title, date, category, link) and assemble them into a DataFrame.
url = 'https://oig.hhs.gov/fraud/enforcement/'
# Bound the wait and fail on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

titles = []
dates = []
links = []
categories = []

enforcement_items = soup.find_all('li', class_='usa-card card--list pep-card--minimal mobile:grid-col-12')

for item in enforcement_items:
    # Every field falls back to 'N/A' so one malformed card cannot raise an
    # AttributeError or leave the four lists with different lengths.
    title_tag = item.find('h2', class_='usa-card__heading')
    titles.append(title_tag.get_text(strip=True) if title_tag else 'N/A')

    date_tag = item.find('span', class_='text-base-dark padding-right-105')
    dates.append(date_tag.get_text(strip=True) if date_tag else 'N/A')

    category_tag = item.find('li', class_='display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1')
    categories.append(category_tag.get_text(strip=True) if category_tag else 'N/A')

    link_tag = title_tag.find('a', href=True) if title_tag else None
    if link_tag is None:
        # Keep the placeholder as-is; prefixing it with the site root would
        # produce the bogus URL 'https://oig.hhs.govN/A'.
        full_link = 'N/A'
    else:
        link = link_tag['href']
        # Site-relative hrefs are made absolute; absolute ones pass through.
        full_link = link if link.startswith('http') else f'https://oig.hhs.gov{link}'
    links.append(full_link)

df = pd.DataFrame({
    'Title': titles,
    'Date': dates,
    'Category': categories,
    'Link': links
})

print(df.head())


import time

# Visit each action's detail page and read its "Agency:" entry, then rebuild
# the DataFrame with the extra 'Agency' column.
agencies = []

for full_link in links:
    # Default used for placeholder links, failed requests and pages that do
    # not contain an "Agency:" entry.
    agency = 'N/A'

    if full_link != 'N/A':
        try:
            action_response = requests.get(full_link, timeout=30)
            action_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not discard the agencies already
            # collected; record the placeholder and move on.
            print(f"Failed to retrieve {full_link}: {exc}")
        else:
            action_soup = BeautifulSoup(action_response.text, 'html.parser')
            agency_tag = action_soup.find('span', string='Agency:')
            agency_item = agency_tag.find_parent('li') if agency_tag else None
            if agency_item is not None:
                # The <li> text includes the "Agency:" label; strip it off.
                agency = agency_item.get_text(
                    strip=True).replace('Agency:', '').strip()

        # Pause between detail requests to avoid hammering the server.
        time.sleep(1)

    # Exactly one entry per link keeps 'agencies' aligned with the other lists.
    agencies.append(agency)

df = pd.DataFrame({
    'Title': titles,
    'Date': dates,
    'Category': categories,
    'Link': links,
    'Agency': agencies
})

print(df.head())

                                               Title              Date  \
0  Former Arlington Resident Sentenced To Prison ...  November 7, 2024   
1  Paroled Felon Sentenced To Six Years For Fraud...  November 7, 2024   
2  Former Licensed Counselor Sentenced For Defrau...  November 6, 2024   
3  Macomb County Doctor And Pharmacist Agree To P...  November 4, 2024   
4  Rocky Hill Pharmacy And Its Owners Indicted Fo...  November 4, 2024   

                     Category  \
0  Criminal and Civil Actions   
1  Criminal and Civil Actions   
2  Criminal and Civil Actions   
3  Criminal and Civil Actions   
4  Criminal and Civil Actions   

                                                Link  
0  https://oig.hhs.gov/fraud/enforcement/former-a...  
1  https://oig.hhs.gov/fraud/enforcement/paroled-...  
2  https://oig.hhs.gov/fraud/enforcement/former-l...  
3  https://oig.hhs.gov/fraud/enforcement/macomb-c...  
4  https://oig.hhs.gov/fraud/enforcement/rocky-hi...  
                          

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

def scrape_enforcement_actions(start_month, start_year):
    """Scrape HHS-OIG enforcement actions dated on or after a start month.

    Walks the paginated listing at https://oig.hhs.gov/fraud/enforcement/,
    follows each action's link to read its "Agency:" entry, writes the rows
    to ``enforcement_actions_<start_year>_<start_month>.csv`` and returns
    them.

    Args:
        start_month: Month (1-12) of the earliest action to keep.
        start_year: Year of the earliest action to keep; must be >= 2013.

    Returns:
        A DataFrame with columns Title, Date, Category, Link and Agency
        ('N/A' marks a field that could not be read), or None when the
        arguments are rejected.
    """
    # 1. Input Validation
    if start_year < 2013:
        print("Year must be >= 2013")
        return None
    if not 1 <= start_month <= 12:
        print("Month must be between 1 and 12")
        return None

    # 2. Initialization
    columns = ['Title', 'Date', 'Category', 'Link', 'Agency']
    cutoff = datetime(start_year, start_month, 1)
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per action is quadratic.
    rows = []
    current_page = 1

    # 3. Page Scraping Loop
    while True:
        # 4. Construct URL for the Main Page
        if current_page == 1:
            url = "https://oig.hhs.gov/fraud/enforcement/"
        else:
            url = f"https://oig.hhs.gov/fraud/enforcement/?page={current_page}"

        # 5. Fetch Page Content (bounded wait; network errors end the crawl
        # but keep whatever was collected so far)
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve page {current_page}: {exc}")
            break
        if response.status_code != 200:
            print(f"Failed to retrieve page {current_page}")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # 6. Extract Enforcement Actions
        enforcement_items = soup.find_all('li', class_='usa-card card--list pep-card--minimal mobile:grid-col-12')
        if not enforcement_items:
            # No more items to scrape, stop the loop
            break

        reached_cutoff = False
        for item in enforcement_items:
            title_tag = item.find('h2', class_='usa-card__heading')
            title = title_tag.get_text(strip=True) if title_tag else 'N/A'

            date_tag = item.find('span', class_='text-base-dark padding-right-105')
            date = date_tag.get_text(strip=True) if date_tag else 'N/A'

            # Drop actions older than the requested start month and remember
            # that the cutoff was reached. Unparseable dates become NaT and
            # are kept rather than silently discarded.
            # NOTE(review): stopping after this page assumes the listing is
            # ordered newest-first — confirm against the live site.
            action_date = pd.to_datetime(date, errors='coerce')
            if pd.notna(action_date) and action_date < cutoff:
                reached_cutoff = True
                continue

            category_tag = item.find('li', class_='display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1')
            category = category_tag.get_text(strip=True) if category_tag else 'N/A'

            link_tag = title_tag.find('a', href=True) if title_tag else None
            link = link_tag['href'] if link_tag else 'N/A'
            if link != 'N/A' and not link.startswith('http'):
                link = f'https://oig.hhs.gov{link}'

            # 7. Deep Scraping of Detailed Information
            agency = 'N/A'
            if link != 'N/A':
                try:
                    action_response = requests.get(link, timeout=30)
                except requests.RequestException:
                    # A single unreachable detail page only costs its agency.
                    action_response = None
                if action_response is not None and action_response.status_code == 200:
                    action_soup = BeautifulSoup(action_response.text, 'html.parser')
                    agency_tag = action_soup.find('span', string='Agency:')
                    agency_item = agency_tag.find_parent('li') if agency_tag else None
                    if agency_item is not None:
                        # The <li> text includes the "Agency:" label itself.
                        agency = agency_item.get_text(strip=True).replace('Agency:', '').strip()

            rows.append({
                'Title': title,
                'Date': date,
                'Category': category,
                'Link': link,
                'Agency': agency
            })

        # 8. Check for Next Page
        # NOTE(review): relies on the pager anchor carrying the class
        # 'next-page-link' — confirm against the live markup.
        if reached_cutoff or soup.find('a', class_='next-page-link') is None:
            break
        current_page += 1

        # 9. Add Delay Between Requests
        time.sleep(1)

    # 10. Save Data to CSV
    enforcement_data = pd.DataFrame(rows, columns=columns)
    filename = f"enforcement_actions_{start_year}_{start_month}.csv"
    enforcement_data.to_csv(filename, index=False)

    # Return the DataFrame for analysis
    return enforcement_data

# Run the function to collect enforcement actions since January 2023
df = scrape_enforcement_actions(1, 2023)

# Display the result
print(df.head())

# Display details about the number of enforcement actions and the earliest action
total_actions = len(df)
# 'Date' holds text such as "November 7, 2024"; parse it so the minimum is
# chronological instead of a comparison of raw strings.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
# idxmin returns an index label, so look the row up with .loc; guard the case
# where no date could be parsed (or nothing was scraped).
early_action = df.loc[parsed_dates.idxmin()] if parsed_dates.notna().any() else None
print(f"Total enforcement actions collected: {total_actions}")
print(f"Earliest enforcement action: {early_action}")

                                               Title              Date  \
0  Former Arlington Resident Sentenced To Prison ...  November 7, 2024   
1  Paroled Felon Sentenced To Six Years For Fraud...  November 7, 2024   
2  Former Licensed Counselor Sentenced For Defrau...  November 6, 2024   
3  Macomb County Doctor And Pharmacist Agree To P...  November 4, 2024   
4  Rocky Hill Pharmacy And Its Owners Indicted Fo...  November 4, 2024   

                     Category  \
0  Criminal and Civil Actions   
1  Criminal and Civil Actions   
2  Criminal and Civil Actions   
3  Criminal and Civil Actions   
4  Criminal and Civil Actions   

                                                Link  \
0  https://oig.hhs.gov/fraud/enforcement/former-a...   
1  https://oig.hhs.gov/fraud/enforcement/paroled-...   
2  https://oig.hhs.gov/fraud/enforcement/former-l...   
3  https://oig.hhs.gov/fraud/enforcement/macomb-c...   
4  https://oig.hhs.gov/fraud/enforcement/rocky-hi...   

                   

In [None]:
def scrape_enforcement_actions(start_month, start_year):
    """Scrape HHS-OIG enforcement actions dated on or after a start month.

    Walks the paginated listing at https://oig.hhs.gov/fraud/enforcement/,
    follows each action's link to read its "Agency:" entry, writes the rows
    to ``enforcement_actions_<start_year>_<start_month>.csv`` and returns
    them.

    Args:
        start_month: Month (1-12) of the earliest action to keep.
        start_year: Year of the earliest action to keep; must be >= 2013.

    Returns:
        A DataFrame with columns Title, Date, Category, Link and Agency
        ('N/A' marks a field that could not be read), or None when the
        arguments are rejected.
    """
    # 1. Input Validation
    if start_year < 2013:
        print("Year must be >= 2013")
        return None
    if not 1 <= start_month <= 12:
        print("Month must be between 1 and 12")
        return None

    # 2. Initialization
    columns = ['Title', 'Date', 'Category', 'Link', 'Agency']
    cutoff = datetime(start_year, start_month, 1)
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per action is quadratic.
    rows = []
    current_page = 1

    # 3. Page Scraping Loop
    while True:
        # 4. Construct URL for the Main Page
        if current_page == 1:
            url = "https://oig.hhs.gov/fraud/enforcement/"
        else:
            url = f"https://oig.hhs.gov/fraud/enforcement/?page={current_page}"

        # 5. Fetch Page Content (bounded wait; network errors end the crawl
        # but keep whatever was collected so far)
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve page {current_page}: {exc}")
            break
        if response.status_code != 200:
            print(f"Failed to retrieve page {current_page}")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # 6. Extract Enforcement Actions
        enforcement_items = soup.find_all('li', class_='usa-card card--list pep-card--minimal mobile:grid-col-12')
        if not enforcement_items:
            # No more items to scrape, stop the loop
            break

        reached_cutoff = False
        for item in enforcement_items:
            title_tag = item.find('h2', class_='usa-card__heading')
            title = title_tag.get_text(strip=True) if title_tag else 'N/A'

            date_tag = item.find('span', class_='text-base-dark padding-right-105')
            date = date_tag.get_text(strip=True) if date_tag else 'N/A'

            # Drop actions older than the requested start month and remember
            # that the cutoff was reached. Unparseable dates become NaT and
            # are kept rather than silently discarded.
            # NOTE(review): stopping after this page assumes the listing is
            # ordered newest-first — confirm against the live site.
            action_date = pd.to_datetime(date, errors='coerce')
            if pd.notna(action_date) and action_date < cutoff:
                reached_cutoff = True
                continue

            category_tag = item.find('li', class_='display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1')
            category = category_tag.get_text(strip=True) if category_tag else 'N/A'

            link_tag = title_tag.find('a', href=True) if title_tag else None
            link = link_tag['href'] if link_tag else 'N/A'
            if link != 'N/A' and not link.startswith('http'):
                link = f'https://oig.hhs.gov{link}'

            # 7. Deep Scraping of Detailed Information
            agency = 'N/A'
            if link != 'N/A':
                try:
                    action_response = requests.get(link, timeout=30)
                except requests.RequestException:
                    # A single unreachable detail page only costs its agency.
                    action_response = None
                if action_response is not None and action_response.status_code == 200:
                    action_soup = BeautifulSoup(action_response.text, 'html.parser')
                    agency_tag = action_soup.find('span', string='Agency:')
                    agency_item = agency_tag.find_parent('li') if agency_tag else None
                    if agency_item is not None:
                        # The <li> text includes the "Agency:" label itself.
                        agency = agency_item.get_text(strip=True).replace('Agency:', '').strip()

            rows.append({
                'Title': title,
                'Date': date,
                'Category': category,
                'Link': link,
                'Agency': agency
            })

        # 8. Check for Next Page
        # NOTE(review): relies on the pager anchor carrying the class
        # 'next-page-link' — confirm against the live markup.
        if reached_cutoff or soup.find('a', class_='next-page-link') is None:
            break
        current_page += 1

        # 9. Add Delay Between Requests
        time.sleep(1)

    # 10. Save Data to CSV
    enforcement_data = pd.DataFrame(rows, columns=columns)
    filename = f"enforcement_actions_{start_year}_{start_month}.csv"
    enforcement_data.to_csv(filename, index=False)

    # Return the DataFrame for analysis
    return enforcement_data

# Run the function to collect enforcement actions since January 2023
df = scrape_enforcement_actions(1, 2023)

# Display the result
print(df.head())

# Display details about the number of enforcement actions and the earliest action
total_actions = len(df)
# 'Date' holds text such as "November 7, 2024"; parse it so the minimum is
# chronological instead of a comparison of raw strings.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
# idxmin returns an index label, so look the row up with .loc; guard the case
# where no date could be parsed (or nothing was scraped).
early_action = df.loc[parsed_dates.idxmin()] if parsed_dates.notna().any() else None
print(f"Total enforcement actions collected: {total_actions}")
print(f"Earliest enforcement action: {early_action}")

                                               Title              Date  \
0  Former Arlington Resident Sentenced To Prison ...  November 7, 2024   
1  Paroled Felon Sentenced To Six Years For Fraud...  November 7, 2024   
2  Former Licensed Counselor Sentenced For Defrau...  November 6, 2024   
3  Macomb County Doctor And Pharmacist Agree To P...  November 4, 2024   
4  Rocky Hill Pharmacy And Its Owners Indicted Fo...  November 4, 2024   

                     Category  \
0  Criminal and Civil Actions   
1  Criminal and Civil Actions   
2  Criminal and Civil Actions   
3  Criminal and Civil Actions   
4  Criminal and Civil Actions   

                                                Link  \
0  https://oig.hhs.gov/fraud/enforcement/former-a...   
1  https://oig.hhs.gov/fraud/enforcement/paroled-...   
2  https://oig.hhs.gov/fraud/enforcement/former-l...   
3  https://oig.hhs.gov/fraud/enforcement/macomb-c...   
4  https://oig.hhs.gov/fraud/enforcement/rocky-hi...   

                   

In [None]:
def scrape_enforcement_actions(start_month, start_year):
    """Scrape HHS-OIG enforcement actions dated on or after a start month.

    Walks the paginated listing at https://oig.hhs.gov/fraud/enforcement/,
    follows each action's link to read its "Agency:" entry, writes the rows
    to ``enforcement_actions_<start_year>_<start_month>.csv`` and returns
    them.

    Args:
        start_month: Month (1-12) of the earliest action to keep.
        start_year: Year of the earliest action to keep; must be >= 2013.

    Returns:
        A DataFrame with columns Title, Date, Category, Link and Agency
        ('N/A' marks a field that could not be read), or None when the
        arguments are rejected.
    """
    if start_year < 2013:
        print("Year must be >= 2013")
        return None
    if not 1 <= start_month <= 12:
        print("Month must be between 1 and 12")
        return None

    columns = ['Title', 'Date', 'Category', 'Link', 'Agency']
    cutoff = datetime(start_year, start_month, 1)
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per action is quadratic.
    rows = []
    current_page = 1

    while True:
        if current_page == 1:
            url = "https://oig.hhs.gov/fraud/enforcement/"
        else:
            url = f"https://oig.hhs.gov/fraud/enforcement/?page={current_page}"

        # Bounded wait; network errors end the crawl but keep whatever was
        # collected so far.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve page {current_page}: {exc}")
            break
        if response.status_code != 200:
            print(f"Failed to retrieve page {current_page}")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        enforcement_items = soup.find_all('li', class_='usa-card card--list pep-card--minimal mobile:grid-col-12')
        if not enforcement_items:
            # An empty page means there is nothing further to scrape.
            break

        reached_cutoff = False
        for item in enforcement_items:
            title_tag = item.find('h2', class_='usa-card__heading')
            title = title_tag.get_text(strip=True) if title_tag else 'N/A'

            date_tag = item.find('span', class_='text-base-dark padding-right-105')
            date = date_tag.get_text(strip=True) if date_tag else 'N/A'

            # Drop actions older than the requested start month and remember
            # that the cutoff was reached. Unparseable dates become NaT and
            # are kept rather than silently discarded.
            # NOTE(review): stopping after this page assumes the listing is
            # ordered newest-first — confirm against the live site.
            action_date = pd.to_datetime(date, errors='coerce')
            if pd.notna(action_date) and action_date < cutoff:
                reached_cutoff = True
                continue

            category_tag = item.find('li', class_='display-inline-block usa-tag text-no-lowercase text-base-darkest bg-base-lightest margin-right-1')
            category = category_tag.get_text(strip=True) if category_tag else 'N/A'

            link_tag = title_tag.find('a', href=True) if title_tag else None
            link = link_tag['href'] if link_tag else 'N/A'
            if link != 'N/A' and not link.startswith('http'):
                link = f'https://oig.hhs.gov{link}'

            # Follow the action's own page to read its "Agency:" entry.
            agency = 'N/A'
            if link != 'N/A':
                try:
                    action_response = requests.get(link, timeout=30)
                except requests.RequestException:
                    # A single unreachable detail page only costs its agency.
                    action_response = None
                if action_response is not None and action_response.status_code == 200:
                    action_soup = BeautifulSoup(action_response.text, 'html.parser')
                    agency_tag = action_soup.find('span', string='Agency:')
                    agency_item = agency_tag.find_parent('li') if agency_tag else None
                    if agency_item is not None:
                        # The <li> text includes the "Agency:" label itself.
                        agency = agency_item.get_text(strip=True).replace('Agency:', '').strip()

            rows.append({
                'Title': title,
                'Date': date,
                'Category': category,
                'Link': link,
                'Agency': agency
            })

        # NOTE(review): relies on the pager anchor carrying the class
        # 'next-page-link' — confirm against the live markup.
        if reached_cutoff or soup.find('a', class_='next-page-link') is None:
            break
        current_page += 1

        # Pause between listing pages to avoid hammering the server.
        time.sleep(1)

    enforcement_data = pd.DataFrame(rows, columns=columns)
    filename = f"enforcement_actions_{start_year}_{start_month}.csv"
    enforcement_data.to_csv(filename, index=False)

    return enforcement_data

# Collect enforcement actions since January 2023 and preview them.
df = scrape_enforcement_actions(1, 2023)

print(df.head())

total_actions = len(df)
# 'Date' holds text such as "November 7, 2024"; parse it so the minimum is
# chronological instead of a comparison of raw strings.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
# idxmin returns an index label, so look the row up with .loc; guard the case
# where no date could be parsed (or nothing was scraped).
early_action = df.loc[parsed_dates.idxmin()] if parsed_dates.notna().any() else None
print(f"Total enforcement actions collected: {total_actions}")
print(f"Earliest enforcement action: {early_action}")

                                               Title              Date  \
0  Former Arlington Resident Sentenced To Prison ...  November 7, 2024   
1  Paroled Felon Sentenced To Six Years For Fraud...  November 7, 2024   
2  Former Licensed Counselor Sentenced For Defrau...  November 6, 2024   
3  Macomb County Doctor And Pharmacist Agree To P...  November 4, 2024   
4  Rocky Hill Pharmacy And Its Owners Indicted Fo...  November 4, 2024   

                     Category  \
0  Criminal and Civil Actions   
1  Criminal and Civil Actions   
2  Criminal and Civil Actions   
3  Criminal and Civil Actions   
4  Criminal and Civil Actions   

                                                Link  \
0  https://oig.hhs.gov/fraud/enforcement/former-a...   
1  https://oig.hhs.gov/fraud/enforcement/paroled-...   
2  https://oig.hhs.gov/fraud/enforcement/former-l...   
3  https://oig.hhs.gov/fraud/enforcement/macomb-c...   
4  https://oig.hhs.gov/fraud/enforcement/rocky-hi...   

                   