In [1]:
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
#pip install selenium
#pip install webdriver_manager

In [3]:
# Path to the Chrome binary.
# Defaults to '/usr/bin/google-chrome'; set the CHROME_BINARY environment
# variable to point at a different binary without editing the notebook.
chrome_path = os.environ.get('CHROME_BINARY', '/usr/bin/google-chrome')

In [4]:
# Base URL to scrape
# The query-string filters are kept as plain lists so they can be edited
# without hand-encoding spaces, ampersands and commas.
GLOBAL_NETWORK_FILTER = [
    "Ernst & Young Global Limited",
    "Deloitte Touche Tohmatsu Limited",
    "KPMG International Cooperative",
    "PricewaterhouseCoopers International Limited",
]
COUNTRY_FILTER = [
    "South Korea",
    "South Africa",
    "Japan",
    "Colombia",
    "Cayman Islands",
    "Canada",
    "Brazil",
    "Bahamas",
    "Argentina",
    "United Kingdom",
    "United States",
    "Taiwan",
    "Switzerland",
    "Sweden",
    "Spain",
    "Luxembourg",
    "Netherlands",
    "Norway",
    "Panama",
    "Peru",
    "Philippines",
    "Singapore",
]
RESULTS_PER_PAGE = 96  # value sent as the `mpp` query parameter

# The literal `{}` is the page-number placeholder later filled in with
# base_url.format(page_number). quote(..., safe="") percent-encodes every
# reserved character, including the commas that separate the list items.
base_url = (
    "https://pcaobus.org/oversight/inspections/firm-inspection-reports"
    "?pg={}"
    "&mpp=" + str(RESULTS_PER_PAGE)
    + "&globalnetworks=" + quote(",".join(GLOBAL_NETWORK_FILTER), safe="")
    + "&country=" + quote(",".join(COUNTRY_FILTER), safe="")
)

In [5]:
# Set up Chrome options
# `binary_location` makes Selenium launch the Chrome executable stored in
# `chrome_path` (defined in an earlier cell).
options = Options()
options.binary_location = chrome_path

In [6]:
# Start the browser session.
# webdriver_manager supplies the chromedriver path for the Service, and the
# Chrome options configured in the previous cell are applied to the session.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)

In [7]:
# Accumulators for the scraped columns: one list per field, and every list
# receives exactly one entry per report so that they stay row-aligned.
(
    pdf_links,
    countries,
    inspection_years,
    total_issuers_audit_clients,
    report_dates,
    audits_reviewed,
    part_ia_deficiency_rates,
    companies,
    inspection_report_companies,
) = ([] for _ in range(9))

In [8]:
# Labels printed in the verbose per-report summary. Lower-casing a label gives
# the key under which the page exposes that field.
_DETAIL_LABELS = (
    "Country",
    "Global Network",
    "Inspection Year",
    "Total Issuer Audit Clients",
    "Audits Reviewed",
    "Part I.A Deficiency Rate",
    "Inspection Report Date",
)


def _text_or_none(parent, css_selector):
    """Return the stripped text of the first CSS match under `parent`, or None if absent."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except NoSuchElementException:
        return None


def _read_report_fields(container, verbose=True):
    """Return a {lower-cased label: value} dict for the detail columns of one report card.

    Columns whose label or value element cannot be read are reported on stdout
    and skipped, so one malformed column never discards the rest of the card.
    If a label occurs more than once, the last occurrence wins.
    """
    fields = {}
    for detail in container.find_elements(By.CLASS_NAME, 'hawk-column'):
        try:
            key = detail.find_element(By.CLASS_NAME, 'lead-text-lt').text.strip().lower()
            value = detail.find_element(By.CLASS_NAME, 'lead-text-st').text.strip()
        except NoSuchElementException as e:
            print(f"Element not found: {e}")
            continue
        except Exception as e:
            # Deliberate best-effort: report the problem and carry on with the next column.
            print(f"Error occurred: {e}")
            continue

        if verbose:
            # Print key and value elements for verification
            print(f"Key: {key}, Value: {value}")
        fields[key] = value
    return fields


def extract_data_from_page(verbose=True):
    """Scrape every report card on the page currently loaded in `driver`.

    One entry per report is appended to each of the module-level lists
    (`pdf_links`, `countries`, `inspection_years`,
    `total_issuers_audit_clients`, `audits_reviewed`,
    `part_ia_deficiency_rates`, `report_dates`, `companies`,
    `inspection_report_companies`). A field that is missing from a card is
    recorded as None.

    Args:
        verbose: When True (the default), print every key/value pair and a
            per-report summary while scraping. Pass False to silence them.

    Returns:
        False if the page contains no report cards (nothing is appended),
        True otherwise.
    """
    report_containers = driver.find_elements(By.CSS_SELECTOR, 'div.sf-search-results.media-list div.media-body')
    if not report_containers:
        return False

    for container in report_containers:
        # Read everything from the DOM first. The lists are only touched once
        # the whole card has been read, so an unexpected WebDriver error cannot
        # leave them with different lengths.
        try:
            pdf_link = container.find_element(By.CSS_SELECTOR, 'a.hawk-download-pdf').get_attribute('href')
        except NoSuchElementException:
            pdf_link = None
        report_company = _text_or_none(container, 'h3 a')
        fields = _read_report_fields(container, verbose)

        if verbose:
            # Print details in a more readable format
            print("Details extracted:")
            for label in _DETAIL_LABELS:
                print(f"{label}: {fields.get(label.lower())}")

        pdf_links.append(pdf_link)
        countries.append(fields.get("country"))
        inspection_years.append(fields.get("inspection year"))
        total_issuers_audit_clients.append(fields.get("total issuer audit clients"))
        audits_reviewed.append(fields.get("audits reviewed"))
        part_ia_deficiency_rates.append(fields.get("part i.a deficiency rate"))
        report_dates.append(fields.get("inspection report date"))
        # The company is the value labelled "global network". Looking it up by
        # label, not by column position, keeps it correct when a card omits or
        # reorders columns.
        companies.append(fields.get("global network"))
        inspection_report_companies.append(report_company)

    return True

In [9]:
# Loop through all pages and extract data
PAGE_LOAD_DELAY_SECONDS = 5  # Adjust this if needed, to wait for the page to load

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every report to the lists that feed the DataFrame.
for accumulator in (
    pdf_links,
    countries,
    inspection_years,
    total_issuers_audit_clients,
    report_dates,
    audits_reviewed,
    part_ia_deficiency_rates,
    companies,
    inspection_report_companies,
):
    accumulator.clear()

page_number = 1
while True:
    page_url = base_url.format(page_number)
    driver.get(page_url)
    time.sleep(PAGE_LOAD_DELAY_SECONDS)
    print(f"Extracting data from page {page_number}")
    # A page with no report cards marks the end of the listing.
    if not extract_data_from_page():
        print(f"No data found on page {page_number}, stopping.")
        break
    page_number += 1

# page_number points at the first empty page, hence the "- 1".
print(f"Extracted data from {page_number - 1} pages.")

Extracting data from page 1
Key: country, Value: Colombia
Key: global network, Value: Deloitte Touche Tohmatsu Limited
Key: inspection year, Value: 2023
Key: total issuer audit clients, Value: 1
Key: audits reviewed, Value: 1
Key: part i.a deficiency rate, Value: 0%
Key: inspection report date, Value: Apr. 26, 2024
Details extracted:
Country: Colombia
Global Network: Deloitte Touche Tohmatsu Limited
Inspection Year: 2023
Total Issuer Audit Clients: 1
Audits Reviewed: 1
Part I.A Deficiency Rate: 0%
Inspection Report Date: Apr. 26, 2024
Key: country, Value: Taiwan
Key: global network, Value: Ernst & Young Global Limited
Key: inspection year, Value: 2023
Key: total issuer audit clients, Value: 1
Key: audits reviewed, Value: 3
Key: part i.a deficiency rate, Value: 0%
Key: inspection report date, Value: Apr. 26, 2024
Details extracted:
Country: Taiwan
Global Network: Ernst & Young Global Limited
Inspection Year: 2023
Total Issuer Audit Clients: 1
Audits Reviewed: 3
Part I.A Deficiency Rate:

In [10]:
# Close the WebDriver
# All scraping is finished at this point; the cells below only use the
# collected lists, so the browser session is no longer needed.
driver.quit()

In [11]:
# Assemble the scraped columns into one table.
# Each pair is (column header, list of per-report values); the order of the
# pairs is the column order of the resulting DataFrame.
data = dict([
    ('PDF Link', pdf_links),
    ('Country', countries),
    ('Inspection Year', inspection_years),
    ('Total Issuer Audit Clients', total_issuers_audit_clients),
    ('Inspection Report Date', report_dates),
    ('Audits Reviewed', audits_reviewed),
    ('Part I.A Deficiency Rate', part_ia_deficiency_rates),
    ('Company', companies),
    ('Inspection Report Company', inspection_report_companies),
])
df = pd.DataFrame(data=data)

In [12]:
# Display the assembled table to check the scraped rows visually
df

Unnamed: 0,PDF Link,Country,Inspection Year,Total Issuer Audit Clients,Inspection Report Date,Audits Reviewed,Part I.A Deficiency Rate,Company,Inspection Report Company
0,https://assets.pcaobus.org/pcaob-dev/docs/defa...,Colombia,2023,1,"Apr. 26, 2024",1,0%,Deloitte Touche Tohmatsu Limited,Deloitte & Touche S.A.S.
1,https://assets.pcaobus.org/pcaob-dev/docs/defa...,Taiwan,2023,1,"Apr. 26, 2024",3,0%,Ernst & Young Global Limited,Ernst & Young
2,https://assets.pcaobus.org/pcaob-dev/docs/defa...,Luxembourg,2023,3,"Apr. 26, 2024",3,67%,Ernst & Young Global Limited,Ernst & Young S.A.
3,https://assets.pcaobus.org/pcaob-dev/docs/defa...,Singapore,2023,3,"Mar. 21, 2024",3,0%,Deloitte Touche Tohmatsu Limited,Deloitte & Touche LLP
4,https://assets.pcaobus.org/pcaob-dev/docs/defa...,Colombia,2023,2,"Mar. 21, 2024",3,33%,Ernst & Young Global Limited,Ernst & Young Audit S.A.S.
...,...,...,...,...,...,...,...,...,...
422,https://assets.pcaobus.org/pcaob-dev/docs/defa...,United States,2004,,"Sep. 29, 2005",,,KPMG International Cooperative,KPMG LLP
423,https://assets.pcaobus.org/pcaob-dev/docs/defa...,United States,2003,,"Aug. 26, 2004",,,Deloitte Touche Tohmatsu Limited,Deloitte & Touche LLP
424,https://assets.pcaobus.org/pcaob-dev/docs/defa...,United States,2003,,"Aug. 26, 2004",,,Ernst & Young Global Limited,Ernst & Young LLP
425,https://assets.pcaobus.org/pcaob-dev/docs/defa...,United States,2003,,"Aug. 26, 2004",,,KPMG International Cooperative,KPMG LLP


In [14]:
# Write the table to disk as CSV; index=False leaves out the row-index column
df.to_csv(path_or_buf='PCAOB_inspection_reports.csv', index=False)