In [16]:
import requests
from bs4 import BeautifulSoup
import os

def download_pdfs(url, download_folder, timeout=30):
    """Download every PDF linked from one litigation-releases listing page.

    The page at ``url`` is fetched and parsed; each ``<tr>`` with class
    ``pr-list-page-row`` that has both a publish-date cell and a release cell
    containing an ``<a type="application/pdf">`` link is saved to
    ``download_folder`` as ``<date>_<respondents>.pdf``.

    Args:
        url: Address of the listing page to scrape.
        download_folder: Directory the PDFs are written to (created on demand).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Errors are printed rather than raised; a failure on one row is
        reported and the remaining rows are still processed.
    """
    from urllib.parse import urljoin

    max_length = 255  # cap on the generated file name, extension included

    try:
        response = requests.get(url, timeout=timeout)
        # Surface 4xx/5xx here instead of parsing an error page and silently
        # finding no rows.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all rows in the table of litigation releases
        rows = soup.find_all('tr', class_='pr-list-page-row')

        for row in rows:
            try:
                date_field = row.find('td', class_='views-field views-field-field-publish-date')
                respondent_field = row.find('td', class_='views-field views-field-field-release-file-number views-field-nothing-1')
                if not (date_field and respondent_field):
                    continue

                # strip() keeps stray whitespace/newlines around the date out of the file name.
                date = date_field.time.text.strip() if date_field.time else 'No_Date'
                respondents = respondent_field.div.text.strip().replace(' ', '_').replace('/', '_') if respondent_field.div else 'No_Respondents'
                pdf_link = respondent_field.find('a', {'type': 'application/pdf'})
                if not pdf_link:
                    continue

                # urljoin resolves root-relative, page-relative and absolute hrefs alike.
                pdf_url = urljoin(url, pdf_link['href'])
                pdf_name = f"{date}_{respondents}.pdf"
                if len(pdf_name) > max_length:
                    pdf_name = pdf_name[:max_length - 4] + '.pdf'

                print(f"Downloading: {pdf_name} from {pdf_url}")
                pdf_response = requests.get(pdf_url, timeout=timeout)
                # Do not save an HTML error body under a .pdf name.
                pdf_response.raise_for_status()

                os.makedirs(download_folder, exist_ok=True)
                with open(os.path.join(download_folder, pdf_name), 'wb') as f:
                    f.write(pdf_response.content)
                print(f'Downloaded: {pdf_name}')
            except Exception as e:
                # One bad row must not abort the rest of the page.
                print(f"An error occurred while downloading the PDF: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

sec_url = 'https://www.sec.gov/litigation/litreleases'
# Build the target folder from the current user's home directory instead of a
# hard-coded '/Users/<name>' path, so the cell runs unchanged on another account.
download_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'SEC_PDFs')

download_pdfs(sec_url, download_folder)

In [None]:
# Currently working scraping script

import requests
from bs4 import BeautifulSoup
import os
import time

def download_pdfs_from_page(url, session, download_folder, timeout=30):
    """Download the PDFs linked from one listing page and return the parsed page.

    Each ``<tr>`` with class ``pr-list-page-row`` that has a publish-date cell
    and a release cell containing an ``<a type="application/pdf">`` link is
    saved as ``<date>_<respondents>_<document_type>.pdf`` in
    ``download_folder``. ``document_type`` is the first word of the link text,
    lower-cased.

    Args:
        url: Address of the listing page.
        session: Object exposing ``get(url, timeout=...)`` (a
            ``requests.Session`` in this notebook) used for every request.
        download_folder: Directory the PDFs are written to (created on demand).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The parsed ``BeautifulSoup`` document when the page was fetched with
        status 200, otherwise ``None``. Failures on individual rows are
        printed and skipped; they do not affect the return value.
    """
    from urllib.parse import urljoin

    max_length = 255  # cap on the generated file name, extension included

    try:
        response = session.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to connect, status code: {response.status_code}")
            return None
        print("Successfully connected to the page.")
        soup = BeautifulSoup(response.text, 'html.parser')

        rows = soup.find_all('tr', class_='pr-list-page-row')

        for row in rows:
            try:
                date_field = row.find('td', class_='views-field views-field-field-publish-date')
                respondent_field = row.find('td', class_='views-field views-field-field-release-file-number views-field-nothing-1')
                if not (date_field and respondent_field):
                    continue

                date = date_field.time.text.strip().replace(' ', '_').replace('.', '') if date_field.time else 'No_Date'
                respondents = respondent_field.div.text.strip().replace(' ', '_').replace('/', '_') if respondent_field.div else 'No_Respondents'
                pdf_link = respondent_field.find('a', {'type': 'application/pdf'})
                if not pdf_link:
                    continue

                # urljoin resolves root-relative, page-relative and absolute hrefs alike.
                pdf_url = urljoin(url, pdf_link['href'])
                document_type = pdf_link.text.strip().split(' ')[0].lower()
                pdf_name = f"{date}_{respondents}_{document_type}.pdf"

                # Truncate over-long names while keeping the .pdf extension.
                if len(pdf_name) > max_length:
                    pdf_name = pdf_name[:max_length - 4] + '.pdf'

                print(f"Downloading: {pdf_name} from {pdf_url}")
                pdf_response = session.get(pdf_url, timeout=timeout)
                # Do not save an HTML error body under a .pdf name; the
                # per-row handler below reports it and moves on.
                pdf_response.raise_for_status()

                # exist_ok avoids the check-then-create race of exists()+makedirs().
                os.makedirs(download_folder, exist_ok=True)
                with open(os.path.join(download_folder, pdf_name), 'wb') as f:
                    f.write(pdf_response.content)
                print(f'Downloaded: {pdf_name}')
            except Exception as e:
                print(f"An error occurred while downloading the PDF: {e}")
                continue  # Continue to the next PDF even if the current one fails

        return soup
    except Exception as e:
        print(f"An error occurred while processing the page: {e}")
        return None


def get_next_page_url(current_soup, base_url):
    """Return the URL of the next listing page, or ``None`` if there is none.

    Looks in ``current_soup`` for an ``<a title="Go to next page">`` element
    and resolves its ``href`` against ``base_url``.

    Args:
        current_soup: Parsed page (anything with a ``find(name, attrs)``
            method), or a falsy value when the page could not be loaded.
        base_url: URL the pager link is resolved against.

    Returns:
        The resolved next-page URL as a string, or ``None`` when
        ``current_soup`` is falsy, no pager link exists, or the link has no
        ``href``.
    """
    from urllib.parse import urljoin

    if not current_soup:
        return None

    next_page_link = current_soup.find('a', {'title': 'Go to next page'})
    if not next_page_link:
        return None

    href = next_page_link.get('href')
    if not href:
        return None

    # urljoin gives the same result as plain concatenation for query-only
    # hrefs such as '?page=1', and also handles root-relative or absolute
    # hrefs, which concatenation would turn into a malformed URL.
    next_page_url = urljoin(base_url, href)
    print(f"Next page URL: {next_page_url}")
    return next_page_url

def download_all_pdfs(base_url, download_folder, delay_seconds=5):
    """Walk the paginated listing from ``base_url`` and download every PDF.

    Opens one ``requests.Session`` with browser-like headers, processes each
    page with ``download_pdfs_from_page`` and follows the link returned by
    ``get_next_page_url`` until there is no next page, a page fails to load,
    or a page URL repeats.

    Args:
        base_url: First listing page; also used as the ``Referer`` header and
            as the base for resolving next-page links.
        download_folder: Directory the PDFs are written to.
        delay_seconds: Pause between consecutive pages, to avoid rate limiting.

    Returns:
        None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': base_url
    }

    visited = set()  # page URLs already processed, to stop on a pager cycle

    with requests.Session() as session:
        session.headers.update(headers)
        current_url = base_url
        while current_url:
            if current_url in visited:
                print(f"Already processed {current_url}, stopping to avoid a pagination loop.")
                break
            visited.add(current_url)

            print(f"Processing page: {current_url}")
            current_soup = download_pdfs_from_page(current_url, session, download_folder)
            if current_soup is None:
                print("Failed to download from the current page, stopping the script.")
                break

            current_url = get_next_page_url(current_soup, base_url)
            if current_url:
                # Only wait when another page will actually be requested.
                time.sleep(delay_seconds)

# Build the target folder from the current user's home directory instead of a
# hard-coded '/Users/<name>' path, so the cell runs unchanged on another account.
download_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'SECLITIgation')
base_url = 'https://www.sec.gov/litigation/litreleases'

download_all_pdfs(base_url, download_folder)


In [24]:
import os

import pandas as pd

# Folder where the Excel files are located. Derived from the current user's
# home directory (with a trailing separator so `path + name` keeps working)
# instead of a hard-coded '/Users/<name>/Desktop/' string.
path = os.path.join(os.path.expanduser('~'), 'Desktop', '')

# Load the two Excel files
df1 = pd.read_excel(path + '1.xlsx')
df2 = pd.read_excel(path + '2.xlsx')

# Inner merge on 'Respondents': only rows whose value appears in both files
# are kept, and repeated values on either side produce one row per pairing.
combined_df = pd.merge(df1, df2, on='Respondents', how='inner')

# Save the combined dataframe to a new Excel file in the same folder
combined_df.to_excel(path + 'combined.xlsx', index=False)


In [26]:
import os

import pandas as pd

# Folder where the Excel file is located. Derived from the current user's
# home directory (with a trailing separator so `path + name` keeps working)
# instead of a hard-coded '/Users/<name>/Desktop/' string.
path = os.path.join(os.path.expanduser('~'), 'Desktop', '')

# Load the workbook whose columns are to be reordered.
combined_df = pd.read_excel(path + 'SECLitigation.xlsx')

# Reorder the columns to put 'Date' first. Selecting with a list also drops
# any column that is not named here and raises KeyError if one is missing.
# NOTE(review): 'PDF  Link' contains two spaces — confirm it matches the
# header in the workbook exactly.
columns_order = ['Date', 'Respondents', 'Release No','Case Number', 'Filename', 'Document Type','PDF  Link']
combined_df = combined_df[columns_order]

# Save the reordered dataframe to a new Excel file
combined_df.to_excel(path + 'reordered_combined.xlsx', index=False)
