## Collate SEC 10K and 10Q filings

This script turns SEC filings into manageable dataframes for analysis.
© Emmanuel Mensah Boateng

In [23]:
#!/usr/bin/env python
#title           :10X_Processor.py
#description     :Collates SEC 10-K and 10-Q filings into per-year DataFrames.
#author          :Emmanuel Mensah Boateng
#date            :20240915
#version         :1.1
#usage           :python 10X_Processor.py
#notes           :
#==============================================================================

import os
import re
import pandas as pd
import time

# Output column -> item number to extract, per form family.  10-Q items are
# stored under the column of the corresponding 10-K item so that both form
# types share one schema: 10-Q Item 2 (MD&A) -> item_7, Item 3 (Market Risk)
# -> item_7a, Item 1 (Financial Statements) -> item_8.
_ITEMS_10Q = (('item_1a', '1A'), ('item_7', '2'), ('item_7a', '3'), ('item_8', '1'))
_ITEMS_10K = (('item_1a', '1A'), ('item_7', '7'), ('item_7a', '7A'), ('item_8', '8'))


def _header_field(pattern, header_data, default=None):
    """Return the stripped first capture group of `pattern` in `header_data`, else `default`."""
    match = re.search(pattern, header_data)
    return match.group(1).strip() if match else default


def _split_period(period):
    """Return (year, quarter, 'YYYY-MM-DD') for a YYYYMMDD string, or three Nones."""
    if not period:
        return None, None, None
    # errors='coerce' turns eight digits that are not a real calendar date
    # (e.g. 19940231) into NaT instead of raising and aborting the whole run.
    date = pd.to_datetime(period, format='%Y%m%d', errors='coerce')
    if pd.isna(date):
        return None, None, None
    return date.year, date.quarter, date.strftime('%Y-%m-%d')


# Function to get company name, siccd, and main text (excluding <SEC-Header>)
def process_filing_file(file_path):
    """Parse one filing text file into a flat record of header metadata and item text.

    The header fields are read from the ``<SEC-Header>...</SEC-Header>`` block;
    the item sections are extracted from the text that remains after every such
    block has been removed.

    Args:
        file_path: Path of the filing text file to read.

    Returns:
        A dict with the keys ``file_name``, ``company_name``, ``cik``,
        ``siccd``, ``filing_type``, ``report_period``, ``year``, ``quarter``,
        ``formatted_date``, ``item_1a``, ``item_7``, ``item_7a`` and
        ``item_8``; or ``None`` when the file has no ``<SEC-Header>`` block
        (a message is printed) or its submission type is not exactly 10-K,
        10-K/A, 10-Q or 10-Q/A.
    """
    # errors='replace' keeps the platform's default encoding but substitutes
    # undecodable bytes, so a single stray byte cannot raise UnicodeDecodeError.
    with open(file_path, 'r', errors='replace') as file:
        data = file.read()

    # Get the content in <SEC-Header> tag
    header_content = re.search(r'<SEC-Header>(.*?)</SEC-Header>', data, re.DOTALL)
    if not header_content:
        print(f"SEC-Header not found in the document {file_path}.")
        return None
    header_data = header_content.group(1)

    # Determine the filing type: only 10-Q(/A) or 10-K(/A).  The (?!\S) guard
    # requires the type to end at whitespace/end of text, so variants such as
    # 10-K405, 10-KSB or 10-QSB are skipped rather than mislabelled as 10-K/10-Q.
    filing_type = _header_field(
        r'CONFORMED SUBMISSION TYPE:\s*(10-[KQ](?:/A)?)(?!\S)', header_data
    )
    if filing_type is None:
        return None

    company_name = _header_field(r'COMPANY CONFORMED NAME:\s*(.+)', header_data, "Unknown")
    cik = _header_field(r'CENTRAL INDEX KEY:\s*(\d+)', header_data)
    # siccd: first run of digits after the classification label on that line
    sic_code = _header_field(
        r'\s*STANDARD\s+INDUSTRIAL\s+CLASSIFICATION\s*:\s*.*?(\d+)', header_data
    )
    period = _header_field(r'CONFORMED PERIOD OF REPORT:\s+(\d{8})', header_data)
    year, quarter, formatted_date = _split_period(period)

    # Delete the <SEC-Header> content so item extraction only sees the main text
    cleaned_text = re.sub(r'<SEC-Header>.*?</SEC-Header>', '', data, flags=re.DOTALL).strip()

    record = {
        'file_name': os.path.basename(file_path),
        'company_name': company_name,
        'cik': cik,
        'siccd': sic_code,
        'filing_type': filing_type,
        'report_period': period,
        'year': year,
        'quarter': quarter,
        'formatted_date': formatted_date,
    }
    item_layout = _ITEMS_10Q if filing_type.startswith('10-Q') else _ITEMS_10K
    for column, item_number in item_layout:
        record[column] = extract_item_section(cleaned_text, item_number)
    return record

# Function to extract sections based on item number
def extract_item_section(text, item_number):
    """Return the first "Item <item_number>" section of `text` on a single line.

    The section starts at the first case-insensitive occurrence of
    ``ITEM <item_number>`` followed by a dot or whitespace, and runs until
    just before the next line that begins with ``ITEM <digit>``, or to the end
    of the text.  Newlines and carriage returns inside the section are turned
    into spaces and the result is stripped.

    Args:
        text: Text to search.
        item_number: Item label such as '1', '1A' or '7A'; it is interpolated
            into the regular expression as-is.

    Returns:
        The flattened section text, or None when no such item is found.
    """
    section_regex = re.compile(
        fr"ITEM\s*{item_number}[\.\s\r\n]+[\s\S]*?(?=\nITEM\s*\d|\Z)",
        re.IGNORECASE,
    )
    found = section_regex.search(text)
    if found is None:
        return None

    # One pass that maps both line-break characters to a space.
    line_breaks_to_spaces = str.maketrans('\n\r', '  ')
    return found.group(0).translate(line_breaks_to_spaces).strip()
    
# Function to recursively find all text files in a folder structure and process them
def process_all_filings(folder_path):
    """Collect the parsed records of every ``.txt`` file under `folder_path`.

    The folder is walked recursively with ``os.walk``.  Each file whose name
    ends in ``.txt`` is handed to ``process_filing_file``; falsy results
    (e.g. ``None``) are dropped.

    Args:
        folder_path: Root folder to search.

    Returns:
        A pandas DataFrame built from the list of kept records (empty when
        nothing was kept).
    """
    records = []

    for root, _subdirs, names in os.walk(folder_path):
        # Filings are assumed to be stored as .txt files
        txt_paths = (os.path.join(root, name) for name in names if name.endswith('.txt'))
        for txt_path in txt_paths:
            parsed = process_filing_file(txt_path)
            if not parsed:
                continue
            records.append(parsed)

    return pd.DataFrame(records)

def process_year_subfolder(folder_path):
    """Process each immediate subfolder of `folder_path` and save one table per subfolder.

    For every directory entry that is itself a directory (e.g. 2001, 2002),
    all filings below it are collected with ``process_all_filings`` and the
    resulting DataFrame is written into that same subfolder as
    ``10X_<name>.pkl`` and ``10X_<name>.parquet``.  The start of each
    subfolder and the elapsed wall-clock time are printed.

    Args:
        folder_path: Base folder containing the year subfolders.

    Returns:
        None.
    """
    for subfolder_year in os.listdir(folder_path):
        year_folder_path = os.path.join(folder_path, subfolder_year)

        # Entries that are not directories are ignored.
        if not os.path.isdir(year_folder_path):
            continue

        print(f"Processing {subfolder_year} folder: {year_folder_path}")
        started_at = time.time()

        yearly_frame = process_all_filings(year_folder_path)

        # Both outputs sit at the same level as the quarterly subfolders.
        pickle_target = os.path.join(year_folder_path, f'10X_{subfolder_year}.pkl')
        parquet_target = os.path.join(year_folder_path, f'10X_{subfolder_year}.parquet')
        yearly_frame.to_pickle(pickle_target)
        yearly_frame.to_parquet(parquet_target)

        finished_at = time.time()
        execution_time = finished_at - started_at
        print(f"Time taken to process {subfolder_year}: {execution_time} seconds\n")

# Base folder containing year subfolders (e.g., \10-X\).
# The path is relative, so it is resolved against the current working directory.
folder_path = '../Data/10-X/'

# Process all the year folders within the base folder.
# This call sits at module level, so the full run starts as soon as this
# cell/script is executed.
process_year_subfolder(folder_path)


Processing 1993 folder: ../Data/10-X/1993
Time taken to process 1993: 0.5585098266601562 seconds

Processing 1994 folder: ../Data/10-X/1994
Time taken to process 1994: 380.0815489292145 seconds

Processing 1995 folder: ../Data/10-X/1995
Time taken to process 1995: 731.1101634502411 seconds

Processing 1996 folder: ../Data/10-X/1996
Time taken to process 1996: 1476.5094397068024 seconds

Processing 1997 folder: ../Data/10-X/1997
Time taken to process 1997: 1557.898642539978 seconds

Processing 1998 folder: ../Data/10-X/1998
Time taken to process 1998: 1956.9836626052856 seconds

Processing 1999 folder: ../Data/10-X/1999
Time taken to process 1999: 1991.8075802326202 seconds

Processing 2000 folder: ../Data/10-X/2000
Time taken to process 2000: 1962.3905489444733 seconds

Processing 2001 folder: ../Data/10-X/2001
Time taken to process 2001: 1830.883445262909 seconds

Processing 2002 folder: ../Data/10-X/2002
Time taken to process 2002: 1873.859039068222 seconds

Processing 2003 folder: .

In [None]:
# NOTE(review): in the code shown above, `df_filings` is only assigned inside
# process_year_subfolder and `year` only inside process_filing_file, so neither
# name is bound at module level — this cell presumably relied on leftover
# interactive state; confirm before running.
df_filings.to_pickle(year)

Processing 1993 folder: ../Data/10-X/1993
Time taken to process 1993: 1.0285255908966064 seconds

Processing 1994 folder: ../Data/10-X/1994
Time taken to process 1994: 529.5554234981537 seconds

Processing 1995 folder: ../Data/10-X/1995
Time taken to process 1995: 1010.5450804233551 seconds

Processing 1996 folder: ../Data/10-X/1996
Time taken to process 1996: 2189.3113400936127 seconds

Processing 1997 folder: ../Data/10-X/1997
Time taken to process 1997: 2087.8048317432404 seconds

Processing 1998 folder: ../Data/10-X/1998
Time taken to process 1998: 2717.9122655391693 seconds

Processing year folder: ../Data/10-X/1999
Time taken to process 1999: 0.03589177131652832 seconds

Processing 2000 folder: ../Data/10-X/2000
Time taken to process 2000: 2811.5032114982605 seconds

Processing year folder: ../Data/10-X/2001
Time taken to process 2001: 1949.109524488449 seconds

Processing year folder: ../Data/10-X/2002
Time taken to process 2002: 2511.8194739818573 seconds

Processing year folder: ../Data/10-X/2003
Time taken to process 2003: 2711.0055890083313 seconds

Processing year folder: ../Data/10-X/2004
Time taken to process 2004: 2714.805201768875 seconds

Processing year folder: ../Data/10-X/2005
Time taken to process 2005: 2472.6729593276978 seconds

Processing year folder: ../Data/10-X/2006
Time taken to process 2006: 2320.9473762512207 seconds

Processing year folder: ../Data/10-X/2007
Time taken to process 2007: 1572.4861545562744 seconds

Processing year folder: ../Data/10-X/2008
Time taken to process 2008: 39.014293909072876 seconds

Processing year folder: ../Data/10-X/2009
Time taken to process 2009: 2206.9212749004364 seconds

Processing year folder: ../Data/10-X/2010
Time taken to process 2010: 2447.032383441925 seconds

Processing year folder: ../Data/10-X/2011
Time taken to process 2011: 2344.9236290454865 seconds

Processing year folder: ../Data/10-X/2012
Time taken to process 2012: 2080.6926894187927 seconds

Processing year folder: ../Data/10-X/2013
Time taken to process 2013: 2177.288468360901 seconds

Processing year folder: ../Data/10-X/2014
Time taken to process 2014: 2268.6661620140076 seconds

Processing year folder: ../Data/10-X/2015
Time taken to process 2015: 2.877331495285034 seconds

Processing year folder: ../Data/10-X/2016
Time taken to process 2016: 2055.654616832733 seconds

Processing year folder: ../Data/10-X/2017
Time taken to process 2017: 1997.4830532073975 seconds

Processing year folder: ../Data/10-X/2018
Time taken to process 2018: 1793.5029277801514 seconds

Processing year folder: ../Data/10-X/2019
Time taken to process 2019: 1750.7520945072174 seconds

Processing year folder: ../Data/10-X/2020
Time taken to process 2020: 2038.4034173488617 seconds

Processing year folder: ../Data/10-X/2021
Time taken to process 2021: 2425.4822454452515 seconds

Processing 2022 folder: ../Data/10-X/2022
Time taken to process 2022: 2233.775817632675 seconds

Processing year folder: ../Data/10-X/2023
Time taken to process 2023: 2527.53492808342 seconds
