In [83]:
import os
import re
from sec_edgar_downloader import Downloader
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
import csv

In [96]:
def find_string_in_html(html_content, search_string):
    """Report whether a ``<th class="tl">`` cell's text equals ``search_string``.

    Args:
        html_content: HTML markup to parse.
        search_string: Text that a header cell must match exactly
            (case-sensitive, whole-text comparison; substrings do not count).

    Returns:
        True if at least one ``<th>`` element with class ``tl`` has text,
        as produced by ``get_text(strip=True)``, equal to ``search_string``;
        False otherwise.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    header_cells = document.find_all('th', class_='tl')

    # Equal strings necessarily have equal length, so equality alone is the
    # full match condition.
    return any(
        cell.get_text(strip=True) == search_string
        for cell in header_cells
    )

In [92]:


def merge_tables(existing_df, new_df):
    """Outer-merge ``new_df`` into ``existing_df`` on the columns they share.

    Args:
        existing_df: DataFrame accumulated so far. When it is empty
            (``existing_df.empty``), ``new_df`` is returned unchanged.
        new_df: DataFrame to fold into ``existing_df``.

    Returns:
        ``new_df`` itself when ``existing_df`` is empty. Otherwise the outer
        merge of both frames on every column name present in both. When the
        frames share no column at all, no row can match, so the rows of both
        frames are stacked instead (missing cells are filled with NaN).
    """
    if existing_df.empty:
        return new_df

    # Collect the shared columns in existing_df's column order. Going through
    # a set intersection gives a key list whose order is not stable between
    # runs; dict.fromkeys also drops repeated labels while keeping order.
    new_cols = set(new_df.columns)
    overlapping_cols = list(
        dict.fromkeys(col for col in existing_df.columns if col in new_cols)
    )

    if not overlapping_cols:
        # pd.merge cannot join on an empty key list; with nothing to match
        # on, an outer join degenerates to the union of both row sets.
        return pd.concat([existing_df, new_df], ignore_index=True, sort=False)

    return pd.merge(existing_df, new_df, on=overlapping_cols, how='outer')

def write_csv(df, search_string):
    """Save ``df`` as ``merged_table_data_<search_string>.csv``.

    The file is written relative to the current working directory and the
    DataFrame index is not included in the output.

    Args:
        df: DataFrame to save.
        search_string: Text embedded in the output file name.
    """
    output_name = f"merged_table_data_{search_string}.csv"
    df.to_csv(output_name, index=False)

def change_to_df(html_content, search_string):
    """Merge every ``<table class="report">`` in the HTML and save it as CSV.

    Each matching table is converted to a DataFrame with ``pd.read_html``
    (only the first frame parsed from each table is used), folded into a
    running result via ``merge_tables``, and the final result is handed to
    ``write_csv``. Nothing is returned.

    Args:
        html_content: HTML markup containing the report tables.
        search_string: Passed through to ``write_csv`` to name the output.
    """
    report_tables = BeautifulSoup(html_content, "html.parser").find_all(
        "table", attrs={"class": "report"}
    )

    # Start from an empty frame so the first table becomes the initial result.
    combined = pd.DataFrame()
    for table_markup in map(str, report_tables):
        frame = pd.read_html(StringIO(table_markup))[0]
        combined = merge_tables(combined, frame)

    write_csv(combined, search_string)


In [89]:
def _is_report_page(file_name):
    """Return True for files named ``R<digits>.htm`` or ``R<digits>.html``."""
    stem, extension = os.path.splitext(file_name)
    return (
        extension.lower() in (".htm", ".html")
        and stem.startswith("R")
        and stem[1:].isdigit()
    )


def _report_page_order(file_name):
    """Sort key that places ``R2.htm`` before ``R10.htm``."""
    digits = os.path.splitext(file_name)[0][1:]
    # Shorter digit runs first, then lexicographic; avoids int() so unusual
    # digit characters accepted by str.isdigit() cannot raise.
    return (len(digits), digits, file_name)


def combine_txt_files(ticker_dir, search_string):
    """Collect the report pages of a ticker's 10-K filings that match a heading.

    Walks every sub-directory of ``sec-edgar-filings/<ticker_dir>/10-K``,
    reads each ``R<digits>.htm``/``.html`` page, and for every page where
    ``find_string_in_html`` reports a match:

    * prints a "Found ..." message,
    * calls ``change_to_df`` on the page, and
    * appends the page's HTML to the combined output.

    The matching pages, joined by newlines, are written to
    ``<ticker_dir>_<search_string>.htm`` inside the 10-K directory. That file
    is written even when no page matched (it is then empty).

    Despite the function's name, the inputs are HTML pages, not ``.txt`` files.

    NOTE(review): ``change_to_df`` writes to a CSV whose name depends only on
    ``search_string``, so when several pages match, each one overwrites the
    CSV produced for the previous match.

    Args:
        ticker_dir: Directory name of the ticker under ``sec-edgar-filings``.
        search_string: Heading text to look for; also used in the output name.

    Raises:
        OSError: If the ticker's 10-K directory cannot be listed, or a page
            or the combined file cannot be opened.
    """
    ticker_path = os.path.join("sec-edgar-filings", ticker_dir, "10-K")

    matched_pages = []

    # os.listdir / os.walk give entries in arbitrary order; sorting keeps the
    # combined output reproducible from run to run.
    for sub_dir in sorted(os.listdir(ticker_path)):
        sub_dir_path = os.path.join(ticker_path, sub_dir)
        if not os.path.isdir(sub_dir_path):
            continue

        for root, dirs, files in os.walk(sub_dir_path):
            dirs.sort()  # in-place sort steers os.walk's descent order
            report_pages = sorted(
                (name for name in files if _is_report_page(name)),
                key=_report_page_order,
            )
            for file in report_pages:
                # Decode explicitly so the result does not depend on the
                # platform's default encoding; undecodable bytes are replaced
                # rather than aborting the whole run.
                with open(
                    os.path.join(root, file),
                    "r",
                    encoding="utf-8",
                    errors="replace",
                ) as f:
                    html_content = f.read()

                # The file handle is closed before the heavier parsing work.
                if find_string_in_html(html_content, search_string):
                    print(f"Found '{search_string}' in {file}")
                    change_to_df(html_content, search_string)
                    matched_pages.append(html_content)

    combined_content = "\n".join(matched_pages)

    combined_file_path = os.path.join(ticker_path, f"{ticker_dir}_{search_string}.htm")
    with open(combined_file_path, "w", encoding="utf-8") as combined_file:
        combined_file.write(combined_content)


In [97]:
# Example usage: run the extraction for every ticker listed below.
companies = ["AAPL"]  # Add more tickers if needed
# Filing-year bounds. In the visible cells they are referenced only by the
# commented-out download call below, so they currently have no effect here.
start_year = 2021
end_year = 2022

for company in companies:
    # The download step is disabled; combine_txt_files reads filings that are
    # already on disk under sec-edgar-filings/<ticker>/10-K.
    # download_10k_filings(company, start_year, end_year)
    combine_txt_files(company,"Revenue Recognition")
    

Found 'Revenue Recognition' in R9.htm
