In [None]:
import pandas as pd
# from sec_edgar_api import EdgarClient
# from sec_edgar_downloader import Downloader
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import glob
import logging
from typing import Literal

# Download sec-edgar files

In [None]:
# Create the sec-edgar-api client, identifying the requester through the user_agent string.
# NOTE(review): `EdgarClient` is only imported in a commented-out line of the import cell,
# so this cell raises NameError on a fresh kernel unless that import is restored.
edgar = EdgarClient(user_agent="<Sample Company Name> <Admin Contact>@<Sample Company Domain>")
edgar

In [None]:
# Display the submissions payload for CIK 320193 (depends on `edgar` from the previous cell).
edgar.get_submissions(cik="320193")

In [None]:
# Display the company-facts payload for CIK 789019 (depends on `edgar` defined above).
edgar.get_company_facts(cik="789019")

# Use sec-edgar-downloader

In [None]:
# NOTE(review): `Downloader` is only imported in a commented-out line of the import cell,
# so this cell raises NameError on a fresh kernel unless that import is restored.
dl = Downloader()

# Request the 10-Q filings for CIK 0000789019.
dl.get("10-Q", "0000789019")

# Parse and Extract from .HTML

In [None]:
# <table>, <tr>, <td>, and <th>: These tags are used to create tables. 
# <table> is for the table itself,
# <tr> defines a row,
# <td> defines a cell, and 
# <th> defines a header cell.

In [None]:
# Find all .html files three directory levels below the CIK 0000320193 download folder.
# NOTE(review): the pattern is an absolute path on one specific Windows machine; the notebook
# is not portable until this is made relative or configurable. glob.glob also gives no
# ordering guarantee, so positional lookups such as html_files[6] later on are fragile.
html_files = glob.glob(r"D:\lianz\Desktop\Python\personal_projects\sec_data\sec-edgar-filings\0000320193\*\*\*.html")
html_files

## Read HTML File

In [None]:
def read_html(file: str):
    """Parse an HTML file from disk.

    Args:
        file (str): Path of the HTML file to open (text mode, platform default encoding).

    Returns:
        BeautifulSoup: The document parsed with the built-in 'html.parser'.
    """
    with open(f'{file}', 'r') as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'html.parser')

## Find target table

In [None]:
def find_target_table(target_text, soup, search_type: Literal['loose', 'strict'] = 'strict'):
    """Locate the <div> elements (or the tables following them) whose text matches a pattern.

    Args:
        target_text: Compiled regex (anything exposing ``.search``) used to test each div's text.
        soup: BeautifulSoup object (anything exposing ``find_all('div')``).
        search_type: 'loose' or 'strict'. Defaults to 'strict', the mode every caller in
            this notebook uses.
            - loose: return the matching <div> elements themselves. The div text is
              lower-cased before matching, so the pattern must be written in lower case.
            - strict: return, for each matching <div>, the next <table> that follows it.
              The div text is matched as-is (case-sensitive unless the pattern says otherwise).

    Returns:
        list: Matching elements. In 'strict' mode an entry is None when no table follows
        the matching div, and the same table can appear several times when several
        matching divs precede it; entries are kept so positions stay stable.

    Raises:
        ValueError: If ``search_type`` is neither 'loose' nor 'strict'.
    """
    # The original signature used ``search_type=Literal[...]`` (a default value, not an
    # annotation), so omitting the argument silently matched neither branch and returned None.
    if search_type not in ('loose', 'strict'):
        raise ValueError(f"search_type must be 'loose' or 'strict', got {search_type!r}")

    divs = soup.find_all('div')  # find_all returns a list-like of elements

    if search_type == 'loose':
        return [div for div in divs if target_text.search(div.text.lower())]

    return [div.find_next('table') for div in divs if target_text.search(div.text)]

## Extract Table from HTML

In [None]:
def _span_count(cell, attribute):
    """Read a colspan/rowspan attribute as an int >= 1, falling back to 1 when missing or malformed."""
    try:
        count = int(cell.get(attribute, 1))
    except (TypeError, ValueError):
        return 1
    return max(count, 1)


def _place_carried(cols, carried, next_carried):
    """Append every rowspan cell carried from earlier rows that lands on the current column."""
    while len(cols) in carried:
        text, remaining = carried.pop(len(cols))
        if remaining > 1:
            # Still spans further rows: keep carrying it at the same column position.
            next_carried[len(cols)] = (text, remaining - 1)
        cols.append(text)


def extract_table_from_html(table_html):
    """Convert an HTML <table> element into a DataFrame of cell texts.

    Args:
        table_html: BeautifulSoup <table> element (anything exposing ``find_all('tr')``
            whose rows expose ``find_all(['td', 'th'])``).

    Returns:
        DataFrame: One row per <tr>, one column per grid position. A cell with
        ``colspan=n`` is repeated across n columns of its row; a cell with ``rowspan=n``
        is repeated at the same column position in the following n-1 rows. Empty cell
        texts are replaced with NaN. Rows of unequal length are padded by pandas.
    """
    data = []  # row x col, e.g. [[1, 2, 3], [1, 2, 3]] = 2 rows, 3 columns
    carried = {}  # column position -> (text, number of following rows still to fill)

    for row in table_html.find_all('tr'):
        cols = []
        next_carried = {}

        for cell in row.find_all(['td', 'th']):
            text = cell.text.strip()
            colspan = _span_count(cell, 'colspan')
            rowspan = _span_count(cell, 'rowspan')

            for _ in range(colspan):
                # Cells spanning down from earlier rows occupy their column first.
                _place_carried(cols, carried, next_carried)
                if rowspan > 1:
                    next_carried[len(cols)] = (text, rowspan - 1)
                cols.append(text)

        # Rowspan cells positioned at or beyond the end of this row's own cells.
        while carried:
            cols.extend([''] * (min(carried) - len(cols)))
            _place_carried(cols, carried, next_carried)

        carried = next_carried
        data.append(cols)

    df = pd.DataFrame(data)
    return df.replace('', np.nan)

## Clean df

In [None]:
def clean_df(df):
    """
    Reshape a raw table DataFrame into a value grid indexed by (category, row label).

    df: dataframe of cell texts, as produced by extract_table_from_html.
        NOTE(review): the header detection assumes a default 0..n-1 row index, because the
        label returned by idxmax() is reused as a positional iloc bound.
        NOTE(review): `df.columns = header` is applied to the object passed in, so the
        caller's dataframe gets its column labels overwritten as a side effect.

    Steps:
    1. Take every row above the first row whose first column is non-null as header rows,
       and join them per column (space separated) to build the column names
    2. Drop those header rows, then drop columns where all values are NaN
    3. Replace empty strings with NaN
    4. Forward fill NaN values across each row (axis=1, left to right)
    5. Remove rows where all values are NaN
    6. Reset index after dropping rows
    7. Drop columns that contain a cell equal to '$' or '%', or whose name contains
       'change' (case-insensitive); then drop columns with duplicated names
    8. Rows whose cells are all identical are treated as category headings: record their
       position and value
    9. Add a 'Category' column holding, for each row, the most recent category heading
    10. Drop the category heading rows themselves
    11. Remove ':' characters from string cells
    12. Use (Category, first column) as a two-level row index and drop those two columns

    Returns a cleaned dataframe.
    """

    # Index label of the first row with a non-null value in the first column.
    first_row = df.iloc[:, 0].notnull().idxmax()
    # Join all rows above first_row, column by column, to form the header
    header = df.iloc[0:first_row].fillna('').agg(' '.join).str.strip()
    df.columns = header

    # Remove the rows used for header and reset index
    df = df.iloc[first_row:].reset_index(drop=True)

    # Remove columns where all values are NaN
    df = df.dropna(how='all', axis=1)

    # Replace empty strings with NaN
    df.replace("", np.nan, inplace=True)

    # Forward fill NaN values across each row (axis=1)
    # NOTE(review): fillna(method=...) is deprecated in recent pandas; DataFrame.ffill is the replacement.
    df.fillna(method='ffill', axis=1, inplace=True)

    # Remove rows where all values are NaN
    df = df.dropna(how='all', axis=0)

    # Reset index after dropping rows
    df.reset_index(drop=True, inplace=True)

    # Keep a column only if none of its cells equals '$' or '%' and its name does not contain 'change'
    col_to_keep = [i for i,x in enumerate(df.columns) if '$' not in df.iloc[:, i].values and '%' not in df.iloc[:, i].values and 'change' not in x.lower()]
    df = df.iloc[:, col_to_keep]
    # Of columns sharing the same name, keep only the first
    df = df.loc[:, ~df.columns.duplicated(keep='first')]

    # if row only contains same element then store the index and value in a dictionary
    row_to_keep = {}
    for i in range(len(df)):
        if len(set(df.iloc[i, :])) == 1:
            row_to_keep[i] = df.iloc[i, 0]

    # Use the dictionary to assign the value as first level of multiindex for the rows in between the rows to keep
    df['Category'] = df.iloc[:, 0]
    for i in range(len(df)):
        if i in row_to_keep.keys():
            df.iloc[i, -1] = row_to_keep[i]
        else:
            # Inherit the category of the previous row.
            # NOTE(review): when row 0 is not a category heading, i-1 is -1 and this reads
            # the LAST row's Category (still its first-column value at that point).
            df.iloc[i, -1] = df.iloc[i-1, -1]


    # row_to_keep is reused here as a list: positions of rows that are NOT category headings
    row_to_keep = []
    for i in range(len(df)):
        if len(set(df.iloc[i, :])) != 1:
            row_to_keep.append(i)

    df = df.iloc[row_to_keep, :]
    # NOTE(review): `multiindex` is never used afterwards.
    multiindex = df.iloc[:,[-1,0]]

    # Strip ':' characters out of string cells (regex substitution, not whole-cell replacement)
    df = df.replace(to_replace=':', value='', regex=True)

    # Combine two columns to create a multiindex for the rows
    df.index = pd.MultiIndex.from_arrays([df.iloc[:,-1].values, df.iloc[:,0].values])
    # Drop the first column and the trailing Category column, which now live in the index
    df = df.iloc[:, 1:-1]
    return df

In [None]:
# Configure the root logger: DEBUG and above go to 'loop_logs.log', which is truncated on
# each configuration (filemode='w').
# NOTE(review): logging.basicConfig only takes effect the first time the root logger is
# configured in a kernel; a later basicConfig call in this notebook targets a different file.
logging.basicConfig(level=logging.DEBUG, filename='loop_logs.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# Probe a single filing: the 7th path returned by glob (position depends on glob ordering).
file_to_read = html_files[6]
# The second-to-last backslash-separated path component is used as a display name
# (assumes Windows-style separators in the path).
file_name = file_to_read.split("\\")[-2]
print(f'Reading html files... {file_name}')
html_soup = read_html(file_to_read)
# 'strict' search: collect the table following each <div> whose text matches the pattern.
# The div text is not lower-cased in this mode, so the pattern is case-sensitive as written.
target_table = find_target_table(re.compile(r'.*(Products and services|Net sales|Sales Data|net sales by operating segment).*'), html_soup, search_type='strict')

In [None]:
# Pick the 5th candidate table by position (hand-picked for this particular file).
df = extract_table_from_html(target_table[4])
df = clean_df(df)
# Unpivot: move the column labels into rows, giving one value per (category, segment, column).
df = df.stack(level=0).reset_index(level=2).reset_index()
df.columns = ['Category', 'Segment', 'Date_info', 'Value']
# NOTE(review): the result of to_dict is discarded; only the trailing `df` is displayed.
df.to_dict('records')
df

In [None]:
# Batch version of the single-file pipeline: read -> find table -> extract -> clean ->
# stack -> concatenate, logging each step, then split the date text and export to CSV.
all_df = pd.DataFrame()
number_of_files = len(html_files)
for file in html_files[:]:
    logging.info(file)
    # Second-to-last backslash-separated path component, used as a label in log messages.
    file_name = file.split("\\")[-2]
    
    try:
        html_soup = read_html(file)
        logging.info(f'Step 1: {file_name}: Read html successfully.')
    
        target_table = find_target_table(re.compile(r'.*(Sales Data|net sales by operating segment|net sales|products and services).*'), html_soup, search_type='strict')
        logging.info(f'Step 2: {file_name}: Found target table successfully.')

        assert target_table is not None, f'{file_name}: Target table not found.'
        # NOTE(review): find_target_table returns a LIST of tables, but
        # extract_table_from_html expects a single table element (it calls .find_all('tr')).
        # As written this presumably raises inside the try block and is swallowed by the
        # except below — confirm which table should be selected.
        df = extract_table_from_html(target_table).iloc[:, :]
        logging.info(f'Step 3: {file_name}: Extracted table successfully.')

        df = clean_df(df)
        logging.info(f'Step 4: {file_name}: Cleaned table successfully.')

        df = df.stack(level=0).reset_index(level=2).reset_index()
        logging.info(f'Step 5: {file_name}: Stacked table successfully.')
    
        df.columns = ['Category', 'Segment', 'Date_info', 'Value']
        logging.info(f'Step 6: {file_name}: Renamed columns successfully.')
    
        all_df = pd.concat([all_df, df], axis=0, ignore_index=True)
        logging.info(f'Step 7: {file_name}: Concatenated table successfully.')
    except Exception as e:
        # NOTE(review): this message always says "Step 7", whichever step actually failed;
        # the real cause is only in the logging.error line that follows.
        logging.info(f'Step 7: {file_name}: Concatenated table unsuccessfully.')
        logging.error(e)
    
    number_of_files -= 1
    logging.info(f'Number of files left: {number_of_files}')
    # NOTE(review): unconditional break — only the first file is ever processed.
    break
# NOTE(review): if no file was concatenated, all_df is empty and has no 'Date_info' column,
# so the lines below raise KeyError.
# Extract a date such as "December 31, 2022" from the column label text
all_df['Date'] = all_df['Date_info'].str.extract(r"([A-Za-z]+\s\d{1,2},\s\d{4})")

# Extract the first three consecutive words (e.g. "Three Months Ended") from the column label text
all_df['Time Leading To'] = all_df['Date_info'].str.extract(r"([A-Za-z]+\s[A-Za-z]+\s[A-Za-z]+)")

all_df.drop(columns=['Date_info'], inplace=True)

# reorder columns
all_df = all_df[['Category', 'Segment', 'Time Leading To', 'Date', 'Value']]
all_df.to_csv('all_df.csv', index=False)

In [None]:
# Manual re-implementation of find_target_table's 'strict' mode, used to inspect candidates.
html_soup = read_html(html_files[6])
# NOTE(review): the pattern is lower-case, but the active line below matches against the
# original-case div text (only the commented-out 'loose' variant lower-cases it).
target_text = re.compile(r'.*(sales data|net sales|products).*')
target_element = html_soup.find_all('div')
# all_text = [i for i in target_element if target_text.search(i.text.lower())]
# For every matching div, take the next <table> after it (None when there is none).
all_text = [j.find_next('table') for j in [i for i in target_element if target_text.search(i.text)]]
len(all_text)

In [None]:
# Scan the first 22 candidate tables and keep the LAST one that yields a non-empty frame.
# NOTE(review): `actual` stays undefined (NameError below) if every candidate is empty,
# and a None entry in all_text would fail inside extract_table_from_html.
for i in all_text[:22]:
    table = extract_table_from_html(i)
    if len(table) != 0:
        actual = table
df = clean_df(actual)
# Unpivot the column labels into rows.
df = df.stack(level=0).reset_index(level=2).reset_index()

df.columns = ['Category', 'Segment', 'Date_info', 'Value']
df

In [None]:
# Show the long-format table as a list of one dict per row.
df.to_dict(orient='records')

# Using Pandas

In [None]:
# pd.read_html returns a list with one DataFrame per <table> found in each filing.
# Paths are relative to the working directory and use Windows separators.
sales_data_table_02 = pd.read_html(r"sec-edgar-filings\0000320193\10-Q\0000912057-02-004945\filing-details.html")
sales_data_table_23 = pd.read_html(r"sec-edgar-filings\0000320193\10-Q\0000320193-23-000006\filing-details.html")
sales_data_table_15 = pd.read_html(r"sec-edgar-filings\0000320193\10-Q\0001193125-15-259935\filing-details.html")

In [None]:
# Inspect the 39th table of the 0001193125-15-259935 filing (position hand-picked).
sales_data_table_15[38]

In [None]:
# Inline variant of the clean_df steps with a fixed 3-row header, applied to a table parsed
# by pd.read_html.
# NOTE(review): `df` is the same object as sales_data_table_15[38], so assigning df.columns
# below also relabels the frame stored in that list; re-running this cell starts from the
# already-modified frame.
df = sales_data_table_15[38]
# Use first three rows for the header
header = df.iloc[0:3].fillna('').agg(' '.join).str.strip()
df.columns = header

# Remove the rows used for header and reset index
df = df.iloc[3:].reset_index(drop=True)

# Remove columns where all values are NaN
df = df.dropna(how='all', axis=1)

# Replace empty strings with NaN
df.replace("", np.nan, inplace=True)

# Forward fill NaN values across each row (axis=1)
# NOTE(review): fillna(method=...) is deprecated in recent pandas; DataFrame.ffill is the replacement.
df.fillna(method='ffill', axis=1, inplace=True)

# Remove rows where all values are NaN
df = df.dropna(how='all', axis=0)

# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

# Keep a column only if none of its cells equals '$'; then keep the first of any duplicated names
col_to_keep = [i for i,x in enumerate(df.columns) if '$' not in df.iloc[:, i].values]
df = df.iloc[:, col_to_keep]
df = df.loc[:, ~df.columns.duplicated(keep='first')]

# if row only contains same element then store the index and value in a dictionary
row_to_keep = {}
for i in range(len(df)):
    if len(set(df.iloc[i, :])) == 1:
        row_to_keep[i] = df.iloc[i, 0]

# Use the dictionary to assign the value as first level of multiindex for the rows in between the rows to keep
df['Category'] = df.iloc[:, 0]
for i in range(len(df)):
    if i in row_to_keep.keys():
        df.iloc[i, -1] = row_to_keep[i]
    else:
        # Inherit the previous row's category.
        # NOTE(review): for i == 0 this reads the last row (index -1).
        df.iloc[i, -1] = df.iloc[i-1, -1]


# row_to_keep is reused as a list: positions of rows that are not category headings
row_to_keep = []
for i in range(len(df)):
    if len(set(df.iloc[i, :])) != 1:
        row_to_keep.append(i)

df = df.iloc[row_to_keep, :]
# NOTE(review): `multiindex` is never used afterwards.
multiindex = df.iloc[:,[-1,0]]

# Strip ':' characters out of string cells (regex substitution, not whole-cell replacement)
df = df.replace(to_replace=':', value='', regex=True)

# Combine two columns to create a multiindex for the rows
df.index = pd.MultiIndex.from_arrays([df.iloc[:,-1].values, df.iloc[:,0].values])
# Drop the first column and the trailing Category column, which now live in the index
df = df.iloc[:, 1:-1]

# unpivot the dataframe: move the column labels into rows
df = df.stack(level=0).reset_index(level=2).reset_index()
df


# Find a way to convert data in the tables to a key-value pair in JSON format

In [None]:
# convert df to dictionary and label with AAPL
# NOTE(review): `df` is rebound from a DataFrame to a dict here, so this cell cannot be
# re-run without re-running the cell that builds the DataFrame.
df = df.to_dict(orient='index')
df = {'AAPL': df}
df # shares in thousands, eps in per share, and others in millions

# Download and send summary as mail

In [None]:
#TODO: create script .py file to download latest earnings and send summary to email



# Test SEC API

In [34]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET
from ratelimit import limits, sleep_and_retry
import logging
from bs4 import BeautifulSoup
from tqdm import trange

# Configure the root logger: DEBUG and above appended to 'sec_logs.log' (filemode='a').
# NOTE(review): logging.basicConfig does nothing when the root logger already has handlers.
# An earlier cell in this notebook configures 'loop_logs.log'; if that cell ran first in the
# same kernel, this call has no effect unless force=True is passed.
logging.basicConfig(level=logging.DEBUG, filename='sec_logs.log', filemode='a', format='%(asctime)s - %(levelname)s - %(message)s')

class SECData:
    """Class to retrieve data from SEC Edgar database.

    Args:
        requester_company (str): Company of the requester (first part of the User-Agent)
        requester_name (str): Name of the requester
        requester_email (str): Email of the requester
        taxonomy (str): us-gaap, ifrs-full, dei, or srt

    Raises:
        ValueError: If taxonomy is not one of the following: us-gaap, ifrs-full, dei, or srt.
            Raised before any network request is made.

    Attributes:
        BASE_API_URL (str): Base url for the data.sec.gov JSON APIs
        BASE_SEC_URL (str): Base url for www.sec.gov
        BASE_DIRECTORY_URL (str): Base url of the EDGAR archive directories
        US_GAAP_TAXONOMY_URL (str): URL for us-gaap taxonomy
        ALLOWED_TAXONOMIES (list): List of allowed taxonomies
        sec_headers (dict): Headers used for www.sec.gov requests
        sec_data_headers (dict): Headers used for data.sec.gov requests
        cik (DataFrame): DataFrame containing CIK and ticker (downloaded on construction)
        tags (list): List of tags in us-gaap taxonomy (downloaded on construction)
        taxonomy (str): us-gaap, ifrs-full, dei, or srt

    Methods:
        get_cik_list: Retrieves the full list of CIK available from SEC database.
        get_ticker_cik: Get a specific ticker's CIK number.
        get_usgaap_tags: Get the list of tags in us-gaap taxonomy.
        get_submissions: Retrieves the list of submissions for a specific CIK.
        get_company_concept: Retrieves the XBRL disclosures from a single company (CIK)
            and concept (a taxonomy and tag) into a single JSON file.
        get_company_facts: Retrieves the XBRL disclosures from a single company (CIK)
            into a single JSON file.
        get_frames: Retrieves one fact for each reporting entity that is last filed that most closely fits the calendrical period requested.
        get_data_as_dataframe: Company facts of the configured taxonomy as a DataFrame.
        get_index: Retrieves the index.json of a CIK's archive directory.
    """

    BASE_API_URL = "https://data.sec.gov/"
    BASE_SEC_URL = "https://www.sec.gov/"
    BASE_DIRECTORY_URL = "https://www.sec.gov/Archives/edgar/data/"
    US_GAAP_TAXONOMY_URL = "https://xbrl.fasb.org/us-gaap/2023/elts/us-gaap-2023.xsd"
    ALLOWED_TAXONOMIES = ['us-gaap', 'ifrs-full', 'dei', 'srt']
    INDEX_EXTENSION = ['-index.html', '-index-headers.html']
    FILE_EXTENSIONS = ['.xsd', '.htm', '_cal.xml', '_def.xml', '_lab.xml', '_pre.xml', '_htm.xml', '.xml']

    def __init__(self, requester_company: str,requester_name: str, requester_email: str, taxonomy: str):
        # Validate first: the two downloads below are wasted work if the argument is wrong.
        if taxonomy not in self.ALLOWED_TAXONOMIES:
            raise ValueError(
                f"Taxonomy {taxonomy} is not supported. Please use one of the following taxonomies: {self.ALLOWED_TAXONOMIES}")
        self.taxonomy = taxonomy
        self.requester_company = requester_company
        self.requester_name = requester_name
        self.requester_email = requester_email
        self.sec_headers = {"User-Agent": f"{requester_company} {requester_name} {requester_email}",
                        "Accept-Encoding": "gzip, deflate",
                        "Host": "www.sec.gov"}
        self.sec_data_headers = {"User-Agent": f"{requester_company} {requester_name} {requester_email}",
                        "Accept-Encoding": "gzip, deflate",
                        "Host": "data.sec.gov"}
        # Both calls below hit the network.
        self.cik = self.get_cik_list()
        self.tags = self.get_usgaap_tags()


    @sleep_and_retry
    @limits(calls=10, period=1)
    def rate_limited_request(self, url: str, headers: dict):
        """Rate limited request to SEC Edgar database (at most 10 calls per 1-second period).

        Args:
            url (str): URL to retrieve data from
            headers (dict): Headers to be used for API calls

        Returns:
            response: Response from API call. Non-200 responses are logged, not raised.
        """
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            logging.error(f'''
Request failed at URL: {url}''')
        return response


    def _fetch_json(self, url: str, headers: dict, description: str):
        """GET ``url`` and decode the JSON body, raising when the status code is not 200."""
        response = self.rate_limited_request(url, headers=headers)
        if response.status_code != 200:
            raise Exception(
                f"Failed to retrieve {description}. Status code: {response.status_code}")
        return json.loads(response.text)


    def get_cik_list(self):
        """Retrieves the full list of CIK available from SEC database.

        Raises:
            Exception: On failure to retrieve CIK list (non-200 response)

        Returns:
            cik_df: DataFrame containing CIK and ticker
        """
        url = r"https://www.sec.gov/files/company_tickers.json"
        cik_json = self._fetch_json(url, self.sec_headers, "CIK list")
        cik_df = pd.DataFrame.from_dict(cik_json).T
        return cik_df


    def get_ticker_cik(self, ticker: str,):
        """Get a specific ticker's CIK number.
        CIK########## is the entity's 10-digit Central Index Key (CIK).

        Args:
            ticker (str): public ticker symbol of the company (exact match on the
                'ticker' column of self.cik)

        Raises:
            IndexError: If the ticker is not present in self.cik

        Returns:
            cik: CIK number of the company, zero-padded to 10 digits, excluding the leading 'CIK'
        """
        # Boolean mask instead of a query string, so tickers containing quotes cannot
        # break (or alter) the query expression.
        matches = self.cik.loc[self.cik['ticker'] == ticker, 'cik_str']
        if matches.empty:
            raise IndexError(f"Ticker {ticker!r} not found in the CIK list")
        return f"{int(matches.iloc[0]):010d}"


    def get_usgaap_tags(self, xsd_url: str = US_GAAP_TAXONOMY_URL):
        """Get the list of tags (elements) in us-gaap taxonomy or provide a different xsd_url to get tags from a different taxonomy.

        Returns:
            list of tags (the ``name`` attribute of every xs:element that has one)
        """
        # The taxonomy is not served from www.sec.gov, so drop the fixed Host header and
        # let requests derive it from the URL.
        headers = {key: value for key, value in self.sec_headers.items() if key.lower() != 'host'}
        response = self.rate_limited_request(xsd_url, headers=headers)
        root = ET.fromstring(response.text)

        elements = root.findall(".//{http://www.w3.org/2001/XMLSchema}element")
        # Elements declared by reference carry no 'name' attribute; skip them.
        return [element.attrib['name'] for element in elements if 'name' in element.attrib]


    def get_submissions(self, cik):
        """Retrieves the submissions JSON for a CIK.

        Args:
            cik (str): 10-digit CIK number of the company

        Raises:
            Exception: On a non-200 response

        Returns:
            data: decoded JSON response
        """
        url = f"{self.BASE_API_URL}submissions/CIK{cik}.json"
        return self._fetch_json(url, self.sec_data_headers, f"submissions for CIK {cik}")


    def get_company_concept(self, cik: str, tag: str, taxonomy: str = 'us-gaap',):
        """The company-concept API returns all the XBRL disclosures from a single company (CIK)
        and concept (a taxonomy and tag) into a single JSON file, with a separate array of facts
        for each units on measure that the company has chosen to disclose
        (e.g. net profits reported in U.S. dollars and in Canadian dollars).

        Args:
            cik (str): CIK number of the company. Get the list using self.cik
            taxonomy (str): us-gaap, ifrs-full, dei, or srt
            tag (str): taxonomy tag (e.g. Revenue, AccountsPayableCurrent). See full list from https://xbrl.fasb.org/us-gaap/2023/elts/us-gaap-2023.xsd

        Raises:
            Exception: On failure to retrieve company concept either due to invalid CIK, taxonomy, or tag

        Returns:
            data: JSON file containing all the XBRL disclosures from a single company (CIK)
        """
        url = f"{self.BASE_API_URL}api/xbrl/companyconcept/CIK{cik}/{taxonomy}/{tag}.json"
        return self._fetch_json(url, self.sec_data_headers,
                                f"company concept {taxonomy}/{tag} for CIK {cik}")


    def get_company_facts(self, cik):
        """Retrieves all XBRL facts of a single company (CIK).

        Args:
            cik (str): 10-digit CIK number of the company

        Raises:
            Exception: On a non-200 response

        Returns:
            data: decoded JSON response
        """
        url = f"{self.BASE_API_URL}api/xbrl/companyfacts/CIK{cik}.json"
        return self._fetch_json(url, self.sec_data_headers, f"company facts for CIK {cik}")


    def get_frames(self, taxonomy, tag, unit, period):
        """The xbrl/frames API aggregates one fact for each reporting entity that is last filed that most closely fits the calendrical period requested.
        This API supports for annual, quarterly and instantaneous data: https://data.sec.gov/api/xbrl/frames/us-gaap/AccountsPayableCurrent/USD/CY2019Q1I.json

        Args:
            taxonomy (str): us-gaap, ifrs-full, dei, or srt
            tag (str): taxonomy tag (e.g. Revenue, AccountsPayableCurrent). See full list from https://xbrl.fasb.org/us-gaap/2023/elts/us-gaap-2023.xsd
            unit (str): USD, USD-per-shares, etc.
            period (str): CY#### for annual data (duration 365 days +/- 30 days), CY####Q# for quarterly data (duration 91 days +/- 30 days), CY####Q#I for instantaneous data

        Raises:
            Exception: On a non-200 response

        Returns:
            data: json formatted response
        """
        url = f"{self.BASE_API_URL}api/xbrl/frames/{taxonomy}/{tag}/{unit}/{period}.json"
        return self._fetch_json(url, self.sec_data_headers,
                                f"frames {taxonomy}/{tag}/{unit}/{period}")

    def get_data_as_dataframe(self, cik: str,):
        """Retrieves the XBRL disclosures from a single company (CIK) and returns it as a pandas dataframe.

        Only facts of ``self.taxonomy`` are included, and for each tag only the first unit
        of measure listed in the response is used.

        Args:
            cik (str): CIK number of the company. Get the list using self.cik

        Returns:
            df: pandas dataframe containing the XBRL disclosures from a single company (CIK),
            with a 'label' column holding the tag and a 'Months Ended' column derived from
            end - start.
        """
        data = self.get_company_facts(cik)

        # Read from the same taxonomy that is iterated (the original indexed a hard-coded
        # 'us-gaap' here, which breaks for every other taxonomy).
        frames = []
        for tag, concept in data['facts'][self.taxonomy].items():
            units = concept['units']
            if not units:
                continue
            unit_key = next(iter(units))
            tag_df = pd.DataFrame(units[unit_key])
            tag_df['label'] = tag
            frames.append(tag_df)

        # Concatenate once instead of growing a frame inside the loop.
        df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

        # Instant-only facts carry no 'start'; make sure every typed column exists.
        fill_values = {'val': float('nan'), 'end': pd.NaT, 'start': pd.NaT, 'filed': pd.NaT}
        for column, fill in fill_values.items():
            if column not in df.columns:
                df[column] = fill

        df = df.astype({'val': 'float64',
                        'end': 'datetime64[ns]',
                        'start': 'datetime64[ns]',
                        'filed': 'datetime64[ns]'})
        # 30.4375 = average days per month (365.25 / 12)
        df['Months Ended'] = (df['end'] - df['start']).dt.days.div(30.4375).round(0)
        return df


    def get_index(self, cik: str = None,) -> dict:
        """Each CIK directory and all child subdirectories contain three files to assist in
        automated crawling of these directories.
        These are not visible through directory browsing.
            - index.html (the web browser would normally receive these)
            - index.xml (a XML structured version of the same content)
            - index.json (a JSON structured vision of the same content)

        Args:
            cik (str): CIK number of the company. Get the list using self.cik.
                When omitted, ``self.cik`` is used, which only works if it is a single
                CIK string (as in subclasses that overwrite it).

        Raises:
            ValueError: If no cik is given and ``self.cik`` is not a string (on a plain
                SECData instance it is the full CIK DataFrame).

        Returns:
            dict: decoded index.json of the CIK directory
        """
        if cik is None:
            if not isinstance(self.cik, str):
                raise ValueError(
                    "cik must be provided: self.cik is the full CIK table, not a single CIK.")
            cik = self.cik

        url = self.BASE_DIRECTORY_URL + cik + '/' + 'index.json'
        response = self.rate_limited_request(url, headers=self.sec_headers)
        return response.json()

class TickerData(SECData):
    """Inherited from SECData class. Retrieves data from SEC Edgar database based on ticker.
    url is constructed based on the following: https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{file_name}
    cik is the CIK number of the company = access via get_ticker_cik
    accession_number is the accessionNumber column of filings_df
    file name for xml is always '{ticker}-{reportDate}.{extension}

    Note: on this class ``self.cik`` is the ticker's 10-digit CIK string; it replaces the
    CIK DataFrame that SECData.__init__ stores. Construction performs several network
    requests (CIK list, taxonomy tags, submissions, directory index).
    """

    def __init__(self, ticker:str, requester_company: str = 'Financial API', requester_name: str = 'API Caller', requester_email: str = 'apicaller@gmail.com', taxonomy: str ='us-gaap',):
        super().__init__(requester_company, requester_name, requester_email, taxonomy)
        self.ticker = ticker.upper()
        # Must run while self.cik is still the DataFrame set by the parent constructor.
        self.cik = self.get_ticker_cik(self.ticker)
        self._submissions = None
        self._filings = None
        self.forms = self.filings['form'].unique()
        self._index = self.get_index(self.cik)
        self._filing_folder_urls = None
        self._filing_urls = None

    @staticmethod
    def _join_url(base: str, *parts: str) -> str:
        """Join URL pieces with exactly one '/' between them, skipping empty pieces."""
        joined = base.rstrip('/')
        for part in parts:
            piece = part.strip('/')
            if piece:
                joined += '/' + piece
        return joined

    @property
    def submissions(self,) -> dict:
        """Submissions JSON for this CIK, fetched on first access and cached."""
        if self._submissions is None:
            self._submissions = self.get_submissions(self.cik)
        return self._submissions

    @property
    def filings(self,) -> pd.DataFrame:
        """Recent filings as a DataFrame, built on first access and cached."""
        if self._filings is None:
            self._filings = self.get_filings(self.submissions)
        return self._filings

    @property
    def filing_folder_urls(self,) -> list:
        """Filing folder URLs from the directory index, built on first access and cached."""
        if self._filing_folder_urls is None:
            self._filing_folder_urls = self.get_filing_folder_urls()
        return self._filing_folder_urls

    @property
    def filing_urls(self,) -> list:
        """URLs of the .txt filings; the first access requests every filing folder."""
        if self._filing_urls is None:
            self._filing_urls = self.get_filing_urls()

        return self._filing_urls


    def get_filing_folder_urls(self,) -> list:
        """Get filing folder urls from the cached directory index (self._index).

        Returns:
            filing_folder_urls (list): list of filing folder urls, one per index item
            whose type is 'folder.gif'
        """
        directory = self._index['directory']
        # The directory name may carry leading/trailing slashes; _join_url avoids the
        # doubled slashes that plain concatenation would produce.
        return [
            self._join_url(self.BASE_SEC_URL, directory['name'], folder['name'])
            for folder in directory['item']
            if folder['type'] == 'folder.gif'
        ]


    def get_filing_urls(self,) -> list:
        """Get filing urls by scanning every filing folder page for links ending in '.txt'.

        Folders that fail to load or parse are logged and skipped.

        Returns:
            filing_urls (list): list of filing urls to .txt files
        """
        filing_urls = []
        with trange(len(self.filing_folder_urls), desc=f'Instantiating filing urls for {self.ticker}...') as t:
            for i in t:
                logging.info(t)
                try:
                    soup = self.get_file_data(self.filing_folder_urls[i])
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        # Anchors without an href would otherwise raise and abort the
                        # remaining links of this folder.
                        if href and href.endswith('.txt'):
                            filing_urls.append(self._join_url(self.BASE_SEC_URL, href))
                except Exception as e:
                    logging.error(f'Failed to instantiate filing urls for {self.ticker}...')
                    logging.error(e)
                    t.write(f'Failed to instantiate filing urls for {self.ticker}...')
                    continue
        return filing_urls


    def get_filings(self, submissions: dict):
        """Get filings from submissions dict.

        Args:
            submissions (dict): submissions dict from get_submissions method

        Returns:
            filings (DataFrame): DataFrame of submissions['filings']['recent'] with the
            date columns converted to datetime and a 'file_url' column pointing at each
            filing's .txt file
        """
        filings = pd.DataFrame(submissions['filings']['recent'])

        # Convert reportDate, filingDate, acceptanceDateTime columns to datetime
        for column in ('reportDate', 'filingDate', 'acceptanceDateTime'):
            filings[column] = pd.to_datetime(filings[column])

        # Folder name is the accession number without dashes; the file keeps the dashes.
        accession = filings['accessionNumber']
        filings['file_url'] = self.BASE_DIRECTORY_URL + self.cik + '/' + accession.str.replace('-', '') + '/' + accession + '.txt'
        return filings


    def get_file_data(self, file_url: str) -> BeautifulSoup:
        """Download a file from the SEC website and parse it.

        Args:
            file_url (str): File url to retrieve data from on the SEC website
                (e.g. a value of the filings 'file_url' column)

        Returns:
            data: File data as a BeautifulSoup object (lxml parser)
        """
        data = self.rate_limited_request(url=file_url, headers=self.sec_headers)
        soup = BeautifulSoup(data.content, "lxml")
        return soup

    def search_tags(self, soup: BeautifulSoup, pattern):
        """Search for tags in BeautifulSoup object.

        Args:
            soup (BeautifulSoup): BeautifulSoup object
            pattern (str | re.Pattern): passed straight to ``soup.find_all``. A plain
                string matches that exact tag name; pass a compiled regex to match tag
                names by pattern.

        Returns:
            The result of ``soup.find_all(pattern)`` (a list-like of matching tags)
        """
        return soup.find_all(pattern)

    def search_context(self, soup: BeautifulSoup) -> pd.DataFrame:
        """Search for context in company .txt filing.
        Context provides information about the entity, segment, and time period for facts in the filing.

        Args:
            soup (BeautifulSoup): BeautifulSoup object

        Returns:
            df: DataFrame containing context information with columns
            {
                'contextId': str,
                'entity': str,
                'segment': str,
                'startDate': 'datetime64[ns]',
                'endDate': 'datetime64[ns]',
                'instant': 'datetime64[ns]'
            }
        """
        import re  # local import: this cell's import block does not bring `re` into scope

        # find_all treats a plain string as a literal tag name, so the anchored pattern
        # has to be compiled for '^context$' to match <context> tags at all.
        contexts = self.search_tags(soup, re.compile('^context$'))
        columns = {'contextId': str, 'entity': str, 'segment': str, 'startDate': 'datetime64[ns]', 'endDate': 'datetime64[ns]', 'instant': 'datetime64[ns]'}

        def child_text(tag, name):
            """Text of the first descendant called ``name``, or None when absent."""
            node = tag.find(name)
            return None if node is None else node.text

        dict_list = []
        for tag in contexts:
            entity_tokens = (child_text(tag, "entity") or '').split()
            segment_text = child_text(tag, "segment")
            dict_list.append({
                'contextId': tag.attrs['id'],
                # First whitespace-separated token of the entity text.
                'entity': entity_tokens[0] if entity_tokens else None,
                'segment': segment_text.strip() if segment_text is not None else None,
                # lxml lower-cases tag names, hence 'startdate' / 'enddate'.
                'startDate': child_text(tag, "startdate"),
                'endDate': child_text(tag, "enddate"),
                'instant': child_text(tag, "instant"),
            })

        df = pd.DataFrame(dict_list, columns=columns.keys()).astype(columns)
        return df

# Get facts as DataFrame

In [None]:
# Build a facts DataFrame for MSFT.
# NOTE(review): `sec` is never defined in this notebook (presumably an SECData instance
# created in a since-removed cell), so this cell fails on a fresh kernel.
cik  = sec.get_ticker_cik('MSFT')
companyfacts_df = sec.get_data_as_dataframe(cik)

# Get XSD tags for quarterly filings

In [32]:
# One TickerData per company. Each construction performs network requests: the parent
# constructor downloads the CIK list and the taxonomy tags, and TickerData.__init__ then
# fetches the submissions (via self.filings) and the directory index.
aapl = TickerData(requester_company='Financial Docs', requester_name='John Doe', requester_email='financial@gmail.com', taxonomy='us-gaap', ticker='aapl')
msft = TickerData(requester_company='Financial Docs', requester_name='John Doe', requester_email='financial@gmail.com', taxonomy='us-gaap', ticker='MSFT')
nvda = TickerData(requester_company='Financial Docs', requester_name='John Doe', requester_email='financial@gmail.com', taxonomy='us-gaap', ticker='NVDA')

In [None]:
# Download every AAPL .txt filing and show the detected form type on the progress bar.
# The first access to aapl.filing_urls itself requests every filing folder.
# Relies on `re` imported in the first import cell of the notebook.
with trange(len(aapl.filing_urls),) as t:
    for i in t:
        t.set_description(f'Iteration {i}')
        try:
            soup = aapl.get_file_data(aapl.filing_urls[i])
            # First tag whose name contains 'type'; its first whitespace-separated token is taken as the form type.
            form_type = soup.find(re.compile('type')).text.split()[0] if soup.find(re.compile('type')) is not None else None
            t.set_postfix(form_type=form_type, request_url=aapl.filing_urls[i])
            logging.info(t)
        except Exception as e:
            # The same failure is reported three ways: stdout, the log file, and the progress bar.
            print(f'Error processing filing URL {aapl.filing_urls[i]}: {e}')
            logging.error(f'Error processing filing URL {aapl.filing_urls[i]}: {e}')
            t.write(f'Error processing filing URL {aapl.filing_urls[i]}: {e}')
            continue

## Search for tags

In [None]:
# Find reporting time frame of reportDate
# Downloads and parses the .txt file of the first row of msft.filings.
soup = msft.get_file_data(msft.filings.iloc[0]['file_url'])

In [None]:
# Print every us-gaap tag whose name contains 'revenue' and that has attributes.
# NOTE(review): TickerData as defined in this notebook has no `tag_search` method (only
# `search_tags`, which returns tags rather than dicts with 'tag name' / 'attributes' /
# 'text' keys), so this cell raises AttributeError against the current class.
msft_us_gaap = msft.tag_search(soup, 'us-gaap')
for i,x in enumerate(msft_us_gaap):
    if x['tag name'].find('revenue') != -1 and len(x['attributes']) > 0:
        print(x['tag name'])
        print(x['attributes'])
        print(x['text'])

## Search Link Labels

In [None]:
# Collect link:label entries and keep those whose xlink:label mentions the revenue concept.
# NOTE(review): `tag_search` is not defined on TickerData in this notebook; see `search_tags`.
msft_label_links = msft.tag_search(soup, '^link:label$')

# The concept name is compared with ':' replaced by '_' (i.e. 'us-gaap_revenue...').
[i for i in msft_label_links if i['attributes']['xlink:label'].lower().find('us-gaap:revenuefromcontractwithcustomerexcludingassessedtax'.replace(':','_')) != -1]