# Test SEC API

In [13]:
import logging

class MyLogger:
    def __init__(self, name: str = __name__, level: str = 'debug', log_file: str = 'logs.log'):
        # Initialize logger
        self.logging_level = logging.DEBUG if level == 'debug' else logging.INFO
        self.scrape_logger = logging.getLogger(name)
        self.scrape_logger.setLevel(self.logging_level)

        # Check if the self.scrape_logger already has handlers to avoid duplicate logging.
        if not self.scrape_logger.hasHandlers():
            # Create a file handler
            file_handler = logging.FileHandler(log_file, mode='a')
            file_handler.setLevel(self.logging_level)

            # Create a stream handler
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(self.logging_level)

            # Create a logging format
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)
            stream_handler.setFormatter(formatter)

            # Add the handlers to the self.scrape_logger
            self.scrape_logger.addHandler(file_handler)
            self.scrape_logger.addHandler(stream_handler)


In [15]:
# Built-in libraries
import logging
import requests
import json
import re
from abc import ABC, abstractmethod
from typing import List

# Third-party libraries
import pandas as pd
import xml.etree.ElementTree as ET
from ratelimit import limits, sleep_and_retry
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm import trange



def convert_keys_to_lowercase(d):
    """Recursively convert all keys in a dictionary to lowercase.

    Args:
        d (dict): Dictionary to convert

    Returns:
        dict: Dictionary with all keys converted to lowercase
    """
    new_dict = {}
    for k, v in d.items():
        if isinstance(v, dict):
            v = convert_keys_to_lowercase(v)
        new_key = re.sub(r'[^a-zA-Z0-9]', '', k.lower())
        new_dict[new_key] = v
    return new_dict


def indexify_url(folder_url: str) -> str:
    """Converts url to index url.

    Args:
        url (str): url to convert to index url

    Returns:
        str: index url
    """
    return folder_url + '/index.json'


class SearchStrategy(ABC):
    @abstractmethod
    def get_pattern(self) -> str:
        pass


class ContextSearchStrategy(SearchStrategy):
    def get_pattern(self) -> str:
        return '^context$'


class LinkLabelSearchStrategy(SearchStrategy):
    def get_pattern(self) -> str:
        return '^link:label$'


class FactSearchStrategy(SearchStrategy):
    def get_pattern(self) -> str:
        return '^us-gaap:'


class SECData(MyLogger):
    """Class to retrieve data from SEC Edgar database.

    Args:
        requester_name (str): Name of the requester
        requester_email (str): Email of the requester
        taxonomy (str): us-gaap, ifrs-full, dei, or srt

    Raises:
        Exception: If taxonomy is not one of the following: us-gaap, ifrs-full, dei, or srt

    Attributes:
        BASE_API_URL (str): Base url for SEC Edgar database
        US_GAAP_TAXONOMY_URL (str): URL for us-gaap taxonomy
        ALLOWED_TAXONOMIES (list): List of allowed taxonomies
        headers (dict): Headers to be used for API calls
        cik (DataFrame): DataFrame containing CIK and ticker
        tags (list): List of tags in us-gaap taxonomy
        taxonomy (str): us-gaap, ifrs-full, dei, or srt

    Methods:
        get_cik_list: Retrieves the full list of CIK available from SEC database.
        get_ticker_cik: Get a specific ticker's CIK number. 
        get_usgaap_tags: Get the list of tags in us-gaap taxonomy.
        get_submissions: Retrieves the list of submissions for a specific CIK.
        get_company_concept: Retrieves the XBRL disclosures from a single company (CIK) 
            and concept (a taxonomy and tag) into a single JSON file.
        get_company_facts: Retrieves the XBRL disclosures from a single company (CIK) 
            into a single JSON file.
        get_frames: Retrieves one fact for each reporting entity that is last filed that most closely fits the calendrical period requested.
    """

    BASE_API_URL = "https://data.sec.gov/"
    BASE_SEC_URL = "https://www.sec.gov/"
    BASE_DIRECTORY_URL = "https://www.sec.gov/Archives/edgar/data/"
    SIC_LIST_URL = "https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list"
    US_GAAP_TAXONOMY_URL = "http://xbrl.fasb.org/us-gaap/2024/elts/us-gaap-2024.xsd"
    SRT_TAXONOMY_URL = "http://xbrl.fasb.org/srt/2024/elts/srt-std-2024.xsd"
    ALLOWED_TAXONOMIES = {'us-gaap', 'ifrs-full', 'dei', 'srt'}
    INDEX_EXTENSION = {'-index.html', '-index-headers.html'}
    DIRECTORY_INDEX = {'index.json', 'index.xml', 'index.html'}
    FILE_EXTENSIONS = {'.xsd', '.htm', '_cal.xml',
                       '_def.xml', '_lab.xml', '_pre.xml', '_htm.xml', '.xml'}

    SCRAPE_FILE_EXTENSIONS = {'_lab', '_def', '_pre', '_cal'}

    def __init__(self, requester_company: str = 'Financial API', requester_name: str = 'API Caller', requester_email: str = 'apicaller@gmail.com', taxonomy: str = 'us-gaap',):
        super().__init__(name='sec-scraper', level='debug', log_file='././logs.log')

        self.requester_company = requester_company
        self.requester_name = requester_name
        self.requester_email = requester_email
        self.sec_headers = {"User-Agent": f"{requester_company} {requester_name} {requester_email}",
                            "Accept-Encoding": "gzip, deflate",
                            "Host": "www.sec.gov"}
        self.sec_data_headers = {"User-Agent": f"{requester_company} {requester_name} {requester_email}",
                                 "Accept-Encoding": "gzip, deflate",
                                 "Host": "data.sec.gov"}
        self._cik_list = None
        self._us_gaap_tags = None
        self._srt_tags = None
        if taxonomy not in self.ALLOWED_TAXONOMIES:
            raise ValueError(
                f"Taxonomy {taxonomy} is not supported. Please use one of the following taxonomies: {self.ALLOWED_TAXONOMIES}")
        self.taxonomy = taxonomy

    @property
    def cik_list(self,):
        if self._cik_list is None:
            self._cik_list = self.get_cik_list()
        return self._cik_list

    @property
    def us_gaap_tags(self,):
        if self._us_gaap_tags is None:
            self._us_gaap_tags = self.get_tags(
                xsd_url=self.US_GAAP_TAXONOMY_URL)
            self.us_gaap_tags['id'] = self.us_gaap_tags['id'].str.split(
                '_', n=1).str.join(':').str.lower()
        return self._us_gaap_tags

    @property
    def srt_tags(self,):
        if self._srt_tags is None:
            self._srt_tags = self.get_tags(xsd_url=self.SRT_TAXONOMY_URL)
        return self._srt_tags

    @sleep_and_retry
    @limits(calls=10, period=1)
    def rate_limited_request(self, url: str, headers: dict):
        """Rate limited request to SEC Edgar database.

        Args:
            url (str): URL to retrieve data from
            headers (dict): Headers to be used for API calls

        Returns:
            response: Response from API call
        """
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            self.scrape_logger.error(f'''Request failed at URL: {url}''')
        else:
            self.scrape_logger.info(f'''Request successful at URL: {url}''')
        return response

    def get_cik_list(self):
        """Retrieves the full list of CIK available from SEC database.

        Raises:
            Exception: On failure to retrieve CIK list

        Returns:
            cik_df: DataFrame containing CIK and ticker
        """
        url = r"https://www.sec.gov/files/company_tickers.json"
        cik_raw = self.rate_limited_request(url, self.sec_headers)
        cik_json = cik_raw.json()
        cik_df = pd.DataFrame.from_dict(cik_json).T
        return cik_df

    def get_ticker_cik(self, ticker: str,):
        """Get a specific ticker's CIK number. 
        CIK########## is the entity's 10-digit Central Index Key (CIK).

        Args:
            ticker (str): public ticker symbol of the company

        Returns:
            cik: CIK number of the company excluding the leading 'CIK'
        """
        ticker_cik = self.cik_list.query(
            f"ticker == '{ticker.upper()}'")['cik_str']
        cik = f"{ticker_cik.iloc[0]:010d}"
        return cik

    def get_tags(self, xsd_url: str = US_GAAP_TAXONOMY_URL):
        """Get the list of tags (elements) in us-gaap taxonomy or provide a different xsd_url to get tags from a different taxonomy.

        Returns:
            list of tags
        """
        url = requests.get(xsd_url).content
        us_gaap_df = pd.DataFrame(
            [element.attrs for element in BeautifulSoup(url, 'lxml').find_all('xs:element')])

        return us_gaap_df

    def get_submissions(self, cik: str = None, submission_file: str = None) -> dict:
        if cik is not None:
            url = f"{self.BASE_API_URL}submissions/CIK{cik}.json"
        elif submission_file is not None:
            url = f"{self.BASE_API_URL}submissions/{submission_file}"
        else:
            raise Exception(
                "Please provide either a CIK number or a submission file.")
        response = self.rate_limited_request(
            url, headers=self.sec_data_headers)
        if response.status_code != 200:
            raise Exception(
                f"Failed to retrieve submissions. Status code: {response.status_code}")
        data = json.loads(response.text)
        return data

    def get_company_concept(self, cik: str, tag: str, taxonomy: str = 'us-gaap',):
        """The company-concept API returns all the XBRL disclosures from a single company (CIK) 
        and concept (a taxonomy and tag) into a single JSON file, with a separate array of facts 
        for each units on measure that the company has chosen to disclose 
        (e.g. net profits reported in U.S. dollars and in Canadian dollars).

        Args:
            cik (str): CIK number of the company. Get the list using self.cik
            taxonomy (str): us-gaap, ifrs-full, dei, or srt
            tag (str): taxonomy tag (e.g. Revenue, AccountsPayableCurrent). See full list from https://xbrl.fasb.org/us-gaap/2023/elts/us-gaap-2023.xsd

        Raises:
            Exception: On failure to retrieve company concept either due to invalid CIK, taxonomy, or tag

        Returns:
            data: JSON file containing all the XBRL disclosures from a single company (CIK)
        """
        url = f"{self.BASE_API_URL}api/xbrl/companyconcept/CIK{cik}/{taxonomy}/{tag}.json"
        response = self.rate_limited_request(
            url, headers=self.sec_data_headers)
        data = json.loads(response.text)
        return data

    def get_company_facts(self, cik):
        url = f"{self.BASE_API_URL}api/xbrl/companyfacts/CIK{cik}.json"
        response = self.rate_limited_request(
            url, headers=self.sec_data_headers)
        if response.status_code != 200:
            raise Exception(
                f"Failed to retrieve company facts for CIK {cik}. Status code: {response.status_code}")
        data = json.loads(response.text)
        return data

    def get_frames(self, taxonomy, tag, unit, period):
        """The xbrl/frames API aggregates one fact for each reporting entity that is last filed that most closely fits the calendrical period requested. 
        This API supports for annual, quarterly and instantaneous data: https://data.sec.gov/api/xbrl/frames/us-gaap/AccountsPayableCurrent/USD/CY2019Q1I.json

        Args:
            taxonomy (str): us-gaap, ifrs-full, dei, or srt
            tag (str): taxonomy tag (e.g. Revenue, AccountsPayableCurrent). See full list from https://xbrl.fasb.org/us-gaap/2023/elts/us-gaap-2023.xsd
            unit (str): USD, USD-per-shares, etc.
            period (str): CY#### for annual data (duration 365 days +/- 30 days), CY####Q# for quarterly data (duration 91 days +/- 30 days), CY####Q#I for instantaneous data

        Raises:
            Exception: (placeholder)

        Returns:
            data: json formatted response
        """
        url = f"{self.BASE_API_URL}api/xbrl/frames/{taxonomy}/{tag}/{unit}/{period}.json"
        response = self.rate_limited_request(
            url, headers=self.sec_data_headers)
        data = json.loads(response.text)
        return data

    def get_data_as_dataframe(self, cik: str,):
        """Retrieves the XBRL disclosures from a single company (CIK) and returns it as a pandas dataframe.

        Args:
            cik (str): CIK number of the company. Get the list using self.cik

        Returns:
            df: pandas dataframe containing the XBRL disclosures from a single company (CIK)
        """
        data = self.get_company_facts(cik)

        df = pd.DataFrame()

        for tag in data['facts'][self.taxonomy]:
            facts = data['facts']['us-gaap'][tag]['units']
            unit_key = list(facts.keys())[0]
            temp_df = pd.DataFrame(facts[unit_key])
            temp_df['label'] = tag
            df = pd.concat([df, temp_df], axis=0, ignore_index=True)
        df = df.astype({'val': 'float64',
                        'end': 'datetime64[ns]',
                        'start': 'datetime64[ns]',
                        'filed': 'datetime64[ns]'})
        df['Months Ended'] = (df['end'] - df['start']
                              ).dt.days.div(30.4375).round(0)
        return df

    def get_cik_index(self, cik: str = None,) -> dict:
        """Each CIK directory and all child subdirectories contain three files to assist in 
        automated crawling of these directories. 
        These are not visible through directory browsing.
            - index.html (the web browser would normally receive these)
            - index.xml (a XML structured version of the same content)
            - index.json (a JSON structured vision of the same content)

        Args:
            cik (str): CIK number of the company. Get the list using self.cik

        Returns:
            json: pandas dataframe containing the XBRL disclosures from a single company (CIK)
        """
        if cik is not None:
            url = self.BASE_DIRECTORY_URL + cik + '/' + 'index.json'

        else:
            url = self.BASE_DIRECTORY_URL + self.cik + '/' + 'index.json'

        response = self.rate_limited_request(url, headers=self.sec_headers)
        return response.json()

    def get_sic_list(self, sic_list_url: str = SIC_LIST_URL) -> dict:
        """Get the list of SIC codes from SEC website.

        Args:
            sic_list_url (str): URL to the list of SIC codes

        Returns:
            pd.DataFrame: DataFrame containing the SIC codes and descriptions
        """
        response = self.rate_limited_request(
            sic_list_url, headers=self.sec_headers)

        soup = BeautifulSoup(response.content, "lxml")
        sic_table = soup.find('table', {'class': 'list'})
        sic_list = []
        for row in sic_table.find_all('tr')[1:]:
            sic_dict = {'_id': None,
                        'Office': None, 'Industry Title': None}
            sic_dict['_id'] = row.text.split('\n')[1]
            sic_dict['Office'] = row.text.split('\n')[2]
            sic_dict['Industry Title'] = row.text.split('\n')[3]
            sic_list.append(sic_dict)

        return sic_list


class TickerData(SECData):
    """Inherited from SECData class. Retrieves data from SEC Edgar database based on ticker.
    url is constructed based on the following: https://www.sec.gov/Archives/edgar/data/{cik}/{ascension_number}/{file_name}
    cik is the CIK number of the company = access via get_ticker_cik
    ascension_number is the accessionNumber column of filings_df
    file name for xml is always '{ticker}-{reportDate}.{extension}
    """

    def __init__(self, ticker: str, requester_company: str = 'Financial API', requester_name: str = 'API Caller', requester_email: str = 'apicaller@gmail.com', taxonomy: str = 'us-gaap', search_strategy: SearchStrategy = None):
        super().__init__(requester_company, requester_name, requester_email, taxonomy,)
        self.search_strategy = search_strategy
        self.ticker = ticker.upper()
        self.cik = self.get_ticker_cik(self.ticker)
        self._submissions = self.get_submissions(self.cik)
        self._filings = None
        self._forms = None
        self._index = self.get_cik_index(self.cik)
        self._filing_folder_urls = None
        self._filing_urls = None

    @property
    def submissions(self,) -> dict:
        if self._submissions is not None:
            self._submissions['cik'] = self.cik
            self._submissions['filings'] = self.filings.replace(
                {pd.NaT: None}).to_dict('records')
        return self._submissions

    @property
    def filings(self,) -> pd.DataFrame:
        if self._filings is None:
            self._filings = self.get_filings()
        return self._filings

    @property
    def latest_filing(self,) -> pd.DataFrame:
        return self.filings.iloc[0, :].to_dict() if len(self.filings) > 0 else None

    @property
    def latest_10Q(self,) -> pd.DataFrame:
        return self.filings.query("form == '10-Q'").iloc[0, :].to_dict() if len(self.filings.query("form == '10-Q'")) > 0 else None

    @property
    def latest_10K(self,) -> pd.DataFrame:
        return self.filings.query("form == '10-K'").iloc[0, :].to_dict() if len(self.filings.query("form == '10-K'")) > 0 else None

    @property
    def latest_8K(self,) -> pd.DataFrame:
        return self.filings.query("form == '8-K'").iloc[0, :].to_dict() if len(self.filings.query("form == '8-K'")) > 0 else None

    @property
    def filing_folder_urls(self,) -> list:
        if self._filing_folder_urls is None:
            self._filing_folder_urls = self._get_filing_folder_urls()
        return self._filing_folder_urls

    @property
    def filing_urls(self,) -> list:
        if self._filing_urls is None:
            self._filing_urls = self.filings['file_url'].tolist()

        return self._filing_urls

    @property
    def forms(self,) -> list:
        if self._forms is None:
            self._forms = self.filings['form'].unique()
        return self._forms

    def set_search_strategy(self, search_strategy: SearchStrategy):
        self.search_strategy = search_strategy

    def _get_filing_folder_urls(self,) -> list:
        """Get filing folder urls from index dict.

        Args:
            index (dict): index dict from get_index method

        Returns:s
            filing_folder_urls (list): list of filing folder urls
        """

        filing_folder_urls = [self.BASE_SEC_URL + self._index['directory']['name'] + '/' + folder['name']
                              for folder in self._index['directory']['item'] if folder['type'] == 'folder.gif']
        return filing_folder_urls

    def get_filing_folder_index(self, folder_url: str, return_df: bool = True):
        """Get filing folder index from folder url.

        Args:
            folder_url (str): folder url to retrieve data from
            return_df (bool, optional): Whether to return a DataFrame or dict. Defaults to True.

        Returns:
            index (dict): index dict or dataframe
        """
        index_url = indexify_url(folder_url)
        index = self.rate_limited_request(index_url, headers=self.sec_headers)
        return pd.DataFrame(index.json()['directory']['item']) if return_df else index.json()['directory']['item']

    def get_filings(self,) -> dict:
        """Get filings and urls to .txt from submissions dict.

        Args:
            submissions (dict): submissions dict from get_submissions method

        Returns:
            filings (dict): dictionary containing filings
        """
        self.scrape_logger.info(
            f'Making http request for {self.ticker} filings...')
        filings = self._submissions['filings']['recent']

        if len(self._submissions['filings']) > 1:
            self.scrape_logger.info(
                f'Additional filings found for {self.ticker}...')
            for file in self._submissions['filings']['files']:
                additional_filing = self.get_submissions(
                    submission_file=file['name'])
                filings = {key: filings[key] + additional_filing[key]
                           for key in filings.keys()}

        filings = pd.DataFrame(filings)
        # Convert reportDate, filingDate, acceptanceDateTime columns to datetime
        filings['reportDate'] = pd.to_datetime(filings['reportDate'])
        filings['filingDate'] = pd.to_datetime(filings['filingDate'])
        filings['acceptanceDateTime'] = pd.to_datetime(
            filings['acceptanceDateTime'])
        filings['cik'] = self.cik

        filings = filings.loc[~pd.isnull(filings['reportDate'])]

        # get folder url for each row
        filings['folder_url'] = self.BASE_DIRECTORY_URL + \
            self.cik + '/' + filings['accessionNumber'].str.replace('-', '')

        # get file url for each row
        filings['file_url'] = filings['folder_url'] + \
            '/' + filings['accessionNumber'] + '.txt'

        return filings

    def get_file_data(self, file_url: str) -> BeautifulSoup:
        """Get file data from file url which can be retrieved by calling self.get_file_url method.

        Args:
            file_url (str): File url to retrieve data from on the SEC website

        Returns:
            data: File data as a BeautifulSoup object
        """
        data = self.rate_limited_request(
            url=file_url, headers=self.sec_headers)
        try:
            soup = BeautifulSoup(data.content, "lxml")
            self.scrape_logger.info(
                f'Parsed file data from {file_url} successfully.')
            return soup

        except Exception as e:
            self.scrape_logger.error(
                f'Failed to parse file data from {file_url}. Error: {e}')
            raise Exception(
                f'Failed to parse file data from {file_url}. Error: {e}')

    def get_elements(self, folder_url: str, index_df: pd.DataFrame, scrape_file_extension: str) -> pd.DataFrame:
        """Get elements from .xml files from folder_url.

        Args:
            folder_url (str): folder url to retrieve data from
            index_df (pd.DataFrame): dataframe containing files in the filing folder
            scrape_file_extension (str): .xml file extension to scrape

        Returns:
            pd.DataFrame: returns a dataframe containing the elements, attributes, text
        """
        xml = index_df.query(f"name.str.contains('{scrape_file_extension}')")
        xml_content = self.rate_limited_request(
            folder_url + '/' + xml['name'].iloc[0], headers=self.sec_headers).content

        xml_soup = BeautifulSoup(xml_content, 'lxml-xml')
        labels = xml_soup.find_all()
        labels_list = []
        for i in labels[1:]:
            label_dict = dict(**i.attrs, labelText=i.text.strip())
            labels_list.append(label_dict)
        return pd.DataFrame(labels_list)

    def search_tags(self, soup: BeautifulSoup, pattern: str = None) -> List[Tag]:
        """Search for tags in BeautifulSoup object. Strategy can be set using self.set_search_strategy method.

        Args:
            soup (BeautifulSoup): BeautifulSoup object
            pattern (str): regex pattern to search for

        Returns:
            soup: BeautifulSoup object
        """
        if self.search_strategy is None and pattern is None:
            raise Exception('Search strategy not set and no pattern provided.')
        if pattern is None:
            pattern = self.search_strategy.get_pattern()
        return soup.find_all(re.compile(pattern))

    # To add more search methods, add a SearchStrategy abstract class with get_pattern method and add a method here
    def search_context(self, soup: BeautifulSoup) -> List[Tag]:
        self.set_search_strategy(ContextSearchStrategy())
        return self.search_tags(soup)

    def search_linklabels(self, soup: BeautifulSoup) -> List[Tag]:
        self.set_search_strategy(LinkLabelSearchStrategy())
        return self.search_tags(soup)

    def search_facts(self, soup: BeautifulSoup) -> List[Tag]:
        self.set_search_strategy(FactSearchStrategy())
        return self.search_tags(soup)

    def get_metalinks(self, metalinks_url: str) -> pd.DataFrame:
        """Get metalinks from metalinks url.

        Args:
            metalinks_url (str): metalinks url to retrieve data from

        Returns:
            df: DataFrame containing metalinks information with columns 
            {
                'labelKey': str,
                'localName': str,
                'labelName': int,
                'terseLabel': str,
                'documentation': str,
            }
        """
        try:
            response = self.rate_limited_request(
                url=metalinks_url, headers=self.sec_headers).json()
            metalinks_instance = convert_keys_to_lowercase(
                response['instance'])
            instance_key = list(metalinks_instance.keys())[0]
            dict_list = []
            for i in metalinks_instance[instance_key]['tag']:
                dict_list.append(dict(labelKey=i.lower(),
                                      localName=metalinks_instance[instance_key]['tag'][i].get(
                                          'localname'),
                                      labelName=metalinks_instance[instance_key]['tag'][i].get(
                                          'lang').get('enus').get('role').get('label'),
                                      terseLabel=metalinks_instance[instance_key]['tag'][i].get(
                                          'lang').get('enus').get('role').get('terselabel'),
                                      documentation=metalinks_instance[instance_key]['tag'][i].get('lang').get('enus').get('role').get('documentation'),))

            df = pd.DataFrame.from_dict(dict_list)
            return df
        except Exception as e:
            self.scrape_logger.error(
                f'Failed to retrieve metalinks from {metalinks_url}. Error: {e}')
            return None

    def __repr__(self) -> str:
        class_name = type(self).__name__
        main_attrs = ['ticker', 'cik', 'submissions', 'filings']
        available_methods = [method_name for method_name in dir(self) if callable(
            getattr(self, method_name)) and not method_name.startswith("_")]
        return f"""{class_name}({self.ticker})
    CIK: {self.cik}
    Latest filing: {self.latest_filing['filingDate'].strftime('%Y-%m-%d') if self.latest_filing else 'No filing found'} for Form {self.latest_filing['form'] if self.latest_filing else None}. Access via: {self.latest_filing['folder_url'] if self.latest_filing else None}
    Latest 10-Q: {self.latest_10Q['filingDate'].strftime('%Y-%m-%d') if self.latest_10Q else 'No filing found'}. Access via: {self.latest_10Q['folder_url'] if self.latest_10Q else None}
    Latest 10-K: {self.latest_10K['filingDate'].strftime('%Y-%m-%d') if self.latest_10K else 'No filing found'}. Access via: {self.latest_10K['folder_url'] if self.latest_10K else None}"""

    def __repr_html__(self) -> str:
        class_name = type(self).__name__
        main_attrs = ['ticker', 'cik', 'submissions', 'filings']
        available_methods = [method_name for method_name in dir(self) if callable(
            getattr(self, method_name)) and not method_name.startswith("_")]
        latest_filing_date = self.latest_filing['filingDate'].strftime(
            '%Y-%m-%d') if self.latest_filing else 'No filing found'
        latest_filing_form = self.latest_filing['form'] if self.latest_filing else None
        latest_filing_folder_url = self.latest_filing['folder_url'] if self.latest_filing else None
        latest_10Q_date = self.latest_10Q['filingDate'].strftime(
            '%Y-%m-%d') if self.latest_10Q else 'No filing found'
        latest_10Q_folder_url = self.latest_10Q['folder_url'] if self.latest_10Q else None
        latest_10K_date = self.latest_10K['filingDate'].strftime(
            '%Y-%m-%d') if self.latest_10K else 'No filing found'
        latest_10K_folder_url = self.latest_10K['folder_url'] if self.latest_10K else None
        return f"""
        <div style="border: 1px solid #ccc; padding: 10px; margin: 10px;">
            <h3>{self.submissions['name']}</h3>
            <h5>{self.submissions['sicDescription']}</h5>
            <p><strong>Ticker:</strong> {self.ticker}</p>
            <p><strong>CIK:</strong> {self.cik}</p>
            <p><strong>Latest filing:</strong> {latest_filing_date} for Form {latest_filing_form}. Access via: <a href="{latest_filing_folder_url}">{latest_filing_folder_url}</a></p>
            <p><strong>Latest 10-Q:</strong> {latest_10Q_date}. Access via: <a href="{latest_10Q_folder_url}">{latest_10Q_folder_url}</a></p>
            <p><strong>Latest 10-K:</strong> {latest_10K_date}. Access via: <a href="{latest_10K_folder_url}">{latest_10K_folder_url}</a></p>
        </div>
        """

# MongoDB Connection

In [16]:
# Built-in libraries
import datetime as dt

# Third party libraries
import streamlit as st
from pymongo import MongoClient, ASCENDING
from pymongo import MongoClient, ASCENDING, IndexModel, UpdateOne
from pymongo.errors import OperationFailure

@st.cache_resource
def init_connection(secrets_name: str = 'mongo'):
    return MongoClient(**st.secrets[secrets_name])


@st.cache_resource(ttl=86400)  # only refresh after 24h
def get_data():
    client = init_connection()
    db = client.FinanceApp
    balance_sheet_collection = db.balance_sheet
    income_collection = db.income_statement
    cash_collection = db.cash_flow_statement
    company_profile = db.company_profile
    historical = db.historical
    stock_split = db.stock_split
    return balance_sheet_collection, income_collection, cash_collection, company_profile, historical, stock_split


class SECDatabase(MyLogger):
    def __init__(self, connection_string):
        super().__init__(name='SECDatabase', level='DEBUG', log_file='././logs.log')
        self.client = MongoClient(connection_string)
        self.db = self.client.SECRawData
        self.tickerdata = self.db.TickerData
        self.tickerfilings = self.db.TickerFilings
        self.sicdb = self.db.SICList
        self.factsdb = self.db.Facts
        try:
            self.tickerdata.create_indexes(
                [IndexModel([('cik', ASCENDING)], unique=True)])
        except OperationFailure as e:
            self.scrape_logger.error(e)

        try:
            self.tickerfilings.create_indexes([IndexModel(
                [('accessionNumber', ASCENDING)], unique=True), IndexModel([('form', ASCENDING)])])
        except OperationFailure as e:
            self.scrape_logger.error(e)

        try:
            self.factsdb.create_indexes(
                [IndexModel([('factId', ASCENDING)], unique=True)])

        except OperationFailure as e:
            self.scrape_logger.error(e)

    @property
    def get_server_info(self):
        return self.client.server_info()

    @property
    def get_collection_names(self):
        return self.db.list_collection_names()

    @property
    def get_tickerdata_index_information(self):
        return self.tickerdata.index_information()

    @property
    def get_tickerfilings_index_information(self):
        return self.tickerfilings.index_information()

    def get_tickerdata(self, cik: str = None, ticker: str = None):
        if cik is not None:
            return self.tickerdata.find_one({'cik': cik})
        elif ticker is not None:
            return self.tickerdata.find_one({'tickers': ticker.upper()})
        else:
            raise Exception('Please provide either a CIK or ticker.')

    def insert_submission(self, submission: dict):
        """Insert submissions into SEC database. CIK is the primary key.

        Args:
            ticker (TickerData): TickerData object

        Returns:
            str: empty string if successful
            str: ticker's cik if failed
        """
        submission['lastUpdated'] = dt.datetime.now()
        try:
            self.tickerdata.update_one({'cik': submission['cik']}, {
                                       '$set': submission}, upsert=True)
            self.scrape_logger.info(
                f'Inserted submissions for {submission["cik"]} into SEC database.')

        except Exception as e:
            self.scrape_logger.error(
                f'Failed to insert submissions for {submission["cik"]} into SEC database. Error: {e}')
            return submission['cik']
        return None

    def insert_filings(self, cik: str, filings: list):
        """Insert filings into SEC database. Each submission has many filings. Accession number is the primary key.

        Args:
            ticker (TickerData): TickerData object

        Returns:
            str: empty string if successful
            str: ticker's cik if failed
        """
        try:
            for doc in filings:
                doc['lastUpdated'] = dt.datetime.now()

            update_requests = [UpdateOne({'accessionNumber': doc['accessionNumber']}, {
                                         '$set': doc}, upsert=True) for doc in filings]

            self.tickerfilings.bulk_write(update_requests)
            self.scrape_logger.info(
                f'Sucessfully updated filings for {cik}...')

        except Exception as e:
            self.scrape_logger.error(
                f'Failed to insert filings for {cik}...{e}')
            return cik
        return None

    def insert_facts(self, accession: str, facts: list):
        """Insert facts into SEC database. Each filing has many facts.

        Args:
            facts (list): A list containing facts for a single filing

        Returns:
            str: empty string if successful
            str: ticker's cik if failed
        """
        try:
            for doc in facts:
                doc['lastUpdated'] = dt.datetime.now()

            fact_update_requests = [UpdateOne({'factId': fact['factId']}, {
                                              '$set': fact}, upsert=True) for fact in facts]

            self.factsdb.bulk_write(fact_update_requests)
            self.scrape_logger.info(f'Updated facts for {accession}...')

        except Exception as e:
            self.scrape_logger.error(
                f'Failed to insert facts for {accession}...{e}')
            return accession
        return None


In [17]:
import os

sec = SECData()
mongo = SECDatabase(os.getenv('mongodb_sec'))

# Context Dataclass

In [18]:
# Built-in libraries
from dataclasses import dataclass
import datetime as dt
import re

# Third party libraries
from bs4.element import Tag


@dataclass
class Context:
    context_tag: Tag

    @property
    def contextId(self) -> str:
        """Get contextId

        Returns:
            str: contextId
        """
        return self.context_tag.attrs.get('id')

    @property
    def entity(self) -> str | None:
        """Get entity

        Returns:
            str: entity
        """
        return self.context_tag.find("entity").text.split()[
            0] if self.context_tag.find("entity") is not None else None

    @property
    def startDate(self) -> dt.datetime | None:
        """Get start date

        Returns:
            dt.datetime: start date
        """
        start = self.context_tag.find("startdate").text if self.context_tag.find(
            "startdate") is not None else None
        return dt.datetime.strptime(start, '%Y-%m-%d') if start is not None else None

    @property
    def endDate(self) -> dt.datetime | None:
        """Get end date

        Returns:
            dt.datetime: end date
        """
        end = self.context_tag.find("enddate").text if self.context_tag.find(
            "enddate") is not None else None
        return dt.datetime.strptime(end, '%Y-%m-%d') if end is not None else None

    @property
    def instant(self):
        """Get instant date

        Returns:
            dt.datetime: instant date
        """
        instant = self.context_tag.find("instant").text if self.context_tag.find(
            "instant") is not None else None
        return dt.datetime.strptime(instant, '%Y-%m-%d') if instant is not None else None

    @property
    def segment(self) -> dict | None:
        """Get segments and tags classifying the segment and store in dict

        Returns:
            dict: dict containing segment and tags classifying the segment
        """
        segment = self.context_tag.find("segment")

        if segment is None:
            return None

        segment_dict = {}

        segment_breakdown = segment.find_all(re.compile('^xbrldi:.*'))

        for i in segment_breakdown:
            segment_dict[i.attrs.get('dimension')] = i.text

        return segment_dict

    def to_dict(self) -> dict:
        """Convert context to dict

        Returns:
            dict: dict containing context information
        """
        return dict(contextId=self.contextId, entity=self.entity, segment=self.segment, startDate=self.startDate, endDate=self.endDate, instant=self.instant)

    def get_segment_length(self) -> int:
        """Get length of segment

        Returns:
            int: length of segment
        """
        segment = self.context_tag.find("segment")

        if segment is None:
            return 0

        return len(segment)

    def __repr__(self):
        return f'Context(contextId={self.contextId}, entity={self.entity}, segment={self.segment}, startDate={self.startDate}, endDate={self.endDate}, instant={self.instant})'

    def __repr_html__(self):
        return f"""
        <div style="border: 1px solid #ccc; padding: 10px; margin: 10px;">
            <h3>Context</h3>
            <p><strong>contextId:</strong> {self.contextId}</p>
            <p><strong>entity:</strong> {self.entity}</p>
            <p><strong>segment:</strong> {self.segment}</p>
            <p><strong>startDate:</strong> {self.startDate}</p>
            <p><strong>endDate:</strong> {self.endDate}</p>
            <p><strong>instant:</strong> {self.instant}</p>
        </div>
        """

    def __str__(self):
        return f'''contextId={self.contextId}
entity={self.entity}
segment={self.segment}
startDate={self.startDate}
endDate={self.endDate}
instant={self.instant}'''


@dataclass
class LinkLabels:
    label_tag: Tag

    @property
    def linkLabelId(self) -> str | None:
        """Get labelId

        Returns:
            str: labelId
        """
        return self.label_tag.attrs.get('id')

    @property
    def xlinkLabel(self) -> str | None:
        """Get linkLabel

        Returns:
            str: linkLabel
        """
        return self.label_tag.attrs.get('xlink:label')

    @property
    def xlinkRole(self) -> str | None:
        """Get linkRole

        Returns:
            str: linkRole
        """
        return self.label_tag.attrs.get('xlink:role')

    @property
    def xlinkType(self) -> str | None:
        """Get linkType

        Returns:
            str: linkType
        """
        return self.label_tag.attrs.get('xlink:type')

    @property
    def xlmnsXml(self) -> str | None:
        """Get xlmnsXml

        Returns:
            str: xlmnsXml
        """
        return self.label_tag.attrs.get('xmlns:xml')

    @property
    def xlmLang(self) -> str | None:
        """Get xlmLang

        Returns:
            str: xlmLang
        """
        return self.label_tag.attrs.get('xml:lang')

    @property
    def labelName(self) -> str | None:
        """Get labelName

        Returns:
            str: labelName
        """
        return self.label_tag.text if self.label_tag.text is not None else None

    def to_dict(self) -> dict:
        """Convert linkLabels to dict

        Returns:
            dict: dict containing linkLabels information
        """
        return dict(linkRole=self.linkRole, linkLabel=self.linkLabel, linkbase=self.linkbase)

    def __repr__(self):
        return f'LinkLabels(linkRole={self.linkRole}, linkLabel={self.linkLabel}, linkbase={self.linkbase})'

    def __repr_html__(self):
        return f"""
        <div style="border: 1px solid #ccc; padding: 10px; margin: 10px;">
            <h3>LinkLabels</h3>
            <p><strong>linkRole:</strong> {self.linkRole}</p>
            <p><strong>linkLabel:</strong> {self.linkLabel}</p>
            <p><strong>linkbase:</strong> {self.linkbase}</p>
        </div>
        """

    def __str__(self):
        return f'''linkRole={self.linkRole}
linkLabel={self.linkLabel}
linkBase={self.linkbase}'''


@dataclass
class Facts:
    fact_tag: Tag

    @property
    def factName(self):
        """Get factName

        Returns:
            str: factName
        """
        return self.fact_tag.name

    @property
    def factId(self):
        """Get factId

        Returns:
            str: factId
        """
        return self.fact_tag.attrs.get('id')

    @property
    def contextRef(self):
        """Get contextRef

        Returns:
            str: contextRef
        """
        return self.fact_tag.attrs.get('contextref')

    @property
    def unitRef(self):
        """Get unitRef

        Returns:
            str: unitRef
        """
        return self.fact_tag.attrs.get('unitref')

    @property
    def decimals(self):
        """Get decimals

        Returns:
            str: decimals
        """
        return self.fact_tag.attrs.get('decimals')

    @property
    def factValue(self):
        """Get factValue

        Returns:
            str: factValue
        """
        return self.fact_tag.text

    def to_dict(self) -> dict:
        """Convert facts to dict

        Returns:
            dict: dict containing facts information
        """
        return dict(factName=self.factName, factId=self.factId, contextRef=self.contextRef, unitRef=self.unitRef, decimals=self.decimals, factValue=self.factValue)

    def __repr__(self):
        return f'Facts(factName={self.factName}, factId={self.factId}, contextRef={self.contextRef}, unitRef={self.unitRef}, decimals={self.decimals}, factValue={self.factValue})'

    def __repr_html__(self):
        return f"""
        <div style="border: 1px solid #ccc; padding: 10px; margin: 10px;">
            <h3>Facts</h3>
            <p><strong>factName:</strong> {self.factName}</p>
            <p><strong>factId:</strong> {self.factId}</p>
            <p><strong>contextRef:</strong> {self.contextRef}</p>
            <p><strong>unitRef:</strong> {self.unitRef}</p>
            <p><strong>decimals:</strong> {self.decimals}</p>
            <p><strong>factValue:</strong> {self.factValue}</p>
        </div>
        """

    def __str__(self):
        return f'''factName={self.factName}
factId={self.factId}
contextRef={self.contextRef}
unitRef={self.unitRef}
decimals={self.decimals}
factValue={self.factValue}'''


# Gather labels, definitions, and calculations xml data

In [212]:
# Third-party libraries
import pandas as pd
import numpy as np
import datetime as dt
import streamlit as st

def reverse_standard_mapping(standard_name_mapping: dict):
    reverse_mapping = {}
    for standard_name, xbrl_tags in standard_name_mapping.items():
        for tag in xbrl_tags:
            reverse_mapping[tag] = standard_name

    return reverse_mapping


def get_filing_facts(ticker: TickerData, filings_to_scrape: list, verbose=False):
    """
    Scrape facts, context, labels, definitions, calculations, metalinks from filings_to_scrape

    ### Parameters
    ----------
    ticker : TickerData
        TickerData object
    filings_to_scrape : list
        list of filings dict to scrape

    ### Returns
    -------
    all_labels : pd.DataFrame
        all labels scraped
    all_calc : pd.DataFrame
        all calculations scraped
    all_defn : pd.DataFrame
        all definitions scraped
    all_context : pd.DataFrame
        all contexts scraped
    all_facts : pd.DataFrame
        all facts scraped
    all_metalinks : pd.DataFrame    
        all metalinks scraped
    all_merged_facts : pd.DataFrame
        all merged facts scraped
    failed_folders : list
        list of failed folders
    """
    all_labels = pd.DataFrame()
    all_calc = pd.DataFrame()
    all_defn = pd.DataFrame()
    all_context = pd.DataFrame()
    all_facts = pd.DataFrame()
    all_metalinks = pd.DataFrame()
    all_merged_facts = pd.DataFrame()
    failed_folders = []

    for file in filings_to_scrape:
        if (file.get('form') != '10-Q' or file.get('form') != '10-K') and file.get('filingDate') < dt.datetime(2009, 1, 1):
            continue

        accessionNumber = file.get('accessionNumber')
        folder_url = file.get('folder_url')
        file_url = file.get('file_url')
        ticker.scrape_logger.info(
            file.get('filingDate').strftime('%Y-%m-%d') + ': ' + folder_url)

        soup = ticker.get_file_data(file_url=file_url)

        try:  # Scrape facts
            facts_list = []
            facts = ticker.search_facts(soup=soup)
            for fact_tag in facts:
                facts_list.append(Facts(fact_tag=fact_tag).to_dict())
            facts_df = pd.DataFrame(facts_list)
            facts_df['accessionNumber'] = accessionNumber
            all_facts = pd.concat([all_facts, facts_df], ignore_index=True)
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape facts for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape facts for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        if len(facts_list) == 0:
            ticker.scrape_logger.info(
                f'No facts found for {ticker.ticker}({ticker.cik})-{folder_url}...\n')
            continue

        try:  # Scrape context
            context_list = []
            contexts = ticker.search_context(soup=soup)
            for tag in contexts:
                context_list.append(Context(context_tag=tag).to_dict())
            context_df = pd.DataFrame(context_list)
            context_df['accessionNumber'] = accessionNumber
            all_context = pd.concat(
                [all_context, context_df], ignore_index=True)
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape context for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape context for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        index_df = ticker.get_filing_folder_index(folder_url=folder_url)

        try:  # Scrape metalinks
            metalinks = ticker.get_metalinks(
                folder_url=folder_url + '/MetaLinks.json')
            metalinks['accessionNumber'] = accessionNumber
            all_metalinks = pd.concat(
                [all_metalinks, metalinks], ignore_index=True)
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape metalinks for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape metalinks for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        try:  # Scrape labels
            labels = ticker.get_elements(folder_url=folder_url, index_df=index_df,
                                         scrape_file_extension='_lab').query("`xlink:type` == 'resource'")
            labels['xlink:role'] = labels['xlink:role'].str.split(
                '/').apply(lambda x: x[-1])
            labels['xlink:labelOriginal'] = labels['xlink:label']
            labels['xlink:label'] = labels['xlink:label']\
                .str.replace('(lab_)|(_en-US)', '', regex=True)\
                    .str.split('_')\
                        .apply(lambda x: ':'.join(x[:2]))\
                .str.lower()
            labels['accessionNumber'] = accessionNumber
            all_labels = pd.concat([all_labels, labels], ignore_index=True)

        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape labels for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape labels for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        try:  # Scrape calculations
            calc = ticker.get_elements(folder_url=folder_url, index_df=index_df,
                                       scrape_file_extension='_cal').query("`xlink:type` == 'arc'")
            calc['accessionNumber'] = accessionNumber
            all_calc = pd.concat([all_calc, calc], ignore_index=True)
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape calc for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape calc for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        try:  # Scrape definitions
            defn = ticker.get_elements(folder_url=folder_url, index_df=index_df,
                                       scrape_file_extension='_def').query("`xlink:type` == 'arc'")
            defn['accessionNumber'] = accessionNumber
            all_defn = pd.concat([all_defn, defn], ignore_index=True)
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to scrape defn for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to scrape defn for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        ticker.scrape_logger.info(
            f'Merging facts with context and labels. Current facts length: {len(facts_list)}...')
        try:
            merged_facts = facts_df.merge(context_df, how='left', left_on='contextRef', right_on='contextId')\
                .merge(labels.query("`xlink:role` == 'label'"), how='left', left_on='factName', right_on='xlink:label')
            merged_facts = merged_facts.drop(
                ['accessionNumber_x', 'accessionNumber_y'], axis=1)

            ticker.scrape_logger.info(
                f'Successfully merged facts with context and labels. Merged facts length: {len(merged_facts)}...')
        except Exception as e:
            ticker.scrape_logger.error(
                f'Failed to merge facts with context and labels for {folder_url}...{e}')
            failed_folders.append(dict(folder_url=folder_url, accessionNumber=accessionNumber,
                                  error=f'Failed to merge facts with context and labels for {folder_url}...{e}', filingDate=file.get('filingDate')))
            pass

        all_merged_facts = pd.concat(
            [all_merged_facts, merged_facts], ignore_index=True)

        ticker.scrape_logger.info(
            f'Successfully scraped {ticker.ticker}({ticker.cik})-{folder_url}...\n')
        if verbose:
            st.success(
                ticker.ticker + ' ' + file.get('filingDate').strftime('%Y-%m-%d'))

    all_merged_facts = all_merged_facts.loc[~all_merged_facts['labelText'].isnull(), [
        'labelText', 'segment', 'startDate', 'endDate', 'instant', 'factValue', 'unitRef']]

    return all_labels, all_calc, all_defn, all_context, all_facts, all_metalinks, all_merged_facts, failed_folders


def translate_labels_to_standard_names(merged_facts: pd.DataFrame, standard_name_mapping: dict):
    merged_facts['standardName'] = merged_facts['labelText'].apply(
        lambda x: standard_name_mapping.get(x, x))

    return merged_facts


def clean_values_in_facts(merged_facts: pd.DataFrame):
    df = merged_facts.loc[
        ~(merged_facts['factValue'].str.contains(
            '[^0-9\.\-]|(^\d+\-\d+\-\d+$)'))
        & (merged_facts['factValue'] != "")
        & (merged_facts['factValue'] != "-")
    ].copy()

    df['factValue'] = df['factValue'].astype(float)

    return df


def segment_breakdown_levels(final_df: pd.DataFrame) -> int:
    dict_len = 0
    for i in final_df['segment']:
        if isinstance(i, dict):
            curr_len = len(list(i.items()))
            if curr_len > dict_len:
                dict_len = curr_len
                if curr_len > 1:
                    print(list(i.items()))

    return dict_len


def clean_values_in_segment(merged_facts: pd.DataFrame, labels_df: pd.DataFrame) -> pd.DataFrame:
    """Segment column of merged facts is cleaned to remove "ticker:" and "us-gaap:" prepend, and to split camel case into separate words (e.g. "us-gaap:RevenuesBeforeTax" becomes "Revenues Before Tax"). 

    Args:
        merged_facts (pd.DataFrame): merged facts data frame from get_filing_facts.

    Returns:
        merged_facts (pd.DataFrame): merged facts data frame with segment column cleaned
    """
    merged_facts['segmentAxis'] = merged_facts['segment']\
        .apply(lambda x: list(x.keys())[0] if isinstance(x, dict) else "")\
        .str.lower()\
        # .apply(lambda x: x.split(':')[1] if x != "" else "")\
    # .str.replace(pat=r'([A-Z])', repl=r' \1', regex=True).str.strip()

    merged_facts['segmentValue'] = merged_facts['segment']\
        .apply(lambda x: list(x.values())[0] if isinstance(x, dict) else "")\
        .str.lower()\
        # .apply(lambda x: x.split(':')[1] if x.find(':') >= 0 else "")\
    # .str.replace(pat=r'([A-Z])', repl=r' \1', regex=True).str.strip()

    merged_facts.drop('segment', axis=1, inplace=True)

    # Merge with labels to get standard names for segment labels
    labels_df = labels_df.query("`xlink:role` == 'label'")[
        ['xlink:label', 'labelText']]
    merged_facts = merged_facts.merge(labels_df, how='left', left_on='segmentAxis', right_on='xlink:label', suffixes=('', '_segmentAxis'))\
        .merge(labels_df, how='left', left_on='segmentValue', right_on='xlink:label', suffixes=('', '_segmentValue'))

    # Fill in missing labelText values with segmentValue
    merged_facts['labelText_segmentValue'].fillna(
        merged_facts['segmentValue'], inplace=True)
    merged_facts.drop(['segmentAxis', 'segmentValue', 'xlink:label',
                      'xlink:label_segmentValue'], axis=1, inplace=True)

    return merged_facts


def split_facts_into_start_instant(merged_facts: pd.DataFrame):
    """Splits facts into start/end and instant

    Args:
        merged_facts (pd.DataFrame): merged facts data frame from get_filing_facts

    Returns:
        merged_facts: merged facts data frame without duplicates on the columns labelText, segment, startDate, endDate, instant, value
        start_end: start/end facts data frame where startDate and endDate are not null
        instant: instant facts data frame where instant is not null
    """
    merged_facts.drop_duplicates(subset=[
        'labelText', 'segment', 'startDate', 'endDate', 'instant', 'factValue'], keep='last', inplace=True)

    start_end = merged_facts.dropna(axis=0, subset=['startDate', 'endDate'])[['labelText', 'segment', 'unitRef',
                                                                              'startDate', 'endDate', 'factValue']].sort_values(by=['labelText', 'segment', 'startDate', 'endDate',])
    instant = merged_facts.dropna(axis=0, subset=['instant'])[
        ['labelText', 'segment', 'unitRef', 'instant', 'factValue']].sort_values(by=['labelText', 'segment', 'instant',])

    return merged_facts, start_end, instant


def get_monthly_period(df: pd.DataFrame) -> pd.DataFrame:
    df['period'] = pd.to_timedelta(
        df['endDate'] - df['startDate']).dt.days / 30.25
    df['period'] = df['period'].round(0)
    df['Months Ended'] = np.select(
        [
            df['period'] == 3,
            df['period'] == 6,
            df['period'] == 9,
            df['period'] == 12,
        ],
        [
            "Three Months Ended",
            "Six Months Ended",
            "Nine Months Ended",
            "Twelve Months Ended",
        ],
        default=None
    )
    return df


# Script to insert submission, filings, and facts for each filing into database

In [None]:
sec = SECData()
sic_dict = sec.get_sic_list()
mongo = SECDatabase(connection_string=os.getenv('mongodb_sec'))

failed_submissions = []
failed_filings = []
failed_facts = []

with trange(len(sec.cik_list['ticker'][:50]), desc='Instantiating ticker...',) as t:
    for item in t:
        ticker = sec.cik_list['ticker'].iloc[item] # Get ticker from cik_list
        t.set_postfix(ticker=ticker, cik=sec.cik_list['cik_str'].iloc[item])

        # Initialize and instantiate TickerData object
        try:
            symbol = TickerData(ticker=ticker)
            cik = symbol.cik # get cik of ticker
            symbol.submissions['lastUpdated'] = dt.datetime.now()
            symbol.submissions['office'] = mongo.sicdb.find_one({'_id': symbol.submissions['sic']})['Office']
            sec.scrape_logger.info(f'{t}')
            sec.scrape_logger.info(f'\nInstantiated {symbol}...')
        except Exception as e:
            sec.scrape_logger.info(f'{t}')
            sec.scrape_logger.error(f'Failed to instantiate {ticker} with cik {cik}...{e}')
            continue

        filings = symbol.submissions.pop('filings')
        # print(filings)
        # Insert submissions to TickerData collection
        inserted_submission = mongo.insert_submission(submission=symbol._submissions)
        if inserted_submission is not None:
            failed_submissions.append(inserted_submission)

        # Insert filings to TickerFilings collection
        inserted_filing = mongo.insert_filings(cik=cik, filings=filings)
        if inserted_filing is not None:
            failed_filings.append(inserted_filing)

        # # Insert facts to Facts collection
        # for doc in filings:
        #     doc['lastUpdated'] = dt.datetime.now()

        #     if doc['form'] == '10-Q' or doc['form'] == '10-K':
        #         try:
        #             facts = symbol.get_facts_for_each_filing(doc)
        #             inserted_facts = mongo.insert_facts(accession=doc['accessionNumber'], facts=facts)
        #             if inserted_facts is not None:
        #                 failed_facts.append(inserted_facts)
        #         except Exception as e:
        #             sec.scrape_logger.error(f'TickerData().get_facts_for_each_filing() function failed for {doc["accessionNumber"]}...{e}')
        #             failed_facts.append(doc['accessionNumber'])
            
        sec.scrape_logger.info(f'Successfully updated {ticker}({cik})...\n')

# Initialize variables for testing

In [23]:
sec = SECData()
ticker = TickerData(ticker='AAPL')
file = ticker.filings.loc[ticker.filings['form'] == '10-K'].iloc[0]
accessionNumber = file.get('accessionNumber')
folder_url = file.get('folder_url')
file_url = file.get('file_url')
soup = ticker.get_file_data(file_url=file_url)
index_df = ticker.get_filing_folder_index(folder_url=folder_url)

# ticker.scrape_logger.info(
#     file.get('filingDate').strftime('%Y-%m-%d') + ': ' + folder_url)

# start_date = dt.datetime(2022, 1, 1) # after XBRL implementation

# query = {
#     'cik': ticker.cik,
#     'form': {'$in': ['10-K']},
#     'filingDate': {'$gte': start_date},
# }

# filings_to_scrape = [i for i in mongo.tickerfilings.find(query).sort('filingDate', 1)]


2024-01-21 15:26:36,099 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/files/company_tickers.json
2024-01-21 15:26:36,942 - sec-scraper - INFO - Request successful at URL: https://data.sec.gov/submissions/CIK0000320193.json
2024-01-21 15:26:37,293 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/index.json
2024-01-21 15:26:37,297 - sec-scraper - INFO - Making http request for AAPL filings...
2024-01-21 15:26:37,298 - sec-scraper - INFO - Additional filings found for AAPL...
2024-01-21 15:26:37,976 - sec-scraper - INFO - Request successful at URL: https://data.sec.gov/submissions/CIK0000320193-submissions-001.json
2024-01-21 15:26:38,439 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/0000320193-23-000106.txt
2024-01-21 15:26:41,094 - sec-scraper - INFO - Parsed file data from https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000

# Test Get Elements vs search tags for getting labels

In [None]:
ticker.get_elements(folder_url=folder_url, 
                    index_df=index_df,
                    scrape_file_extension='_lab').info()
                        # .query("`xlink:type` == 'resource'")

In [None]:
labels_list = []
for i in ticker.search_tags(soup=soup, pattern='^label.*'):
    for x in i.find_all():
        label_dict = dict(**x.attrs, labelText=x.text)
        labels_list.append(label_dict)

pd.DataFrame(labels_list).info()


# Test new context dataclass

In [None]:
context_list = []
contexts_soup = ticker.search_context(soup=soup)

In [None]:
#TODO: get explicit member from segment to categorize the segment by the axis it belongs to
        # the axis is standardized by US GAAP standards
        # e.g. ProductorService Axis = srt:ProductOrServiceAxis
        # e.g. StatementGeographicalAxis Axis = srt:StatementGeographicalAxis
        # use scraped labels to get the axis name (terselabel) like 'Product and Service' or 'Geographical'
        # use the axis name to categorize the segment

all_context = pd.DataFrame()

for tag in contexts_soup:
    context_list.append(Context(context_tag=tag).to_dict())
context_df = pd.DataFrame(context_list)
context_df['accessionNumber'] = accessionNumber
all_context = pd.concat(
    [all_context, context_df], ignore_index=True)
all_context

# Test new facts dataclass

In [None]:
all_facts = pd.DataFrame()
facts_list = []
facts = ticker.search_facts(soup=soup)
for fact_tag in facts:
    facts_list.append(Facts(fact_tag=fact_tag).to_dict())
facts_df = pd.DataFrame(facts_list)
facts_df['accessionNumber'] = accessionNumber
all_facts = pd.concat([all_facts, facts_df], ignore_index=True)
all_facts

In [14]:
fact_len_list = []
for fact_tag in facts:
    fact_len_list.append(len(fact_tag.name))

max(fact_len_list)

100

In [None]:
all_facts.loc[all_facts['factName'].str.contains('us-gaap:adjustmentstoadditionalpaidincapitalsharebasedcompensationrequisiteserviceperiodrecognitionv'),['factName']].values

# Test getting labels from _lab.xml

In [108]:
labels = ticker.get_elements(folder_url=folder_url, index_df=index_df,
                                scrape_file_extension='_lab').query("`xlink:type` == 'resource'")
labels['xlink:role'] = labels['xlink:role'].str.split(
    '/').apply(lambda x: x[-1])
labels['xlink:labelOriginal'] = labels['xlink:label']
labels['xlink:label'] = labels['xlink:label']\
    .str.replace('(lab_)|(_en-US)', '', regex=True)\
        .str.split('_')\
            .apply(lambda x: ':'.join(x[:2]))\
    .str.lower()
labels['accessionNumber'] = accessionNumber

2024-01-21 21:04:48,120 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/aapl-20230930_lab.xml


In [148]:
labels.loc[labels['xlink:labelOriginal'].str.contains('ProductOrServiceAxis')]

Unnamed: 0,roleURI,xlink:type,xlink:href,labelText,xlink:role,id,xlink:label,xml:lang,xlink:arcrole,xlink:from,xlink:to,order,xlink:labelOriginal,accessionNumber
1915,,resource,,Product and Service [Axis],terseLabel,lab_srt_ProductOrServiceAxis_3eed9b08-8241-45d7-959f-4f72aba4e68d_terseLabel_en-US,srt:productorserviceaxis,en-US,,,,,lab_srt_ProductOrServiceAxis,0000320193-23-000106
1916,,resource,,Product and Service [Axis],label,lab_srt_ProductOrServiceAxis_label_en-US,srt:productorserviceaxis,en-US,,,,,lab_srt_ProductOrServiceAxis,0000320193-23-000106


# Test get filing Facts

In [48]:
form = '10-K'
start_year = 2009
end_year = 2023

filing_available = ticker.filings[(ticker.filings['form'] == form) & (
    ticker.filings['filingDate'].dt.year >= start_year) & (ticker.filings['filingDate'].dt.year <= end_year)]
filing_available = filing_available.to_dict('records')

In [49]:
all_labels, all_calc, all_defn, all_context, all_facts, all_metalinks, all_merged_facts, failed_folders = get_filing_facts(ticker=ticker, filings_to_scrape=filing_available, verbose=True)

2024-01-21 15:38:52,481 - sec-scraper - INFO - 2023-11-03: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106
2024-01-21 15:38:53,000 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/0000320193-23-000106.txt
2024-01-21 15:38:56,399 - sec-scraper - INFO - Parsed file data from https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/0000320193-23-000106.txt successfully.
2024-01-21 15:38:57,554 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106/index.json
2024-01-21 15:38:57,557 - sec-scraper - ERROR - Failed to scrape metalinks for https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106...TickerData.get_metalinks() got an unexpected keyword argument 'folder_url'
2024-01-21 15:38:57,928 - sec-scraper - INFO - Request successful at URL: https://www.sec.gov/Archives/edgar/data/0000320193/000032019323000106

In [None]:
# write all_labels, all_calc, all_defn to xlsx on different sheets
with pd.ExcelWriter(f'././data/{ticker.ticker}_all_data.xlsx') as writer:
    all_facts.to_excel(writer, sheet_name='facts', index=False)
    all_context.to_excel(writer, sheet_name='context', index=False)
    all_labels.to_excel(writer, sheet_name='labels', index=False)
    all_merged_facts.to_excel(writer, sheet_name='merged_facts', index=False)
    all_calc.to_excel(writer, sheet_name='calc', index=False)
    all_defn.to_excel(writer, sheet_name='defn', index=False)
    all_metalinks.to_excel(writer, sheet_name='metalinks', index=False)

In [185]:
final_df = clean_values_in_facts(all_merged_facts)
final_df = clean_values_in_segment(final_df, labels_df=all_labels)
final_df = get_monthly_period(final_df)
# final_df, start_end, instant = split_facts_into_start_instant(final_df)


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



In [211]:
final_df.loc[final_df['labelText'].str.contains('Income'), 'labelText'].unique()

array(['Operating Income (Loss)', 'Nonoperating Income (Expense)',
       'Income (Loss) from Continuing Operations before Income Taxes, Noncontrolling Interest',
       'Income Tax Expense (Benefit)', 'Net Income (Loss)',
       'Other Comprehensive Income (Loss), Foreign Currency Transaction and Translation Adjustment, Net of Tax',
       'Other Comprehensive Income (Loss), Reclassification Adjustment from AOCI for Sale of Securities, Net of Tax',
       'Other Comprehensive Income (Loss), Net of Tax, Portion Attributable to Parent',
       'Comprehensive Income (Loss), Net of Tax, Attributable to Parent',
       'Accumulated Other Comprehensive Income (Loss), Net of Tax',
       'Other Noncash Income (Expense)', 'Income Taxes Paid, Net',
       'Deferred Income Tax Assets, Net', 'Accrued Income Taxes, Current',
       'Accrued Income Taxes, Noncurrent',
       'Investment Income, Interest and Dividend',
       'Other Nonoperating Income (Expense)',
       'Deferred Federal Income Ta

# Find out how many breakdowns segment has

In [78]:
def segment_breakdown_levels(final_df: pd.DataFrame) -> int:
    dict_len = 0
    for i in final_df['segment']:
        if isinstance(i, dict):
            curr_len = len(list(i.items()))
            if  curr_len > dict_len:
                dict_len = curr_len
                if curr_len > 1:
                    print(list(i.items()))

    return dict_len

segment_breakdown_levels(final_df)

[('us-gaap:FairValueByFairValueHierarchyLevelAxis', 'us-gaap:FairValueInputsLevel1Member'), ('us-gaap:FinancialInstrumentAxis', 'us-gaap:MoneyMarketFundsMember')]
[('us-gaap:BalanceSheetLocationAxis', 'us-gaap:OtherAssetsMember'), ('us-gaap:DerivativeInstrumentRiskAxis', 'us-gaap:ForeignExchangeContractMember'), ('us-gaap:FairValueByFairValueHierarchyLevelAxis', 'us-gaap:FairValueInputsLevel2Member'), ('us-gaap:HedgingDesignationAxis', 'us-gaap:DesignatedAsHedgingInstrumentMember')]


4

# Test merging labels with labels from xbrl us-gaap xsd document

In [16]:
xbrl_us_gaap = 'http://xbrl.fasb.org/us-gaap/2024/elts/us-gaap-2024.xsd'
xbrl_srt = 'http://xbrl.fasb.org/srt/2024/elts/srt-std-2024.xsd'

In [None]:
labels = sec.us_gaap_tags
labels['id'] = labels['id'].str.split('_', n=1).str.join(':').str.lower()
merged_fact_with_label = all_facts.merge(labels, how='left', left_on='factName', right_on='id')
[i for i in merged_fact_with_label.loc[merged_fact_with_label['id'].isnull(),'factName']]

In [20]:
label_len_list = []
for label in labels['id']:
    label_len_dict = {}
    label_len_dict['label_name'] = label
    label_len_dict['label_len'] = len(label)
    label_len_list.append(label_len_dict)

pd.DataFrame(label_len_list).sort_values(by='label_len', ascending=False)

Unnamed: 0,label_name,label_len
16119,us-gaap:elementnameandstandardlabelinmaturitynumericlowerendtonumerichigherenddatemeasurememberormaturitygreaterthanlowendnumericvaluedatemeasurememberormaturitylessthanhighendnumericvaluedatemeasurememberformatsguidance,220
11402,us-gaap:qualitativeandquantitativeinformationassetsorliabilitiesfortransferorscontinuinginvolvementsecuritizationorassetbackedfinancingarrangementprincipalamountoutstandingabstract,180
8205,us-gaap:qualitativeandquantitativeinformationassetsorliabilitiesfortransferorscontinuinginvolvementinsecuritizationorassetbackedfinancingarrangementarrangementsoffinancialsupport,178
17029,us-gaap:investmentprogramproportionalamortizationmethodelectedincometaxcreditandotherincometaxbenefitbeforeamortizationstatementofincomeorcomprehensiveincomeextensibleenumeration,178
11365,us-gaap:otherthantemporaryimpairmentlossesinvestmentsportioninothercomprehensiveincomelossbeforetaxincludingportionattributabletononcontrollinginterestavailableforsalesecurities,177
...,...,...
9255,us-gaap:retains,15
5370,us-gaap:capital,15
5812,us-gaap:assets,14
5704,us-gaap:cash,12


# Test Reverse Mapping

In [225]:
STANDARD_NAME_MAPPING = {
    'Revenue':
        [
            'Revenue from Contract with Customer, Excluding Assessed Tax',
            'Revenues',
            'Revenue, Net',
            'Sales Revenue Net',
            'Sales Revenue, Net',],
    'Cost of Goods or Services':
        [
            'Cost of Goods and Services Sold',
            'Cost Of Goods And Services Sold',
        ],
    'Gross Profit':
        [
            '',
        ],
    'Operating Expenses':
        [
            '',
        ],
    'Operating Income':
        [
            '',
        ],
    'Interest Expense':
        [
            '',
        ],
    'Income Before Tax':
        [
            '',
        ],
    'Income Tax Expense':
        [
            'Income Taxes Paid, Net',
            'Income Taxes Paid Net'
        ],
    'Net Income':
        [
            'Net Income (Loss)',
            'Net income',
            'Net sales',
            'Net Income Loss',
        ],
    'Cash and Cash Equivalents':
        [
            '',
        ],
    'Short Term Investments':
        [
            '',
        ],
    'Total Cash':
        [
            '',
        ],
    'Net Receivables':
        [
            '',
        ],
    'Inventory':
        [
            '',
        ],
    'Other Current Assets':
        [
            '',
        ],
    'Total Current Assets':
        [
            '',
        ],
    'Long Term Investments':
        [
            '',
        ],
    'Property Plant and Equipment':
        [
            '',
        ],
    'Goodwill':
        [
            '',
        ],
    'Intangible Assets':
        [
            '',
        ],
    'Other Assets':
        [
            '',
        ],
    'Total Assets':
        [
            '',
        ],
    'Accounts Payable':
        [
            '',
        ],
    'Short/Current Long Term Debt':
        [
            '',
        ],
    'Other Current Liabilities':
        [
            '',
        ],
    'Total Current Liabilities':
        [
            '',
        ],
    'Long Term Debt':
        [
            '',
        ],
    'Other Liabilities':
        [
            '',
        ],
    'Total Liabilities':
        [
            '',
        ],
    'Common Stock':
        [
            '',
        ],
    'Retained Earnings':
        [
            '',
        ],
    'Treasury Stock':
        [
            '',
        ],
    'Capital Surplus':
        [
            '',
        ],
    'Shareholder Equity':
        [
            '',
        ],
    'Net Tangible Assets':
        [
            '',
        ],
    'Total Stockholders Equity':
        [
            '',
        ],
    'Net Cash Flow':
        [
            '',
        ],
    'Net Cash Flow-Operating':
        [
            '',
        ],
    'Net Cash Flows-Investing':
        [
            '',
        ],
    'Net Cash Flows-Financing':
        [
            '',
        ],
    'Effect of Exchange Rate Changes':
        [
            '',
        ],
    'Net Change in Cash':
        [
            '',
        ],
    'Cash Interest Paid':
        [
            '',
        ],
    'Cash Taxes Paid':
        [
            '',
        ],
    'Depreciation and Amortization':
        [
            '',
        ],
    'Capital Expenditures':
        [
            '',
        ],
    'Change in Working Capital':
        [
            '',
        ],
    'Free Cash Flow':
        [
            '',
        ],
    'Free Cash Flow per Share':
        [
            '',
        ],
    'Operating Cash Flow per Share':
        [
            '',
        ],
    'Cash per Share':
        [
            '',
        ],
    'Book Value per Share':
        [
            '',
        ],
    'Tangible Book Value per Share':
        [
            '',
        ],
    'Shareholders Equity per Share':
        [
            '',
        ],
    'Interest Debt per Share':
        [
            '',
        ],
    'Market Cap':
        [
            '',
        ],
    'Enterprise Value':
        [
            '',
        ],
    'PE Ratio':
        [
            '',
        ],
    'Price to Sales Ratio':
        [
            '',
        ],
    'POCF Ratio':
        [
            '',
        ],
    'PFCF Ratio':
        [
            '',
        ],
    'PB Ratio':
        [
            '',
        ],
    'PTB Ratio':
        [
            '',
        ],
    'EV to Sales':
        [
            '',
        ],
    'Enterprise Value over EBITDA':
        [
            '',
        ],
    'EV to Operating cash flow':
        [
            '',
        ],
}

reversed_mapping = reverse_standard_mapping(standard_name_mapping=STANDARD_NAME_MAPPING)

translated_df = translate_labels_to_standard_names(merged_facts=all_merged_facts, standard_name_mapping=reversed_mapping)

translated_df

Unnamed: 0,labelText,segment,startDate,endDate,instant,factValue,unitRef,standardName
8,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period",{'us-gaap:RevenueRemainingPerformanceObligationExpectedTimingOfSatisfactionStartDateAxis': ' 2023-10-01 '},NaT,NaT,2023-09-30,P1Y,,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period"
9,"Revenue, Remaining Performance Obligation, Percentage",{'us-gaap:RevenueRemainingPerformanceObligationExpectedTimingOfSatisfactionStartDateAxis': ' 2023-10-01 '},NaT,NaT,2023-09-30,0.67,number,"Revenue, Remaining Performance Obligation, Percentage"
10,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period",{'us-gaap:RevenueRemainingPerformanceObligationExpectedTimingOfSatisfactionStartDateAxis': ' 2024-09-29 '},NaT,NaT,2023-09-30,P1Y,,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period"
11,"Revenue, Remaining Performance Obligation, Percentage",{'us-gaap:RevenueRemainingPerformanceObligationExpectedTimingOfSatisfactionStartDateAxis': ' 2024-09-29 '},NaT,NaT,2023-09-30,0.25,number,"Revenue, Remaining Performance Obligation, Percentage"
12,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period",{'us-gaap:RevenueRemainingPerformanceObligationExpectedTimingOfSatisfactionStartDateAxis': ' 2025-09-28 '},NaT,NaT,2023-09-30,P1Y,,"Revenue, Remaining Performance Obligation, Expected Timing of Satisfaction, Period"
...,...,...,...,...,...,...,...,...
15965,Tax benefit from employee stock plan awards,{'us-gaap:StatementEquityComponentsAxis': 'us-gaap:CommonStockMember'},2008-09-28,2009-09-26,NaT,-78000000,USD,Tax benefit from employee stock plan awards
15966,"Stock Issued During Period, Shares, Share-based Compensation",{'us-gaap:StatementEquityComponentsAxis': 'us-gaap:CommonStockMember'},2008-09-28,2009-09-26,NaT,11480000,Shares,"Stock Issued During Period, Shares, Share-based Compensation"
15967,"Stock Issued During Period, Value, Share-based Compensation",{'us-gaap:StatementEquityComponentsAxis': 'us-gaap:CommonStockMember'},2008-09-28,2009-09-26,NaT,404000000,USD,"Stock Issued During Period, Value, Share-based Compensation"
15968,Net income,{'us-gaap:StatementEquityComponentsAxis': 'us-gaap:RetainedEarningsMember'},2008-09-28,2009-09-26,NaT,5704000000,USD,Net Income


# Parse using GPT (test)

In [None]:
context = symbol.search_context(soup)[0]
data = {
    'id': context['id'],
    'entity': {
        'identifier': {
            'scheme': context.find('identifier')['scheme'],
            'value': context.find('identifier').text
        }
    },
    'period': {
        'startDate': context.find('startdate').text,
        'endDate': context.find('enddate').text
    }
}

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import XMLOutputParser
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessage,
    HumanMessagePromptTemplate,
)
from langchain.callbacks import get_openai_callback
import json

llm = ChatOpenAI(temperature=0)

parser = XMLOutputParser(tags=['id', 'entity', 'period'])
template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are a helpful assistant that parses XML files for a company's financial statements from the SEC Edgar database."
                "The XML content will be provided by the user."
                "You will parse the output and return it in the json format."
                "{format_instructions}"
            )
        ),
        HumanMessagePromptTemplate.from_template("{xml}"),
    ]
)

context_list = []
total_cost = 0
total_tokens = 0

llm = ChatOpenAI()
with trange(len(contexts[:]), desc='Scraping contexts...') as t:
    for i in t:
        with get_openai_callback() as cb:
            t.set_postfix(context=contexts[i].attrs.get('id'))
            output = llm(template.format_messages(format_instructions=parser.get_format_instructions(), xml=contexts[i]))
            total_cost += cb.total_cost
            total_tokens += cb.total_tokens
            context_list.append(json.loads(output.content))


# Test Plots

In [81]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'browser'
pio.renderers

Renderers configuration
-----------------------
    Default renderer: 'browser'
    Available renderers:
        ['plotly_mimetype', 'jupyterlab', 'nteract', 'vscode',
         'notebook', 'notebook_connected', 'kaggle', 'azure', 'colab',
         'cocalc', 'databricks', 'json', 'png', 'jpeg', 'jpg', 'svg',
         'pdf', 'browser', 'firefox', 'chrome', 'chromium', 'iframe',
         'iframe_connected', 'sphinx_gallery', 'sphinx_gallery_png']

In [None]:
metric_df = final_df
metric_df = metric_df.loc[\
    (metric_df['labelText'].str.contains('Sales Revenue |'))\
    & (metric_df['segmentAxis'] == 'Statement Business Segments Axis')\
].sort_values(by=['labelText', 'segmentAxis', 'segmentValue', 'startDate', 'endDate'])
print(metric_df['segmentValue'].unique())

metric_df = metric_df.loc[\
    (metric_df['segmentValue'] == 'Europe Member')\
]
metric_df = metric_df.drop_duplicates(subset=['labelText', 'segmentAxis', 'segmentValue', 'startDate', 'endDate'], keep='last',)
metric_df

In [101]:
final_df['labelText'].unique()

array(['Revenue, Remaining Performance Obligation, Percentage',
       'Revenue from Contract with Customer, Excluding Assessed Tax',
       'Cost of Goods and Services Sold', 'Gross Profit',
       'Research and Development Expense',
       'Selling, General and Administrative Expense',
       'Operating Expenses', 'Operating Income (Loss)',
       'Nonoperating Income (Expense)',
       'Income (Loss) from Continuing Operations before Income Taxes, Noncontrolling Interest',
       'Income Tax Expense (Benefit)', 'Net Income (Loss)',
       'Earnings Per Share, Basic', 'Earnings Per Share, Diluted',
       'Weighted Average Number of Shares Outstanding, Basic',
       'Weighted Average Number of Shares Outstanding, Diluted',
       'Other Comprehensive Income (Loss), Foreign Currency Transaction and Translation Adjustment, Net of Tax',
       'OCI, Debt Securities, Available-for-Sale, Unrealized Holding Gain (Loss), before Adjustment, after Tax',
       'Other Comprehensive Income (Lo

In [94]:
# Create a line plot
fig = px.line(metric_df, x='endDate', y='factValue',
                color='labelText', line_group='labelText',
                #   hover_data={'change': ':,'},
                )
# Overlay a scatter plot for the individual points
fig.add_trace(
    go.Scatter(
        x=metric_df['endDate'],
        y=metric_df['factValue'],
        mode='markers',
        # marker=dict(
        #     color=metric_df['color'].map(
        #         {'increase': 'green', 'decrease': 'red', 'neutral': 'grey'}),
        #     size=15,
        #     symbol=metric_df['color'].map(
        #         {'increase': 'triangle-up', 'decrease': 'triangle-down', 'neutral': 'circle'})
        # ),
        hoverinfo='skip',
        showlegend=False
    )
)
# for trace in fig.data:
#     print(trace)
# Customize the layout
fig.update_layout(
    title='Metrics over time',
    xaxis_title='End Date',
    yaxis_title='Value',
    legend_title='Segment Axis',
    font=dict(
        family='Courier New, monospace',
        size=18,
        color='RebeccaPurple'
    ),
    hovermode='x unified'
)

fig.update_xaxes(autorange=True)
fig.update_yaxes(autorange=True, rangemode="tozero")


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

