In [None]:
from eventtrader.keys import SEC_API_KEY  # Import directly from the package


In [None]:
import asyncio
import websockets
import json
SERVER_URL = "wss://stream.sec-api.io"
WS_ENDPOINT = SERVER_URL + "?apiKey=" + SEC_API_KEY
async def websocket_client():
    """Connect to WS_ENDPOINT and print every filing received.

    Each incoming message is decoded with ``json.loads`` and iterated; every
    element is printed in full. The loop only ends when an exception is
    raised. Any ``Exception`` raised while connecting, receiving or decoding
    is caught and reported with ``print``, after which the coroutine returns.
    """
    try:
        async with websockets.connect(WS_ENDPOINT) as connection:
            print("✅ Connected to:", SERVER_URL)
            while True:
                raw_message = await connection.recv()
                for filing in json.loads(raw_message):
                    print(f"Full filing info: {filing}")
                    # Compact alternative:
                    # print(filing["accessionNo"], filing["formType"], filing["filedAt"], filing["cik"])
    except Exception as error:
        print(f"An unexpected error occurred: {error}")
# asyncio.run(websocket_client())
await websocket_client()

# Filtering Rules

Original filing format: {'id': '2d18c83f53d9561f521252206a523cdb', 'accessionNo': '0001949846-25-000046', 'cik': '1773383', 'ticker': 'DT', 'companyName': 'Dynatrace, Inc.', 'companyNameLong': 'Dynatrace, Inc. (Subject)', 'formType': '144', 'description': 'Form 144 - Report of proposed sale of securities', 'filedAt': '2025-02-14T10:25:23-05:00', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/0001949846-25-000046.txt', 'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/0001949846-25-000046-index.htm', 'linkToXbrl': '', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/xsl144X01/primary_doc.xml', 'entities': [{'companyName': 'Dynatrace, Inc. (Subject)', 'cik': '1773383', 'irsNo': '000000000', 'fiscalYearEnd': '0331', 'sic': '7372 Services-Prepackaged Software', 'undefined': '06 Technology)'}], 'documentFormatFiles': [{'sequence': '1', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/xsl144X01/primary_doc.xml', 'type': '144', 'size': '\xa0'}, {'sequence': '1', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/primary_doc.xml', 'type': '144', 'size': '2966'}, {'sequence': '\xa0', 'description': 'Complete submission text file', 'documentUrl': 'https://www.sec.gov/Archives/edgar/data/1773383/000194984625000046/0001949846-25-000046.txt', 'type': '\xa0', 'size': '4442'}], 'dataFiles': [], 'seriesAndClassesContractsInformation': []}


1. Filter filings based on the "formType":

- Keep filings where "formType" is one of ['8-K', '10-K', '10-Q', '8-K/A', '10-K/A', '10-Q/A']
- Discard any filing where "formType" does not match the above list


2. For each filing, process the data based on the "formType":
a. If "formType" is one of ['10-K', '10-Q', '10-K/A', '10-Q/A']:

- Check the 'dataFiles' array for an object where 'type' is 'XML'
- If found, extract the URL from the 'documentUrl' field within that object
- If no XML data is found, discard the filing


b. If "formType" is one of ['8-K', '8-K/A']:

- Check the 'dataFiles' array for an object where 'type' is 'XML'
- If found, extract the URL from the 'documentUrl' field within that object
- If not found, get the URL from the 'linkToTxt' field


3. Process the 'documentFormatFiles' array for each filing:

- Iterate through the 'documentFormatFiles' array
- For each object in the array, check if the 'type' field starts with 'EX-10.' or 'EX-99.'
- If the 'type' field matches 'EX-10.' or 'EX-99.', extract the URL from the 'documentUrl' field within that object
- Discard all other objects in the 'documentFormatFiles' array where the 'type' field does not start with 'EX-10.' or 'EX-99.'


4. Output a single line for each filing that includes:

- Base fields (e.g., "id", "companyName", "formType", "filedAt", etc.)
- Entity fields from the "entities" array (e.g., "cik", "companyName", etc.)
- A dictionary containing the extracted 'EX-10.x' and 'EX-99.x' exhibits along with their associated URLs


The goal is to have each row of the output contain all the relevant metadata for a single filing, including the primary form URL (based on the "formType" rules) and the associated 'EX-10.x' and 'EX-99.x' exhibit URLs (if any).

    RULE: Discard any filing whose "formType" is not one of ['8-K', '10-K', '10-Q', '8-K/A', '10-K/A', '10-Q/A']

    
    * Note: the output for each filing below is a single line containing **all metadata for that filing**. That is, each row includes the base fields and entity fields, as well as a dictionary containing any 'EX-10.x' or 'EX-99.x' exhibits and their associated URLs (based on the rules below).


    RULE: IF "formType" is one of ['10-K', '10-Q', '10-K/A', '10-Q/A'], keep the following data:
        1. In 'dataFiles' > 'type' == 'XML', get url from 'documentUrl' (in 'dataFiles')
        2. Discard if no XML data is found

    RULE: IF "formType" is one of ['8-K', '8-K/A'], keep the following data:
        1. If 'dataFiles' > 'type' == 'XML', get url from 'documentUrl' (in 'dataFiles') else get url from 'linkToTxt'

    RULE for finding EX-10.x and EX-99.x for each "formType" above:
        1. In 'documentFormatFiles' > 'type' == 'EX-10.x' or 'EX-99.x', get value from 'documentUrl' (in documentFormatFiles)
        2. Discard all other 'documentFormatFiles' > 'type' and in the same line store 'EX-10.x' or 'EX-99.x' along with their associated url ('documentUrl')

The overall idea is that each output line contains one primary form (one of ['8-K', '10-K', '10-Q', '8-K/A', '10-K/A', '10-Q/A']) along with the URLs of its associated EX-10.x and EX-99.x exhibits.
        

In [None]:
link = 'https://www.sec.gov/Archives/edgar/data/937098/000093709825000014/0000937098-25-000014.txt'



### Storing the data in a JSON file


In [None]:
import asyncio
import websockets
import json
from eventtrader.keys import SEC_API_KEY


SERVER_URL = "wss://stream.sec-api.io"
WS_ENDPOINT = SERVER_URL + "?apiKey=" + SEC_API_KEY
OUTPUT_FILE = "sec_filings.json"  # File to save valid JSON data

async def websocket_client(save_every=10):
    """Stream filings from WS_ENDPOINT and persist them through ``save_data``.

    Every message is decoded with ``json.loads`` and its elements are appended
    to an in-memory list. The whole list is written out once at least
    ``save_every`` filings have arrived since the previous write, and once
    more when the coroutine exits, so filings received after the last
    periodic write are not lost.

    Args:
        save_every: Minimum number of newly received filings between two
            writes. Values below 1 are treated as 1.

    Any ``Exception`` raised while connecting, receiving or decoding is
    caught and reported with ``print``.
    """
    save_every = max(1, int(save_every))
    filings_list = []
    saved_count = 0  # len(filings_list) at the time of the last write

    try:
        async with websockets.connect(WS_ENDPOINT) as websocket:
            print("✅ Connected to:", SERVER_URL)

            while True:
                message = await websocket.recv()
                filings_list.extend(json.loads(message))

                # A single message may carry several filings, so compare
                # against the count at the last write instead of testing for
                # an exact multiple (the previous `% 1 == 0` test was always
                # true and rewrote the file on every message).
                if len(filings_list) - saved_count >= save_every:
                    save_data(filings_list)
                    saved_count = len(filings_list)

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
    finally:
        # Flush filings received since the last periodic write; this also
        # runs when the cell is interrupted or the task is cancelled.
        if len(filings_list) > saved_count:
            save_data(filings_list)

# Save the JSON data correctly
def save_data(filings_list):
    """Write ``filings_list`` to OUTPUT_FILE as indented JSON, replacing any previous content."""
    with open(OUTPUT_FILE, mode="w", encoding="utf-8") as output_handle:
        # indent=4 keeps the saved file human-readable
        json.dump(filings_list, output_handle, indent=4)
    print(f"✅ Data saved to {OUTPUT_FILE}")

# Run WebSocket client
await websocket_client()


#### Load saved JSON file

In [None]:
import json
import pandas as pd

# Load JSON file
file_path = "sec_filings.json"
with open(file_path, "r", encoding="utf-8") as file:
    filings_data = json.load(file)

# Flatten function to extract nested fields
def flatten_filing(filing):
    """Expand one filing dict into a list of flat row dicts.

    Every row starts with the same top-level filing fields and is then
    extended with the prefixed fields of exactly one nested record:

    * one row per item of ``entities`` (``entity_*`` keys),
    * one row per item of ``documentFormatFiles`` (``doc_*`` keys),
    * one row per item of ``dataFiles`` (``data_*`` keys),
    * one row per class/contract of each item of
      ``seriesAndClassesContractsInformation`` (``series_*`` and
      ``classContract*`` keys); a series without ``classesContracts``
      produces no row.

    Keys missing from the input are filled with ``None``. Rows are returned in
    the order listed above; a filing with no nested records yields ``[]``.
    """
    top_level_keys = (
        "id", "accessionNo", "cik", "ticker", "companyName", "companyNameLong",
        "formType", "description", "filedAt", "linkToTxt", "linkToHtml",
        "linkToXbrl", "linkToFilingDetails", "periodOfReport", "effectivenessDate",
    )
    shared = {key: filing.get(key) for key in top_level_keys}

    def rows_for(section, prefix, field_names):
        """Build one row per record under ``filing[section]``, prefixing its fields."""
        return [
            {**shared, **{f"{prefix}_{name}": record.get(name) for name in field_names}}
            for record in filing.get(section, [])
        ]

    file_fields = ("sequence", "description", "documentUrl", "type", "size")

    entity_rows = rows_for(
        "entities",
        "entity",
        (
            "companyName", "cik", "irsNo", "stateOfIncorporation", "fiscalYearEnd",
            "type", "act", "fileNo", "filmNo", "sic",
        ),
    )
    doc_rows = rows_for("documentFormatFiles", "doc", file_fields)
    data_rows = rows_for("dataFiles", "data", file_fields)

    # Series information is nested one level deeper: rows are emitted per
    # class/contract, each carrying the fields of its parent series.
    series_rows = []
    for series in filing.get("seriesAndClassesContractsInformation", []):
        series_part = {
            **shared,
            "series_id": series.get("series"),
            "series_name": series.get("name"),
        }
        series_rows.extend(
            {
                **series_part,
                "classContract": contract.get("classContract"),
                "classContract_name": contract.get("name"),
                "classContract_ticker": contract.get("ticker"),
            }
            for contract in series.get("classesContracts", [])
        )

    return entity_rows + doc_rows + data_rows + series_rows

# Flatten every filing and gather the resulting rows in a single list
all_filing_rows = []
for filing in filings_data:
    all_filing_rows += flatten_filing(filing)

# Build the DataFrame once from the collected row dicts
df = pd.DataFrame(all_filing_rows)

df.head(2)

In [59]:
# df.to_csv("sec_filings.csv", index=False)


### Data Types

In [None]:
from collections import Counter
# Get unique data types and their counts using Counter
data_type_counts = Counter(df.data_type.dropna())
data_type_counts


In [None]:
df[df.data_type == "XML"]['data_documentUrl'].nunique()

In [None]:
Counter(df[df.data_type == "XML"]['formType'])

In [None]:
Counter(df.formType)

In [None]:
df.id.nunique(), df.accessionNo.nunique()

In [None]:
df[df.data_type == "XML"].id.nunique()

In [None]:
Counter(df[(df.data_type == "XML") & (df.formType.isin(['8-K','10-K','10-Q']))].formType)

In [None]:
counter_result = df[(df.data_type == "XML") & (df.formType.isin(['8-K', '10-K', '10-Q']))].formType.value_counts()
counter_result


# Final Rules

#### Step 1: Filter for only these 3 report types - '8-K','10-K','10-Q' or for amendments: '8-K/A', '10-K/A', '10-Q/A'

In [None]:
# Primary report types to keep, plus their "/A" amendment variants
form_types = ['8-K', '10-K', '10-Q']
reports_to_keep = form_types + [f"{form_type}/A" for form_type in form_types]

# Rows that belong to a kept report type AND describe an XML data file.
# This used to exist only as a commented-out line (and wrapped in a list, which
# is not a valid boolean indexer), so the following `df[mask1]` cell failed
# with a NameError on a fresh kernel.
mask1 = df.formType.isin(reports_to_keep) & (df.data_type == "XML")

reports_to_keep

In [None]:
df[mask1]

In [None]:
print(f"Length of df before:{len(df)}")
# df = df[df.formType.isin(reports_to_keep) & (df.data_type == "XML")]
print(f"Length of df after:{len(df)}")

### Supplementary documents to store (ALWAYS filed as part of a primary filing: 10-K, 10-Q, or 8-K)
    All EX-10.x (Material contracts)
    All EX-99.x (Miscellaneous exhibits)

In [None]:
df[df.formType.isin(['8-K', '10-K', '10-Q'])].doc_type.unique()

In [35]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [49]:
# Filter for EX-10.x and EX-99.x
# Rows that do not come from `documentFormatFiles` have no doc_type (NaN).
# `na=False` maps those to False explicitly, so `mask` is strictly boolean and
# can be used as `df[mask]` without relying on how `|` treats missing values.
mask = (
    df['doc_type'].str.startswith('EX-10.', na=False)
    | df['doc_type'].str.startswith('EX-99.', na=False)
)
# df[mask]


In [None]:
import pandas as pd
from collections import Counter

def _categorize_doc_type(doc_type):
    """Map a single ``doc_type`` value to its document category label."""
    if isinstance(doc_type, str):
        if doc_type.startswith('EX-10.'):
            return 'MATERIAL_CONTRACT'
        if doc_type.startswith('EX-99.'):
            return 'MISC_EXHIBIT'
    # Missing values and every non-exhibit document type count as MAIN
    return 'MAIN'


def filter_sec_filings(df):
    """
    Filter flattened SEC filing rows and label each kept row with a category.

    Rules:
    1. Keep 10-K/10-Q (and /A amendments) only when their accession number has
       at least one XBRL-related ``data_type`` (matching ``XML`` or ``EX-101.``)
    2. Keep all 8-K (and 8-K/A)
    3. Keep the EX-10.x and EX-99.x exhibits of the kept accession numbers

    Args:
        df: DataFrame with at least the columns ``accessionNo``, ``formType``,
            ``doc_type`` and ``data_type``.

    Returns:
        A new DataFrame holding every row of the kept filings exactly once,
        with an added ``document_category`` column whose value is
        ``MATERIAL_CONTRACT`` (EX-10.x), ``MISC_EXHIBIT`` (EX-99.x) or
        ``MAIN`` (everything else, including rows without a ``doc_type``).

    Side effects:
        Prints an XBRL validation summary and the XBRL data types found.
    """
    # Define XBRL-related patterns
    xbrl_pattern = r'XML|EX-101\.'
    has_xbrl_type = df['data_type'].str.contains(xbrl_pattern, na=False)

    # Accession numbers of filings with XBRL data
    xbrl_accessions = df.loc[has_xbrl_type, 'accessionNo'].unique()

    # Print XBRL validation info
    print("\nXBRL Validation Summary:")
    ten_k_q = df[df['formType'].isin(['10-K', '10-Q'])]
    total_acc = set(ten_k_q['accessionNo'].unique())
    xbrl_set = set(xbrl_accessions)
    print(f"Total 10-K/10-Q accession numbers: {len(total_acc)}")
    print(f"With XBRL: {len(total_acc & xbrl_set)}")
    print(f"Without XBRL: {len(total_acc - xbrl_set)}")

    # Main forms and their amendments
    main_forms = ['8-K', '10-K', '10-Q']
    form_pattern = '|'.join(f"^{form}(/A)?$" for form in main_forms)
    is_main_form = df['formType'].str.match(form_pattern, na=False)
    is_8k = df['formType'].str.startswith('8-K', na=False)

    # 8-Ks are always kept; 10-K/10-Q only when XBRL data exists
    main_mask = is_main_form & (is_8k | df['accessionNo'].isin(xbrl_accessions))
    main_docs = df[main_mask].copy()

    # Get valid accession numbers
    valid_accessions = main_docs['accessionNo'].unique()

    # Exhibits for valid accessions
    exhibit_pattern = r'^EX-(?:10\.|99\.)'
    exhibit_mask = (
        df['accessionNo'].isin(valid_accessions)
        & df['doc_type'].str.match(exhibit_pattern, na=False)
    )

    # Exhibit rows carry the same formType/accessionNo as their filing, so
    # they are normally already part of main_docs. Only add the ones that are
    # not, otherwise every exhibit row would appear twice in the result.
    exhibits = df[exhibit_mask & ~main_mask].copy()

    filtered_df = pd.concat([main_docs, exhibits])

    # Series.map (unlike DataFrame.apply(axis=1)) also returns a Series when
    # the frame is empty, so the assignment works when nothing matched.
    filtered_df['document_category'] = filtered_df['doc_type'].map(_categorize_doc_type)

    # Additional validation info
    print("\nXBRL Data Types found:")
    print(df.loc[has_xbrl_type, 'data_type'].value_counts())

    return filtered_df

if __name__ == "__main__":
    # Load the flattened filings from the CSV export
    df = pd.read_csv('sec_filings.csv')

    # Keep the primary forms together with their EX-10.x / EX-99.x exhibits
    filtered_df = filter_sec_filings(df)

    # Row count per document category
    print("\nDocument Categories Summary:")
    print(filtered_df['document_category'].value_counts())

    # Form-type distribution of the rows categorized as MAIN
    print("\nForm Types Summary:")
    print(filtered_df.loc[filtered_df['document_category'] == 'MAIN', 'formType'].value_counts())

    # Exhibit-type distribution of the EX-10.x / EX-99.x rows
    print("\nExhibit Types Summary:")
    exhibits = filtered_df.loc[filtered_df['document_category'].isin(['MATERIAL_CONTRACT', 'MISC_EXHIBIT'])]
    print(exhibits['doc_type'].value_counts())

In [None]:
df = pd.read_csv('your_sec_filings.csv')
filtered_df = filter_sec_filings(df)

In [None]:
# Example exhibit document URL (presumably an EX-10.28 exhibit, judging by the
# file name — verify). Stored as a string: a bare URL is a SyntaxError in a code cell.
example_exhibit_url = 'https://www.sec.gov/Archives/edgar/data/937098/000093709825000016/tnet-123124ex1028warren.htm'