In [None]:
!pip install sec-api
!pip install transformers
!pip install sentencepiece
!pip install beautifulsoup4

Collecting sec-api
  Downloading sec_api-1.0.30-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.5/64.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading sec_api-1.0.30-py3-none-any.whl (23 kB)
Installing collected packages: sec-api
Successfully installed sec-api-1.0.30


In [None]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sec_api import QueryApi
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Initialize SEC API.
# The key is read from the SEC_API_KEY environment variable so a real key never
# has to be saved inside the notebook; the placeholder is kept as a fallback so
# editing the string in place still works.
api_key = os.environ.get("SEC_API_KEY", "YOUR_SEC_API_KEY")
queryApi = QueryApi(api_key=api_key)


# Classification model used for the "risk" label.
# NOTE(review): loading "nlpaueb/sec-bert-base" through
# AutoModelForSequenceClassification creates a *newly initialized* classifier
# head (transformers warns about this at load time), so the LABEL_0/LABEL_1
# outputs and their scores are not meaningful until the model is fine-tuned on
# a labelled risk dataset or replaced by a fine-tuned checkpoint.
classification_model_name = "nlpaueb/sec-bert-base"
tokenizer = AutoTokenizer.from_pretrained(classification_model_name)
model = AutoModelForSequenceClassification.from_pretrained(classification_model_name)

# Summarization model used to build the textual explanation.
summary_model = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=summary_model)
risk_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


def extract_filing_content(url):
    """
    Downloads a filing page and returns its visible text.

    The text of all ``<p>`` elements is stripped and joined with single
    spaces. If the page yields no non-empty paragraph text, the text of the
    whole document is returned instead.

    Args:
        url: Address of the filing document to download.

    Returns:
        str: The extracted text, or the sentinel string
        ``"Failed to retrieve content."`` when the request raises a
        ``requests`` exception or the response status is not 200.
    """
    # No explicit "Host" header: requests derives it from the URL, so the
    # function also works for redirects and for hosts other than www.sec.gov.
    # NOTE(review): the User-Agent carries a hard-coded personal contact;
    # consider moving it to configuration.
    headers = {
        "User-Agent": "Kaushik (kaushiks1223@gmail.com)",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    try:
        response = requests.get(url, timeout=10, headers=headers)
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving {url}: {e}")
        return "Failed to retrieve content."

    if response.status_code != 200:
        # Report the failure the same way the exception branch does instead of
        # failing silently.
        print(f"Error retrieving {url}: HTTP {response.status_code}")
        return "Failed to retrieve content."

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract and clean all paragraph text from the document. Empty paragraphs
    # are dropped so that a page made only of blank <p> tags does not produce a
    # whitespace-only (but truthy) string that would bypass the fallback below.
    paragraph_texts = (p.get_text().strip() for p in soup.find_all("p"))
    content = " ".join(text for text in paragraph_texts if text)

    # If no paragraph text is found, fall back to all text
    if not content:
        content = soup.get_text().strip()

    return content


def assess_risk_with_explanation(content):
    """
    Classifies a filing's text and builds an explanation by summarizing it.

    The first 1000 characters are passed to the module-level
    ``risk_classifier`` pipeline. The full text is split into 2000-character
    chunks, each non-blank chunk is summarized with the module-level
    ``summarizer`` pipeline, and the summaries are joined with spaces.

    Args:
        content: Filing text, e.g. the value returned by
            ``extract_filing_content``.

    Returns:
        tuple: ``(risk_label, risk_score, explanation)``. When ``content`` is
        not a string, is empty/whitespace-only, or is the
        ``"Failed to retrieve content."`` sentinel, no model is run and
        ``("N/A", nan, "No filing content available for analysis.")`` is
        returned.
    """
    # Do not classify or summarize a missing document or the download-failure
    # sentinel: the result would look like a real assessment of the filing.
    text = content.strip() if isinstance(content, str) else ""
    if not text or text == "Failed to retrieve content.":
        return "N/A", float("nan"), "No filing content available for analysis."

    # Limit content length for SEC-BERT
    content_chunk = content[:1000]

    # Get risk classification result. A character cut does not bound the token
    # count, so also truncate at the tokenizer level.
    # NOTE(review): 512 assumes the standard BERT-base position limit — confirm
    # against the model config.
    risk_result = risk_classifier(content_chunk, truncation=True, max_length=512)
    risk_label = risk_result[0]['label']
    risk_score = risk_result[0]['score']

    # Generate explanation for risk using summarizer
    summary_chunks = [content[i:i+2000] for i in range(0, len(content), 2000)]
    summaries = [
        # truncation=True keeps token-dense chunks within the summarizer's
        # input limit instead of failing inside the model.
        summarizer(chunk, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in summary_chunks
        if chunk.strip()  # nothing to summarize in a blank chunk
    ]
    explanation = " ".join(summaries)

    return risk_label, risk_score, explanation


def extract_and_summarize_reports(ticker, start_date, end_date):
    """
    Fetches recent 10-K, 10-Q and 8-K filings for a ticker, assesses each one,
    and saves the results to ``risk_analysis_with_explanations.csv``.

    Up to 5 filings filed between ``start_date`` and ``end_date`` (most recent
    first) are requested through the module-level ``queryApi``. Each filing's
    text is downloaded with ``extract_filing_content`` and processed with
    ``assess_risk_with_explanation``. The resulting table is written to CSV and
    printed.

    Args:
        ticker: Ticker symbol to search for, e.g. ``"TSLA"``.
        start_date: Earliest filing date, as a ``YYYY-MM-DD`` string.
        end_date: Latest filing date, as a ``YYYY-MM-DD`` string.

    Returns:
        None. If the query returns no filings, a message is printed and no CSV
        file is written.
    """

    # The date window comes from the arguments; previously it was hard-coded
    # to the last two years and start_date/end_date were silently ignored.
    query_string = (
        f"ticker:{ticker} AND "
        "(formType:\"10-K\" OR formType:\"10-Q\" OR formType:\"8-K\") AND "
        f"filedAt:[{start_date} TO {end_date}]"
    )
    query = {
        "query": {
            "query_string": {
                "query": query_string
            }
        },
        "from": "0",  # Pagination start
        "size": "5",  # Limit to 5 filings
        "sort": [{"filedAt": {"order": "desc"}}],  # Sort by most recent filings
    }

    response = queryApi.get_filings(query)

    filings = (response or {}).get("filings") or []
    if not filings:
        # An empty DataFrame has no 'filing_url' column, so stop here instead
        # of failing with a KeyError further down.
        print(f"No filings found for {ticker} between {start_date} and {end_date}.")
        return

    filings_data = [
        {
            "company": filing.get("companyName", "N/A"),
            "ticker": filing.get("ticker", "N/A"),
            "form_type": filing.get("formType", "N/A"),
            "filed_date": filing.get("filedAt", "N/A"),
            "filing_url": filing.get("linkToFilingDetails", "N/A"),
        }
        for filing in filings
    ]

    # Create DataFrame for easy viewing
    df = pd.DataFrame(filings_data)
    df['filing_content'] = df['filing_url'].apply(extract_filing_content)
    # Each assessment is a 3-tuple; wrapping it in a Series expands it into
    # three columns that are assigned positionally.
    df[['risk_label', 'risk_score', 'explanation']] = df['filing_content'].apply(
        lambda x: pd.Series(assess_risk_with_explanation(x))
    )

    df[['company', 'form_type', 'filed_date', 'risk_label', 'risk_score', 'explanation', 'filing_url']].to_csv(
        "risk_analysis_with_explanations.csv", index=False
    )
    print("Risk analysis with explanations saved to risk_analysis_with_explanations.csv")
    print(df)

# Run the report pipeline for Tesla; the two dates are the window arguments
# handed to extract_and_summarize_reports.
start_date, end_date = "2023-01-01", "2024-12-31"
extract_and_summarize_reports("TSLA", start_date, end_date)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/221k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/sec-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


Risk analysis with explanations saved to risk_analysis_with_explanations.csv
       company ticker form_type                 filed_date  \
0  Tesla, Inc.   TSLA      10-K  2025-01-29T20:42:33-05:00   
1  Tesla, Inc.   TSLA       8-K  2025-01-29T16:09:13-05:00   
2  Tesla, Inc.   TSLA       8-K  2025-01-02T09:06:39-05:00   
3  Tesla, Inc.   TSLA      10-Q  2024-10-23T20:42:47-04:00   
4  Tesla, Inc.   TSLA       8-K  2024-10-23T16:09:26-04:00   

                                          filing_url  \
0  https://www.sec.gov/Archives/edgar/data/131860...   
1  https://www.sec.gov/Archives/edgar/data/131860...   
2  https://www.sec.gov/Archives/edgar/data/131860...   
3  https://www.sec.gov/Archives/edgar/data/131860...   
4  https://www.sec.gov/Archives/edgar/data/131860...   

                                      filing_content risk_label  risk_score  \
0  tsla-20241231false00013186052024FYhttp://fasb....    LABEL_0    0.549124   
1  tsla-20250129FALSE000131860500013186052025-01-...   