In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session: reuse the active one if it exists,
# otherwise create it under this application name.
spark = (
    SparkSession.builder
    .appName("trading_sentiment_platform")
    .getOrCreate()
)


In [0]:
# SEC files modeling schema
from pyspark.sql.types import StructType, StructField, StringType, DateType

# One row per filing. Every column is nullable; all are strings except
# filing_date, which is stored as a date.
raw_filings_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("cik", StringType()),
        ("company_name", StringType()),
        ("form_type", StringType()),
        ("filing_date", DateType()),
        ("accessionNumber", StringType()),
        ("filing_url", StringType()),
    )
])

In [0]:
# Top 10 selected companies for analysis

# Each entry carries the display name, the ticker symbol and the CIK
# written as a zero-padded 10-character string.
companies = [
    dict(company_name=name, ticker=ticker, cik=cik)
    for name, ticker, cik in (
        ("Apple Inc.", "AAPL", "0000320193"),
        ("Microsoft Corporation", "MSFT", "0000789019"),
        ("Alphabet Inc.", "GOOGL", "0001652044"),
        ("Amazon.com, Inc.", "AMZN", "0001018724"),
        ("Tesla, Inc.", "TSLA", "0001318605"),
        ("NVIDIA Corporation", "NVDA", "0001045810"),
        ("Meta Platforms, Inc.", "META", "0001326801"),
        ("JPMorgan Chase & Co.", "JPM", "0000019617"),
        ("Johnson & Johnson", "JNJ", "0000200406"),
        ("Exxon Mobil Corporation", "XOM", "0000034088"),
    )
]

import pandas as pd
# Build the Spark DataFrame through an intermediate pandas frame
# (passing the list of dicts straight to spark.createDataFrame was left disabled).
selected_companies_df = spark.createDataFrame(
    pd.DataFrame(companies)
)
display(selected_companies_df)

In [0]:
from pyspark.sql import functions as F
import requests

# Exploratory check: fetch one company's submissions document and list the
# field names available under filings -> recent.
headers = {"User-Agent": "joel doh joeljuniordoh19@gmail.com"}
sample_url = "https://data.sec.gov/submissions/CIK0000320193.json"

sample_res = requests.get(sample_url, headers=headers).json()

for key in sample_res["filings"]["recent"]:
    print(key)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime, timedelta
import requests

# Collect metadata for every recent 10-K / 10-Q filing of the selected
# companies from the SEC submissions API.
#
# Produces `records`: a list of dicts whose keys match raw_filings_schema
# (cik, company_name, form_type, filing_date, accessionNumber, filing_url).

# SEC requires automated clients to identify themselves in the User-Agent.
headers = {"User-Agent": "joel doh joeljuniordoh19@gmail.com"}

# Computed once so every filing is compared against the same cutoff
# (approximate five-year window: leap days are ignored).
five_years_ago = datetime.now() - timedelta(days=5 * 365)

# The submissions endpoint expects the CIK zero-padded to 10 digits.
cik_list = [row["cik"].zfill(10) for row in selected_companies_df.select("cik").collect()]

records = []
for num in cik_list:
    url = f"https://data.sec.gov/submissions/CIK{num}.json"
    print(url)
    # A timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        # Best effort: report the failure and move on to the next company.
        print(f"Request failed: {response.status_code}")
        continue

    submission = response.json()
    recent = submission["filings"]["recent"]
    # "recent" holds parallel arrays: position i in each list describes filing i.
    filings = zip(
        recent["form"],
        recent["filingDate"],
        recent["accessionNumber"],
        recent["primaryDocument"],
    )
    for form, date, acc, doc in filings:
        if form not in ("10-K", "10-Q"):
            continue
        date_obj = datetime.strptime(date, "%Y-%m-%d")
        if date_obj < five_years_ago:
            continue

        # Archive paths use the CIK without leading zeros and the accession
        # number without dashes.
        file_url = f"https://www.sec.gov/Archives/edgar/data/{int(submission['cik'])}/{acc.replace('-', '')}/{doc}"
        records.append({
            # Cast so the value always satisfies the StringType schema column.
            "cik": str(submission["cik"]),
            "company_name": submission["name"],
            "form_type": form,
            "filing_date": date_obj.date(),
            "accessionNumber": acc,
            "filing_url": file_url
        })

# Spot-check one collected record; guard against runs that matched fewer than two.
if len(records) > 1:
    print(records[1]["accessionNumber"])
else:
    print(f"Only {len(records)} filing record(s) collected")


In [0]:
# Turn the collected filing records into a Spark DataFrame using the explicit
# schema, then preview it. (The preview previously referenced an undefined
# name, `raw_filing`, which raised NameError.)
df = spark.createDataFrame(records, raw_filings_schema)
display(df)

In [0]:
# Persist the filings as a Delta table, replacing whatever the table held before.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sec_filings.raw_filing_doc")
)