In [None]:
import logging
from datetime import timedelta
import re
import xml.etree.ElementTree as ET
import json
from typing import Dict

from pandas import read_json
import requests
from bytewax import operators as op
from bytewax.connectors.files import FileSink
from bytewax.dataflow import Dataflow
from bytewax.inputs import SimplePollingSource
# from bytewax.connectors.kafka import operators as kop
# from bytewax.connectors.kafka import KafkaSinkMessage



In [None]:
class SECSource(SimplePollingSource):
    def next_item(self):
        # Base URL for SEC Edgar
        base_url = "https://www.sec.gov/cgi-bin/browse-edgar"

        # User agent header to mimic a browser (SEC requires this to allow access)
        headers = {
            'User-Agent': 'Bytewax, Inc. contact@bytewax.io',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Cache-Control':'no-cache',
            'Host': 'www.sec.gov'
        }

        # https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=0000070858&type=&company=&dateb=&owner=include&start=0&count=40&output=atom
        params = {
            'action':'getcurrent',
            'CIK': '',
            'type':'',
            'dateb':'',
            'owner':'include',
            'start':'0',
            'count':'200',
            'output':'atom'    # Number of results to return
        }

        # Making the GET request
        response = requests.get(base_url, headers=headers, params=params)
        
        if response.status_code == 200:
            logger.info(f"Successfully retrieved filings :: {response.text}")
            return response.text
        else:
            logger.info(f"Failed to retrieve filings. Status code: {response.status_code}")
            return None



In [None]:
flow = Dataflow("edgar_scraper")
filings_stream = op.input("in", flow, SECSource(timedelta(seconds=10)))

In [6]:
def parse_atom(xml_data):
    # Parse the XML data
    # Set up the namespace map
    namespace = {"atom": "http://www.w3.org/2005/Atom"}

    # Parse the XML
    root = ET.fromstring(xml_data)
    data = []
    # Iterate over each entry and extract the desired information
    for entry in root.findall("atom:entry", namespace):
        id = entry.find("atom:id", namespace).text.split("=")[-1].replace("-", "")
        title = entry.find("atom:title", namespace).text
        link = entry.find("atom:link[@type='text/html']", namespace).get("href")
        cik_match = re.search(r'\((\d+)\)', title)
        cik = cik_match.group(1) if cik_match else "No CIK found"
        form_type = entry.find('atom:category', namespace).attrib['term']

        data.append(
            ("All",
            {
                "id":id,
                "title":title,
                "link":link,
                "cik":cik,
                "form_type":form_type

            })
        )
    return data

In [7]:
processed_stream = op.flat_map("parse_atom", filings_stream, parse_atom)

In [8]:

op.output("output", processed_stream, FileSink('sec_out2.jsonl'))

In [7]:
import bytewax.operators as op
from bytewax.testing import TestingSource
from bytewax.dataflow import Dataflow

flow = Dataflow("flat_map_eg")

inp = ["hello world", "BYE CAPTAIN"]

s = op.input("inp", flow, TestingSource(inp))

_ = op.inspect("outinp", s)

def split_into_words(sentence: str):
    print(f"Processing sentence: {sentence}")
    split = sentence.split()
    print(f"Split into words: {split}")
    return sentence


spliter = op.flat_map("split_words", s, split_into_words)

_ = op.inspect("out", spliter)

from bytewax.testing import run_main

run_main(flow)
# {testoutput}
# flat_map_eg.out: 'hello'
# flat_map_eg.out: 'world'

flat_map_eg.outinp: 'hello world'
Processing sentence: hello world
Split into words: ['hello', 'world']
flat_map_eg.out: 'h'
flat_map_eg.out: 'e'
flat_map_eg.out: 'l'
flat_map_eg.out: 'l'
flat_map_eg.out: 'o'
flat_map_eg.out: ' '
flat_map_eg.out: 'w'
flat_map_eg.out: 'o'
flat_map_eg.out: 'r'
flat_map_eg.out: 'l'
flat_map_eg.out: 'd'
flat_map_eg.outinp: 'BYE CAPTAIN'
Processing sentence: BYE CAPTAIN
Split into words: ['BYE', 'CAPTAIN']
flat_map_eg.out: 'B'
flat_map_eg.out: 'Y'
flat_map_eg.out: 'E'
flat_map_eg.out: ' '
flat_map_eg.out: 'C'
flat_map_eg.out: 'A'
flat_map_eg.out: 'P'
flat_map_eg.out: 'T'
flat_map_eg.out: 'A'
flat_map_eg.out: 'I'
flat_map_eg.out: 'N'


In [None]:
import bytewax.operators as op
from bytewax.testing import TestingSource, run_main
from bytewax.dataflow import Dataflow

flow = Dataflow("stateful_map_eg")

inp = [
    "a",
    "a",
    "a",
    "b",
    "a",
]
s = op.input("inp", flow, TestingSource(inp))

s = op.key_on("self_as_key", s, lambda x: x + 'xx')

_ = op.inspect("hello", s)

def check(running_count, _item):
    # print(f"Processing item: {_item}")
    if running_count is None:
        running_count = 0
        # print("Initializing running count to 0")
    running_count += 1
    # print(f"Running increment: {running_count}")
    return (running_count, running_count)

s = op.stateful_map("running_count", s, check)
_ = op.inspect("out", s)

from bytewax.testing import run_main

run_main(flow)

stateful_map_eg.hello: ('axx', 'a')
stateful_map_eg.out: ('axx', 1)
stateful_map_eg.hello: ('axx', 'a')
stateful_map_eg.out: ('axx', 2)
stateful_map_eg.hello: ('axx', 'a')
stateful_map_eg.out: ('axx', 3)
stateful_map_eg.hello: ('bxx', 'b')
stateful_map_eg.out: ('bxx', 1)
stateful_map_eg.hello: ('axx', 'a')
stateful_map_eg.out: ('axx', 4)


In [43]:
from pandas import read_json
cik_to_ticker = read_json("../company_tickers.json", orient='index')

In [44]:
cik_to_ticker.head()

Unnamed: 0,cik_str,ticker,title
0,789019,MSFT,MICROSOFT CORP
1,320193,AAPL,Apple Inc.
2,1045810,NVDA,NVIDIA CORP
3,1652044,GOOGL,Alphabet Inc.
4,1018724,AMZN,AMAZON COM INC


In [45]:
cik_to_ticker.set_index(cik_to_ticker['cik_str'], inplace=True)

In [62]:
cik_to_ticker.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10379 entries, 789019 to 1921158
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   cik_str  10379 non-null  int64 
 1   ticker   10379 non-null  object
 2   title    10379 non-null  object
dtypes: int64(1), object(2)
memory usage: 582.4+ KB


In [56]:
cik_to_ticker.head()

Unnamed: 0_level_0,cik_str,ticker,title
cik_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
789019,789019,MSFT,MICROSOFT CORP
320193,320193,AAPL,Apple Inc.
1045810,1045810,NVDA,NVIDIA CORP
1652044,1652044,GOOGL,Alphabet Inc.
1018724,1018724,AMZN,AMAZON COM INC


In [51]:
cik = int('0001173313')
cik

1173313

In [53]:
ticker = cik_to_ticker['ticker'].loc[cik]
ticker

'ABVC'

In [57]:
if not isinstance(ticker, str):
    ticker.iloc[0]

In [61]:
ticker = cik_to_ticker['ticker'].loc[1173313]
ticker

'ABVC'

In [8]:
import json
import logging
import os
from typing import List, Optional, Union

from bytewax.inputs import DynamicInput, StatelessSource
from websocket import create_connection

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [9]:
class AlpacaNewsStreamClient:
    """
    Alpaca News Stream Client that uses a web socket to stream news data.

    References used to implement this class:
    * Alpaca Docs: https://alpaca.markets/docs/api-references/market-data-api/news-data/realtime/
    * Source of implementation inspiration: https://github.com/alpacahq/alpaca-py/blob/master/alpaca/common/websocket.py
    """

    NEWS_URL = "wss://stream.data.alpaca.markets/v1beta1/news"

    def __init__(self, api_key: str, api_secret: str, tickers: List[str]):
        """
        Initializes the AlpacaNewsStreamClient.

        Args:
            api_key (str): The Alpaca API key.
            api_secret (str): The Alpaca API secret.
            tickers (List[str]): A list of tickers to subscribe to.
        """

        self._api_key = api_key
        self._api_secret = api_secret
        self._tickers = tickers
        self._ws = None

    def start(self):
        """
        Starts the AlpacaNewsStreamClient.
        """

        self._connect()
        self._auth()

    def _connect(self):
        """
        Connects to the Alpaca News Stream.
        """

        self._ws = create_connection(self.NEWS_URL)

        msg = self.recv()

        if msg[0]["T"] != "success" or msg[0]["msg"] != "connected":
            raise ValueError("connected message not received")
        else:
            logger.info("[AlpacaNewsStream]: Connected to Alpaca News Stream.")

    def _auth(self):
        """
        Authenticates with the Alpaca News Stream.
        """

        self._ws.send(
            self._build_message(
                {
                    "action": "auth",
                    "key": self._api_key,
                    "secret": self._api_secret,
                }
            )
        )

        msg = self.recv()
        if msg[0]["T"] == "error":
            raise ValueError(msg[0].get("msg", "auth failed"))
        elif msg[0]["T"] != "success" or msg[0]["msg"] != "authenticated":
            raise ValueError("failed to authenticate")
        else:
            logger.info("[AlpacaNewsStream]: Authenticated with Alpaca News Stream.")

    def subscribe(self):
        """
        Subscribes to the Alpaca News Stream.
        """

        self._ws.send(
            self._build_message({"action": "subscribe", "news": self._tickers})
        )

        msg = self.recv()
        if msg[0]["T"] != "subscription":
            raise ValueError("failed to subscribe")
        else:
            logger.info("[AlpacaNewsStream]: Subscribed to Alpaca News Stream.")

    def ubsubscribe(self):
        """
        Unsubscribes from the Alpaca News Stream.
        """

        self._ws.send(
            self._build_message({"action": "unsubscribe", "news": self._tickers})
        )

        msg = self.recv()
        if msg[0]["T"] != "subscription":
            raise ValueError("failed to unsubscribe")
        else:
            logger.info("[AlpacaNewsStream]: Unsubscribed from Alpaca News Stream.")

    def _build_message(self, message: dict) -> str:
        """
        Builds a message to send to the Alpaca News Stream.

        Args:
            message (dict): The message to build.

        Returns:
            str: The built message.
        """

        return json.dumps(message)

    def recv(self) -> Union[dict, List[dict]]:
        """
        Receives a message from the Alpaca News Stream.

        Returns:
            Union[dict, List[dict]]: The received message.
        """

        if self._ws:
            message = self._ws.recv()
            logger.info(f"[AlpacaNewsStream]: Received message: {message}")
            message = json.loads(message)

            return message
        else:
            raise RuntimeError("Websocket not initialized. Call start() first.")

    def close(self) -> None:
        """
        Closes the Alpaca News Stream connection.
        """

        if self._ws:
            self._ws.close()
            self._ws = None


In [10]:
alpaca = AlpacaNewsStreamClient(api_key="PKM19APHZSD7EDUI20D6", api_secret="GifphcRRfVCyc4VTfaTBg9z4MZT5nP3rdZVgkq0x", tickers=['*'])

In [11]:
alpaca._connect()

INFO:__main__:[AlpacaNewsStream]: Received message: [{"T":"success","msg":"connected"}]
INFO:__main__:[AlpacaNewsStream]: Connected to Alpaca News Stream.


In [12]:
alpaca.start()
alpaca.subscribe()

INFO:__main__:[AlpacaNewsStream]: Received message: [{"T":"success","msg":"connected"}]
INFO:__main__:[AlpacaNewsStream]: Connected to Alpaca News Stream.
INFO:__main__:[AlpacaNewsStream]: Received message: [{"T":"success","msg":"authenticated"}]
INFO:__main__:[AlpacaNewsStream]: Authenticated with Alpaca News Stream.
INFO:__main__:[AlpacaNewsStream]: Received message: [{"T":"subscription","news":["*"]}]
INFO:__main__:[AlpacaNewsStream]: Subscribed to Alpaca News Stream.


In [None]:
alpaca.recv()

In [25]:
api_key="PKM19APHZSD7EDUI20D6"
api_secret="GifphcRRfVCyc4VTfaTBg9z4MZT5nP3rdZVgkq0x"

In [32]:
from typing import List, Optional
import datetime

class AlpacaNewsBatchClient:
    """
    Alpaca News API Client that uses a RESTful API to fetch news data.

    Attributes:
        NEWS_URL (str): The URL for the Alpaca News API.
        _from_datetime (datetime.datetime): The start datetime for the news data.
        _to_datetime (datetime.datetime): The end datetime for the news data.
        _api_key (str): The API key for the Alpaca News API.
        _api_secret (str): The API secret for the Alpaca News API.
        _tickers (List[str]): A list of tickers to filter the news data.
        _page_token (str): The page token for the next page of news data.
        _first_request (bool): A flag indicating whether this is the first request for news data.
    """

    NEWS_URL = "https://data.alpaca.markets/v1beta1/news"

    def __init__(
        self,
        from_datetime: datetime.datetime,
        to_datetime: datetime.datetime,
        api_key: str,
        api_secret: str,
        tickers: List[str],
    ):
        """
        Initializes a new instance of the AlpacaNewsBatchClient class.

        Args:
            from_datetime (datetime.datetime): The start datetime for the news data.
            to_datetime (datetime.datetime): The end datetime for the news data.
            api_key (str): The API key for the Alpaca News API.
            api_secret (str): The API secret for the Alpaca News API.
            tickers (List[str]): A list of tickers to filter the news data.
        """

        self._from_datetime = from_datetime
        self._to_datetime = to_datetime
        self._api_key = api_key
        self._api_secret = api_secret
        self._tickers = tickers

        self._page_token = None
        self._first_request = True

    @property
    def try_request(self) -> bool:
        """
        A property indicating whether a request should be attempted.

        Returns:
            bool: True if a request should be attempted, False otherwise.
        """

        return self._first_request or self._page_token is not None

    def list(self):
        """
        Convenience function to fetch a batch of news from Alpaca API

        Returns:
            List[Dict]: A list of news items.
        """

        if not self.try_request:
            return None

        self._first_request = False

        # prepare the request URL
        headers = {
            "Apca-Api-Key-Id": self._api_key,
            "Apca-Api-Secret-Key": self._api_secret,
        }

        # Look at all the parameters here: https://alpaca.markets/docs/api-references/market-data-api/news-data/historical/
        # or here: https://github.com/alpacahq/alpaca-py/blob/master/alpaca/data/requests.py#L357
        params = {
            "start": self._from_datetime.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "end": self._to_datetime.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "limit": 10,
            "include_content": True,
            "sort": "ASC",
        }
        if self._page_token is not None:
            params["page_token"] = self._page_token

        response = requests.get(self.NEWS_URL, headers=headers, params=params)

        # parse output
        next_page_token = None
        if response.status_code == 200:  # Check if the request was successful
            # parse response into json
            news_json = response.json()

            # extract next page token (if any)
            next_page_token = news_json.get("next_page_token", None)

        else:
            print("Request failed with status code:", response.status_code)
            return None

        self._page_token = next_page_token

        return news_json["news"]


In [33]:
batch = AlpacaNewsBatchClient(
    from_datetime=datetime.datetime.now() - datetime.timedelta(days=1),
    to_datetime=datetime.datetime.now(),
    api_key=api_key,
    api_secret=api_secret,
    tickers=["AAPL"],)

In [34]:
batch.try_request

True

In [35]:
news = batch.list()

In [36]:
len(news)

10

In [42]:
import json
print(json.dumps(news[4:6], indent=2))
# news[4:6]

[
  {
    "author": "Benzinga Newsdesk",
    "content": "",
    "created_at": "2024-11-15T14:54:32Z",
    "headline": "B of A Securities Upgrades Rocket Companies to Neutral, Maintains Price Target to $15",
    "id": 42010207,
    "images": [],
    "source": "benzinga",
    "summary": "",
    "symbols": [
      "RKT"
    ],
    "updated_at": "2024-11-15T14:54:33Z",
    "url": "https://www.benzinga.com/news/24/11/42010207/b-of-a-securities-upgrades-rocket-companies-to-neutral-maintains-price-target-to-15"
  },
  {
    "author": "Benzinga Newsdesk",
    "content": "",
    "created_at": "2024-11-15T14:54:55Z",
    "headline": "The Arena Group Shares Resume Trade Then Again Halt On Circuit Breaker To The Downside, Stock Now Up 215.8%",
    "id": 42010210,
    "images": [],
    "source": "benzinga",
    "summary": "",
    "symbols": [
      "AREN"
    ],
    "updated_at": "2024-11-15T14:54:56Z",
    "url": "https://www.benzinga.com/trading-ideas/24/11/42010210/the-arena-group-shares-resume-