In [73]:
## Load Libraries

import pandas as pd
import numpy as np
import os
import re
import pytz
import yfinance as yf

In [3]:
## Load Company Data

# Master company table.
corps = pd.read_csv("data/corps.csv")

# One company table per sector, read in a single pass over the sector files.
industrials, healthcare, finance, tech, consumer, energy = (
    pd.read_csv(sector_csv)
    for sector_csv in (
        "data/corps/industrial.csv",
        "data/corps/healthcare.csv",
        "data/corps/finance.csv",
        "data/corps/tech.csv",
        "data/corps/consumer.csv",
        "data/corps/energy.csv",
    )
)

In [85]:
## Load Feed Data

ticker = "BA"

# Build the feed path from `ticker` so the headlines always belong to the same
# symbol whose prices are downloaded from `ticker` later in the notebook.
feed_path = f"data/industrials_073124/news_feed/{ticker}.csv"
d1 = pd.read_csv(feed_path, index_col=0)

# Drop headlines whose raw "Published" string contains one of these dates.
# The filter runs on the text as stored in the CSV, i.e. before the timezone
# conversion below.
excluded_dates = ["2024-06-18", "2024-06-19"]
pattern = "|".join(excluded_dates)
d1 = d1[~d1["Published"].str.contains(pattern)].reset_index(drop=True)

# Express both timestamps as naive US/Eastern wall-clock times.
eastern = pytz.timezone('US/Eastern')
# "Published" must parse as timezone-aware (tz_convert raises on naive values).
d1["Published"] = pd.to_datetime(d1["Published"]).dt.tz_convert(eastern).dt.tz_localize(None)
# "Found" is parsed as naive and interpreted as UTC before converting.
d1["Found"] = pd.to_datetime(d1["Found"]).dt.tz_localize('UTC').dt.tz_convert(eastern).dt.tz_localize(None)
# Lag between publication and the moment the headline was found.
d1["Discovery_Time"] = d1["Found"] - d1["Published"]

# Order by found datetime
d1 = d1.sort_values("Found").reset_index(drop=True)

In [99]:
## Download Price Data

# Hourly bars for `ticker` over the trailing three months, with the timestamp
# index moved into a "Datetime" column and its timezone information dropped.
prices = (
    yf.Ticker(ticker)
    .history(period="3mo", interval="1h")
    .reset_index()
    .assign(Datetime=lambda bars: bars["Datetime"].dt.tz_localize(None))
)

In [100]:
# Show the last 25 rows of the hourly price table.
prices.tail(25)

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,Dividends,Stock Splits
413,2024-07-29 15:30:00,185.664993,185.979996,185.369995,185.509995,457624,0.0,0.0
414,2024-07-30 09:30:00,186.100006,187.469894,185.149994,186.315002,849971,0.0,0.0
415,2024-07-30 10:30:00,186.270004,187.699997,186.128098,186.550003,803616,0.0,0.0
416,2024-07-30 11:30:00,186.580002,187.380005,186.369995,187.369995,453122,0.0,0.0
417,2024-07-30 12:30:00,187.369995,187.660004,186.5,187.300003,576743,0.0,0.0
418,2024-07-30 13:30:00,187.300003,188.130005,186.880005,187.213898,516526,0.0,0.0
419,2024-07-30 14:30:00,187.214996,187.269897,185.950104,186.294998,588672,0.0,0.0
420,2024-07-30 15:30:00,186.294998,187.610001,185.970001,186.869995,601314,0.0,0.0
421,2024-07-31 09:30:00,189.880005,191.979996,185.300003,189.779999,3665069,0.0,0.0
422,2024-07-31 10:30:00,189.789993,191.729996,183.860001,190.600006,2834458,0.0,0.0


In [88]:
## Merge Feed and Price Data

# "Next_Hour" is the next trading hour after the news headline was found, not published. "Discovery_Time" will encode
# info on publication date.
#
# Five trade opportunities in one day: 11:30am ET, 12:30pm ET, 1:30pm ET, 2:30pm ET, and 3:30pm ET
# Opportunity one, at 11:30am ET, uses data from interval 9:30 - 10:30am ET, and then takes 10:30 - 11:30am ET to process
# and execute. So, most recent data (price and news) for 11:30am ET is 10:30am ET
# This is also helpful for the model, since dynamics overnight are different from dynamics during market hours. For
# instance, it isn't the same to use pre-9:30am price data to trade at 9:30am

# For each headline, locate the first price bar whose timestamp is at or after "Found".
# searchsorted assumes prices["Datetime"] is in ascending order.
bar_times = prices["Datetime"].values
indices = np.searchsorted(bar_times, d1["Found"].values)

# An index equal to len(bar_times) means no bar starts at or after the headline was found.
# Such rows get NaT rather than the last available bar, because that bar starts *before*
# the headline and would be mislabelled as its next trading hour.
has_next_bar = indices < len(bar_times)
next_hour = np.full(len(d1), np.datetime64("NaT"), dtype=bar_times.dtype)
next_hour[has_next_bar] = bar_times[indices[has_next_bar]]
d1["Next_Hour"] = next_hour

# Left merge keeps every headline; rows whose "Next_Hour" is NaT end up with NaN price columns.
d2 = pd.merge(d1, prices, left_on="Next_Hour", right_on="Datetime", how="left").drop(columns=["Published", "Datetime"])

d2.head(20)

Unnamed: 0,Heading,Subheading,Publisher,Found,Discovery_Time,Next_Hour,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,Families of Boeing MAX crash victims seek near...,The move comes a day after Boeing CEO Dave Cal...,www.voanews.com,2024-06-19 23:02:08,0 days 01:34:46,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
1,Boeing crash victims' families ask DOJ to fine...,Boeing CEO David Calhoun apologized to the rel...,www.fox10phoenix.com,2024-06-20 01:02:02,0 days 03:18:10,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
2,Opinion: Boeing is getting a hand from the FAA...,What does help Boeing is the plan by the Feder...,www.chicagobusiness.com,2024-06-20 07:02:08,0 days 01:26:12,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
3,Families Of Boeing Crash Victims Seek $25 Bill...,The move comes a day after Boeing CEO Dave Cal...,www.ndtv.com,2024-06-20 07:02:08,0 days 01:27:21,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
4,Boeing 737 Max crash victims ask US to impose ...,The families of victims in two Boeing 737 Max ...,www.bbc.com,2024-06-20 07:02:08,0 days 04:43:47,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
5,Boeing 737 Max crash' families to DoJ: fine fi...,"... Boeing Co. nearly $25 billion, saying the ...",fortune.com,2024-06-20 09:02:13,0 days 01:40:33,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
6,Boeing CEO apologizes to families for aircraft...,CEO David Calhoun appeared at a Senate hearing...,wisconsinexaminer.com,2024-06-20 09:02:13,0 days 01:46:17,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
7,Boeing CEO apologizes to families for aircraft...,Boeing CEO David Calhoun testified at a U.S. S...,idahocapitalsun.com,2024-06-20 09:02:13,0 days 08:52:47,2024-06-20 09:30:00,173.020004,175.160004,172.044998,175.035004,1624984,0.0,0.0
8,Engine fire on Boeing 737 forces emergency lan...,A Boeing 737 plane operated by Malaysia Airlin...,www.foxbusiness.com,2024-06-20 12:02:09,0 days 01:00:38,2024-06-20 12:30:00,174.062698,174.397095,173.309998,174.125,370415,0.0,0.0
9,Families of Boeing crash victims demand prosec...,Attorney for families says plane maker should ...,www.theguardian.com,2024-06-20 12:02:09,0 days 01:04:04,2024-06-20 12:30:00,174.062698,174.397095,173.309998,174.125,370415,0.0,0.0


In [None]:
## Thoughts

# Historical event detection through semantic clusterings of past headlines for company
    # Stock price in relation to past events -> indicator of significance
    # New updates on past events should be judged as positive or negative by GPT in relation to event information
        # Can even look for supplemental information on significant events
    # Relevance of new headlines to significant past events
    # Ex: Event in the past triggered negative stock price reaction. Negative development on the event (as judged by GPT) should result in
    # similar negative stock price reaction, and might be weighted differently than regular headlines
    # Coverage of event as number of headlines
    # Indicators of event as distribution of publishers (small first, then big)
# Publisher "relevance": number of times appearing across all stock headlines
# Publisher "impact": publishers with most impact on stock prices
# Publisher "actionability": the speed at which RSS feed picks up on publisher (useful for implementation)
# Type of headline coding through semantic clustering -> model feature
# 10K form assessment of risk factors -> generation of dependency RSS feeds
# Market cap
# Financial metrics on company
# Analyst ratings on company
# Macroeconomic events as monitored by RSS feeds
# Company sector (judged by semantic clustering of company descriptions and labeling)
# Significant movements in competitor stock prices
# Significant movements in other sector stock prices
# Significant movements in other indices
# Time series features like seasonality and autocorrelation
# Volatility of stock within the hour
# Volume of stock within the hour
# Other external data sources
# Starting time of interval (9:30 - 10:30am vs. 2:30 - 3:30pm will have different trading volumes / volatility)