In [1]:
## Load Libraries

import pandas as pd
import numpy as np
import os
import re
import pytz
import yfinance as yf

In [3]:
## Load Company Data

corps = pd.read_csv("data/corps.csv")

industrials = pd.read_csv("data/corps/industrials.csv")
healthcare = pd.read_csv("data/corps/healthcare.csv")
finance = pd.read_csv("data/corps/finance.csv")
tech = pd.read_csv("data/corps/tech.csv")
consumer = pd.read_csv("data/corps/consumer.csv")
energy = pd.read_csv("data/corps/energy.csv")

In [146]:
## Load and Clean News Data

dfs = []

for ticker in industrials["Ticker"]:
    file_path = os.path.join("data/industrials_073124/news_feed", f"{ticker}.csv")
    if os.path.exists(file_path):
        df = pd.read_csv(file_path, index_col=0)
        df["Ticker"] = ticker
        dfs.append(df)

news1 = pd.concat(dfs, ignore_index=True)
del dfs

pattern = f"{'2024-06-18'}|{'2024-06-19'}"
news1 = news1[~news1["Found"].str.contains(pattern)].reset_index(drop=True)

# Some datetimes will be 2024-06-19 now because of timezone conversion
eastern = pytz.timezone('US/Eastern')
news1["Published"] = pd.to_datetime(news1["Published"]).dt.tz_convert(eastern).dt.tz_localize(None)
news1["Found"] = pd.to_datetime(news1["Found"]).dt.tz_localize('UTC').dt.tz_convert(eastern).dt.tz_localize(None)
news1["Recency"] = news1["Found"] - news1["Published"]

# Order by found datetime
news1 = news1.sort_values("Found").reset_index(drop=True)
news1["Headline"] = "Heading: " + news1["Heading"] + "; Subheading: " + news1["Subheading"]

news2 = pd.merge(news1[["Headline", "Publisher", "Found", "Recency", "Ticker"]],
         corps[["Ticker", "NameCln", "Sector", "Cap"]],
         on="Ticker", how="left").rename(columns={"NameCln": "Company"})
news2["Publisher"] = [publisher.replace("www.", "")
                      if publisher.startswith("www.") else publisher
                      for publisher in news2["Publisher"]]

In [None]:
## Download Price Data

# for ticker in industrials["Ticker"]:
#     try:
#         prices = yf.Ticker(ticker).history(period="3mo", interval="1h").reset_index()
#         prices['Datetime'] = prices['Datetime'].dt.tz_localize(None)
#         file_path = os.path.join("data/industrials_073124/prices", f"{ticker}.csv")
#         prices.to_csv(file_path, index=False)
#         print(f"Price data saved for {ticker}")
#     except Exception as e:
#         print(f"Failed to download price data for {ticker}: {str(e)}")

# Takes about 3 min

In [97]:
## Load Price Data

dfs = []

for ticker in industrials["Ticker"]:
    file_path = os.path.join("data/industrials_073124/prices", f"{ticker}.csv")
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        df["Ticker"] = ticker
        dfs.append(df)

prices1 = pd.concat(dfs, ignore_index=True)
prices1["Datetime"] = pd.to_datetime(prices1["Datetime"]).dt.tz_localize(None)
del dfs

In [147]:
## Merge News and Price Data

price_times = pd.DataFrame({"Datetime": prices1["Datetime"].drop_duplicates().sort_values().reset_index(drop=True)})

last_indices = np.searchsorted(price_times["Datetime"].values, news2["Found"].values) - 1
last_indices[last_indices < 0] = 0
news2["Last_Hour"] = price_times.iloc[last_indices]["Datetime"].values

next_indices = np.searchsorted(price_times["Datetime"].values, news2["Found"].values)
next_indices[next_indices == len(price_times["Datetime"])] = len(price_times["Datetime"]) - 1
news2["Next_Hour"] = price_times.iloc[next_indices]["Datetime"].values

# intervals1 = pd.merge(news2, prices1,
#               left_on="Next_Hour",
#               right_on="Datetime",
#               how="left") \
#               .drop(columns="Capital_Gains")

# "Next_Hour" is the next trading hour after the news headline was found, not published. "Recency" will encode info on
# publication date
# Most recent price data is from the interval "Last_Hour" (ex: 10:30) to "Next_Hour" (ex: 11:30)
# Close price for "Last Hour" will be at 11:30
# Model will lock in data at 11:30, process from 11:30 to 12:30, and should predict 12:30 opening price
# All news headlines will have to be found prior to the cutoff of 11:30
# Thus, "Last_Hour" is t, "Next_Hour" is t+1, and the model will predict opening price of t+2
# For overnight intervals (15:30 -> 9:30), t is 15:30, t+1 (cutoff) is 9:30, and t+2 is 10:30
    # Because of overnight intervals, t close price should actually be replaced with t+1 open price

In [157]:
prices1["Open+1"] = prices1.groupby("Ticker")["Open"].shift(-1)
prices1.head(50)

# Use "Open+1" prices as close price
# Predict ... delta between "Open+2" price and "Close+2" ???
    # This would essentially show price trajectory
    # Then, could sell when predicted delta hits a certain inflexion point

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Capital Gains,Open+1
0,2024-05-03 09:30:00,418.079987,421.890015,418.079987,419.540009,644239,0.0,0.0,LIN,,419.584991
1,2024-05-03 10:30:00,419.584991,422.630005,418.73999,422.279999,350928,0.0,0.0,LIN,,422.345001
2,2024-05-03 11:30:00,422.345001,424.5,422.345001,423.725006,209216,0.0,0.0,LIN,,423.792694
3,2024-05-03 12:30:00,423.792694,424.269989,423.079987,424.01001,150140,0.0,0.0,LIN,,423.959991
4,2024-05-03 13:30:00,423.959991,425.024994,423.623596,424.75,187441,0.0,0.0,LIN,,424.75
5,2024-05-03 14:30:00,424.75,425.769989,424.309998,425.517212,258939,0.0,0.0,LIN,,425.359985
6,2024-05-03 15:30:00,425.359985,425.399994,423.470001,423.820007,389861,0.0,0.0,LIN,,426.98999
7,2024-05-06 09:30:00,426.98999,428.579987,424.339996,425.799988,250827,0.0,0.0,LIN,,426.0
8,2024-05-06 10:30:00,426.0,426.290009,423.450012,424.149994,186047,0.0,0.0,LIN,,424.23999
9,2024-05-06 11:30:00,424.23999,425.015015,423.714996,424.994995,113332,0.0,0.0,LIN,,424.994995


In [150]:
news2

Unnamed: 0,Headline,Publisher,Found,Recency,Ticker,Company,Sector,Cap,Last_Hour,Next_Hour
0,Heading: Industrial Food And Beverage Filtrati...,kilgorenewsherald.com,2024-06-19 20:00:02,0 days 00:32:56,MMM,3M,Industrials,54606.0,2024-06-18 15:30:00,2024-06-20 09:30:00
1,Heading: Town files lawsuit to fight PFAS cont...,wickenburgsun.com,2024-06-19 20:00:02,0 days 04:47:45,MMM,3M,Industrials,54606.0,2024-06-18 15:30:00,2024-06-20 09:30:00
2,Heading: Los Angeles Capital Management LLC Lo...,marketbeat.com,2024-06-19 20:00:02,0 days 06:43:01,MMM,3M,Industrials,54606.0,2024-06-18 15:30:00,2024-06-20 09:30:00
3,Heading: Quad Cities International Airport log...,wqad.com,2024-06-19 20:00:47,0 days 01:26:42,ALGT,Allegiant Travel,Industrials,893.0,2024-06-18 15:30:00,2024-06-20 09:30:00
4,Heading: 3 things to do this weekend | Enterta...,apg-wi.com,2024-06-19 20:01:26,0 days 01:44:57,ASH,Ashland,Basic Materials,5018.0,2024-06-18 15:30:00,2024-06-20 09:30:00
...,...,...,...,...,...,...,...,...,...,...
202216,"Heading: Life Time debuts gym, pools, pickleba...",communityimpact.com,2024-08-03 12:15:10,0 days 03:11:55,WLK,Westlake,Basic Materials,20437.0,2024-08-02 15:30:00,2024-08-02 15:30:00
202217,Heading: I took a pay cut and travelled 4800 m...,manchestereveningnews.co.uk,2024-08-03 12:15:22,0 days 02:40:54,WWD,Woodward,Industrials,11282.0,2024-08-02 15:30:00,2024-08-02 15:30:00
202218,Heading: WW Grainger (GWW) Receives a Rating U...,markets.businessinsider.com,2024-08-03 12:15:27,0 days 23:51:52,GWW,WW Grainger,Industrials,46081.0,2024-08-02 15:30:00,2024-08-02 15:30:00
202219,Heading: WW Grainger (GWW) Gets a Hold from RB...,markets.businessinsider.com,2024-08-03 12:15:27,0 days 23:44:42,GWW,WW Grainger,Industrials,46081.0,2024-08-02 15:30:00,2024-08-02 15:30:00


In [128]:
pd.merge(prices1, news2,
              left_on=["Datetime", "Ticker"],
              right_on=["Next_Hour", "Ticker"],
              how="left") \
                .drop(columns=["Capital Gains"])

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Headline,Publisher,Found,Recency,Company,Sector,Cap,Next_Hour
0,2024-05-03 09:30:00,418.079987,421.890015,418.079987,419.540009,644239,0.0,0.0,LIN,,,NaT,NaT,,,,NaT
1,2024-05-03 10:30:00,419.584991,422.630005,418.739990,422.279999,350928,0.0,0.0,LIN,,,NaT,NaT,,,,NaT
2,2024-05-03 11:30:00,422.345001,424.500000,422.345001,423.725006,209216,0.0,0.0,LIN,,,NaT,NaT,,,,NaT
3,2024-05-03 12:30:00,423.792694,424.269989,423.079987,424.010010,150140,0.0,0.0,LIN,,,NaT,NaT,,,,NaT
4,2024-05-03 13:30:00,423.959991,425.024994,423.623596,424.750000,187441,0.0,0.0,LIN,,,NaT,NaT,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346692,2024-08-01 10:30:00,3.350000,3.360000,3.350000,3.360000,998,0.0,0.0,SIF,,,NaT,NaT,,,,NaT
346693,2024-08-01 11:30:00,3.360000,3.360000,3.300000,3.300000,1119,0.0,0.0,SIF,,,NaT,NaT,,,,NaT
346694,2024-08-01 14:30:00,3.300000,3.300000,3.300000,3.300000,0,0.0,0.0,SIF,,,NaT,NaT,,,,NaT
346695,2024-08-02 09:30:00,3.290000,3.290000,3.290000,3.290000,0,0.0,0.0,SIF,,,NaT,NaT,,,,NaT


In [129]:
news2

Unnamed: 0,Headline,Publisher,Found,Recency,Ticker,Company,Sector,Cap,Next_Hour
0,Heading: Industrial Food And Beverage Filtrati...,kilgorenewsherald.com,2024-06-19 20:00:02,0 days 00:32:56,MMM,3M,Industrials,54606.0,2024-06-20 09:30:00
1,Heading: Town files lawsuit to fight PFAS cont...,wickenburgsun.com,2024-06-19 20:00:02,0 days 04:47:45,MMM,3M,Industrials,54606.0,2024-06-20 09:30:00
2,Heading: Los Angeles Capital Management LLC Lo...,marketbeat.com,2024-06-19 20:00:02,0 days 06:43:01,MMM,3M,Industrials,54606.0,2024-06-20 09:30:00
3,Heading: Quad Cities International Airport log...,wqad.com,2024-06-19 20:00:47,0 days 01:26:42,ALGT,Allegiant Travel,Industrials,893.0,2024-06-20 09:30:00
4,Heading: Oregon getaway: Shakespeare and foodi...,sbsun.com,2024-06-19 20:01:26,0 days 04:42:13,ASH,Ashland,Basic Materials,5018.0,2024-06-20 09:30:00
...,...,...,...,...,...,...,...,...,...
183086,Heading: Camp Woodward Week 3 The Great Outdoo...,columbusjewishnews.com,2024-07-31 11:15:15,0 days 04:26:02,WWD,Woodward,Industrials,11282.0,2024-07-31 11:30:00
183087,Heading: Fourth Party Logistics Market Analysi...,openpr.com,2024-07-31 11:15:21,0 days 02:00:54,XPO,XPO,Industrials,12711.0,2024-07-31 11:30:00
183088,Heading: Customers will accept 'double-whammy'...,theloadstar.com,2024-07-31 11:15:21,0 days 03:48:14,XPO,XPO,Industrials,12711.0,2024-07-31 11:30:00
183089,Heading: Top 10: Logistics Brands - Supply Cha...,supplychaindigital.com,2024-07-31 11:15:21,0 days 01:54:58,XPO,XPO,Industrials,12711.0,2024-07-31 11:30:00


In [130]:
prices1

Unnamed: 0,Datetime,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Capital Gains
0,2024-05-03 09:30:00,418.079987,421.890015,418.079987,419.540009,644239,0.0,0.0,LIN,
1,2024-05-03 10:30:00,419.584991,422.630005,418.739990,422.279999,350928,0.0,0.0,LIN,
2,2024-05-03 11:30:00,422.345001,424.500000,422.345001,423.725006,209216,0.0,0.0,LIN,
3,2024-05-03 12:30:00,423.792694,424.269989,423.079987,424.010010,150140,0.0,0.0,LIN,
4,2024-05-03 13:30:00,423.959991,425.024994,423.623596,424.750000,187441,0.0,0.0,LIN,
...,...,...,...,...,...,...,...,...,...,...
194027,2024-08-01 10:30:00,3.350000,3.360000,3.350000,3.360000,998,0.0,0.0,SIF,
194028,2024-08-01 11:30:00,3.360000,3.360000,3.300000,3.300000,1119,0.0,0.0,SIF,
194029,2024-08-01 14:30:00,3.300000,3.300000,3.300000,3.300000,0,0.0,0.0,SIF,
194030,2024-08-02 09:30:00,3.290000,3.290000,3.290000,3.290000,0,0.0,0.0,SIF,


In [None]:
## Thoughts

# How to 


# Historical event detection through semantic clusterings of past headlines for company
    # Stock price in relation to past events -> indicator of significance
    # New updates on past events should be judged as positive or negative by GPT in relation to event information
        # Can even look for supplemental information on significant events
    # Relevance of new headlines to significant past events
    # Ex: Event in the past triggered negative stock price reaction. Negative development on the event (as judged by GPT) should result in
    # similar negative stock price reaction, and might be weighted differently than regular headlines
    # Coverage of event as number of headlines
    # Indicators of event as distribution of publishers (small first, then big)
# Publisher "relevance": number of times appearing across all stock headlines
# Publisher "impact": publishers with most impact on stock prices
# Publisher "actionability": the speed at which RSS feed picks up on publisher (useful for implementation)
# Type of headline coding through semantic clustering -> model feature
# 10K form assessment of risk factors -> generation of dependency RSS feeds
# Market cap
# Financial metrics on company
# Analyst ratings on company
# Macroeconomic events as monitored by RSS feeds
# Company sector (judged by semantic clustering of company descriptions and labeling)
# Significant movements in competitor stock prices
# Significant movements in other sector stock prices
# Singificant movements in other indices
# Time series features like seasonality and autocorrelation
# Volatility of stock within the hour
# Volume of stock within the hour
# Other external data sources
# Starting time of interval (9:30 - 10:30am vs. 2:30 - 3:30pm will have different trading volumes / volatility)