In [1]:
## Load Libraries

import pandas as pd
import numpy as np
import os
import re
import pytz
import yfinance as yf

# Allow up to 50 columns in rendered DataFrame output.
pd.options.display.max_columns = 50

In [2]:
## Load Company Data

# Table read from the top-level corps file.
corps = pd.read_csv("data/corps.csv")

# One table per file under data/corps/, read in the order the names are listed.
industrials, healthcare, finance, tech, consumer, energy = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/corps/industrials.csv",
        "data/corps/healthcare.csv",
        "data/corps/finance.csv",
        "data/corps/tech.csv",
        "data/corps/consumer.csv",
        "data/corps/energy.csv",
    )
]

In [3]:
## Load Cleaned Data

# Price and news tables from the data/industrials_073124 directory.
prices, news = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/industrials_073124/prices.csv",
        "data/industrials_073124/news.csv",
    )
]

### **News Features**

In [6]:
# Show the news frame as loaded; the rendered preview elides the middle rows.
news

Unnamed: 0.1,Unnamed: 0,Time,Ticker,Headline,Publisher,Found,Recency
0,0,2024-06-20 09:30:00,MMM,Heading: Industrial Food And Beverage Filtrati...,kilgorenewsherald.com,2024-06-19 20:00:02,0 days 00:32:56
1,1,2024-06-20 09:30:00,MMM,Heading: Town files lawsuit to fight PFAS cont...,wickenburgsun.com,2024-06-19 20:00:02,0 days 04:47:45
2,2,2024-06-20 09:30:00,MMM,Heading: Los Angeles Capital Management LLC Lo...,marketbeat.com,2024-06-19 20:00:02,0 days 06:43:01
3,3,2024-06-20 09:30:00,ALGT,Heading: Quad Cities International Airport log...,wqad.com,2024-06-19 20:00:47,0 days 01:26:42
4,4,2024-06-20 09:30:00,ASH,Heading: 3 things to do this weekend | Enterta...,apg-wi.com,2024-06-19 20:01:26,0 days 01:44:57
...,...,...,...,...,...,...,...
185403,185407,2024-07-31 15:30:00,WWD,"Heading: 1 hurt, 2 arrested in Wrightstown dis...",whby.com,2024-07-31 15:15:25,0 days 01:08:31
185404,185408,2024-07-31 15:30:00,WWD,Heading: Harry Potter: Quidditch Champions sys...,readwrite.com,2024-07-31 15:15:25,0 days 04:32:14
185405,185409,2024-07-31 15:30:00,XYL,Heading: George Oliver to retire as Johnson Co...,msn.com,2024-07-31 15:15:32,0 days 06:58:55
185406,185410,2024-07-31 15:30:00,XYL,Heading: Xylem (XYL) Receives a Rating Update ...,markets.businessinsider.com,2024-07-31 15:15:32,0 days 03:36:15


### **Create Modeling Table**

In [4]:
## Create Modeling Table

# Columns contributed by the news table.
news_cols = ["Headline", "Publisher", "Found", "Recency"]

# Collapse news to one row per (Time, Ticker), gathering each field into a list.
news1 = (
    news.groupby(["Time", "Ticker"])
    .agg({col: list for col in news_cols})
    .reset_index()
)

# Left join keeps every price row, including those with no matching news.
modeling = pd.merge(prices, news1, on=["Time", "Ticker"], how="left")

# "" is the no-news placeholder, applied to the news columns only: a frame-wide
# fillna("") would also replace any missing numeric price feature with a string
# and turn that column into object dtype.
modeling[news_cols] = modeling[news_cols].fillna("")

# len("") == 0, so rows without news get a count of zero.
modeling["Headlines_Cnt"] = modeling["Headline"].map(len)

# The slices below assume Time renders as "YYYY-MM-DD HH:MM:SS"
# (the layout shown in the news preview) -- TODO confirm for prices.
time_str = modeling["Time"].astype(str)

# ID = "<Ticker>-MM-DD-HH:MM": drop the "YYYY-" prefix and the ":SS" suffix.
modeling["ID"] = (
    modeling["Ticker"]
    + "-"
    + time_str.str.replace(" ", "-", regex=False).str[5:-3]
)
# Hour = "HH:MM".
modeling["Hour"] = time_str.str[11:-3]

# Explicit copy so later edits to modeling1 do not act on a slice of `modeling`.
modeling1 = modeling[["ID", "Delta+1", "Hour", "Company", "Sector", "Cap",
                      "Open-1", "High-1", "Low-1", "Delta-1", "Open_Diff", "Volume-1",
                      "Dividends-1", "Stock_Splits-1",
                      "Headline", "Publisher", "Found", "Recency",
                      "Headlines_Cnt"]].copy()

modeling1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125608 entries, 0 to 125607
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              125608 non-null  object 
 1   Delta+1         125608 non-null  float64
 2   Hour            125608 non-null  object 
 3   Company         125608 non-null  object 
 4   Sector          125608 non-null  object 
 5   Cap             125608 non-null  float64
 6   Open-1          125608 non-null  float64
 7   High-1          125608 non-null  float64
 8   Low-1           125608 non-null  float64
 9   Delta-1         125608 non-null  float64
 10  Open_Diff       125608 non-null  float64
 11  Volume-1        125608 non-null  int64  
 12  Dividends-1     125608 non-null  float64
 13  Stock_Splits-1  125608 non-null  float64
 14  Headline        125608 non-null  object 
 15  Publisher       125608 non-null  object 
 16  Found           125608 non-null  object 
 17  Recency   

In [5]:
# Show the modeling table; the rendered preview elides the middle rows.
modeling1

Unnamed: 0,ID,Delta+1,Hour,Company,Sector,Cap,Open-1,High-1,Low-1,Delta-1,Open_Diff,Volume-1,Dividends-1,Stock_Splits-1,Headline,Publisher,Found,Recency,Headlines_Cnt
0,LIN-06-03-09:30,-0.555145,09:30,Linde,Basic Materials,206652.0,430.309998,1.319977,-0.062743,1.271177,-0.376333,480713,0.0,0.0,,,,,0
1,LIN-06-03-10:30,0.180962,10:30,Linde,Basic Materials,206652.0,434.140015,0.562024,-0.498689,-0.192339,-0.019615,267563,0.0,0.0,,,,,0
2,LIN-06-03-11:30,0.121587,11:30,Linde,Basic Materials,206652.0,433.220001,0.251603,-0.595537,-0.555145,0.049905,192816,0.0,0.0,,,,,0
3,LIN-06-03-12:30,0.041627,12:30,Linde,Basic Materials,206652.0,431.029999,0.336406,-0.225043,0.180962,-0.005788,441609,0.0,0.0,,,,,0
4,LIN-06-03-13:30,0.375592,13:30,Linde,Basic Materials,206652.0,431.785004,0.195700,-0.228126,0.121587,0.018509,100792,0.0,0.0,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125603,SIF-07-31-09:30,0.000000,09:30,SIFCO Industries,Industrials,20.0,3.422500,0.000000,-4.747989,-0.949592,1.769910,0,0.0,0.0,,,,,0
125604,SIF-07-31-12:30,0.000000,12:30,SIFCO Industries,Industrials,20.0,3.450000,0.000000,0.000000,0.000000,1.449274,0,0.0,0.0,,,,,0
125605,SIF-07-31-13:30,0.000000,13:30,SIFCO Industries,Industrials,20.0,3.500000,0.000000,-2.854286,0.000000,-1.428570,717,0.0,0.0,,,,,0
125606,SIF-07-31-14:30,-1.179947,14:30,SIFCO Industries,Industrials,20.0,3.450000,0.000000,0.000000,0.000000,0.289855,610,0.0,0.0,,,,,0
