In [23]:
!pip install yfinance pandas lxml matplotlib pandas_market_calendars --quiet

import pandas as pd
import yfinance as yf
import os
import datetime as dt
from pandas.tseries.offsets import CustomBusinessDay
from pandas_market_calendars import get_calendar 

# Step 1: Pull the current S&P 500 constituents table from Wikipedia

payload = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
constituents_table = payload[0]  # first table parsed from the page
df = constituents_table

# Yahoo Finance writes share-class tickers with a dash (BRK-B) where the table uses a period (BRK.B)
tickers = [symbol.replace('.', '-') for symbol in df['Symbol']]

df.head(5)

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [24]:
# Step 2: Download window for yfinance: the 30 days ending now (adjust as needed)
end_date = dt.datetime.now()
start_date = end_date - dt.timedelta(days=30)

# Step 3: One batched Yahoo Finance request covering every ticker, at 1-hour bars
data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    interval='1h',
    group_by='ticker',  # first column level is the ticker symbol
    progress=False,
)

# Step 4: Put the datetime index on Eastern Time
# A naive index is assumed to be UTC before it is converted to America/New_York
hourly_index = data.index
if hourly_index.tz is None:
    hourly_index = hourly_index.tz_localize('UTC')
data.index = hourly_index.tz_convert('America/New_York')

# Example: show the first 24 rows for AAPL
print(data['AAPL'].head(24))


Price                            Open        High         Low       Close  \
Datetime                                                                    
2025-02-07 09:30:00-05:00  232.514999  234.000000  229.880005  230.389999   
2025-02-07 10:30:00-05:00  230.339996  231.268005  228.899994  229.054993   
2025-02-07 11:30:00-05:00  229.085007  229.419998  228.195007  228.400604   
2025-02-07 12:30:00-05:00  228.389999  229.440002  228.363297  229.298904   
2025-02-07 13:30:00-05:00  229.289993  229.500000  227.830002  227.880005   
2025-02-07 14:30:00-05:00  227.889999  228.580002  227.789993  228.000000   
2025-02-07 15:30:00-05:00  227.990005  228.149994  227.259995  227.710007   
2025-02-10 09:30:00-05:00  229.570007  230.585007  228.800003  229.866196   
2025-02-10 10:30:00-05:00  229.889999  229.911606  228.740005  229.360001   
2025-02-10 11:30:00-05:00  229.370804  229.384995  228.239197  228.470703   
2025-02-10 12:30:00-05:00  228.470001  228.880005  228.369995  228.470001   

In [26]:

# Step 5: For every ticker in the multi-level DataFrame, add a 'Prev_Close' column that carries
# the previous business day's close across the next business day's hourly bars

# 1. NYSE trading calendar, so that exchange holidays are not counted as business days
nyse = get_calendar('NYSE')
# Session dates inside the download window, as a DatetimeIndex
trading_days = pd.DatetimeIndex(
    nyse.schedule(start_date=start_date, end_date=end_date).index
)
holidays = nyse.holidays().holidays  # NYSE holiday dates

# Business-day offset that skips those holidays
bday = CustomBusinessDay(holidays=holidays)

# 2. Attach the previous session's closing price to every intraday row
def add_prev_close(df):
    """Add a 'Prev_Close' column holding the prior session's last close.

    Sessions are the calendar dates actually present in ``df.index``, not a
    generated business-day grid. A generated grid yields an empty (NaN) bin
    for any day without bars, which would blank out 'Prev_Close' for the
    whole following session; deriving sessions from the data avoids that.

    Parameters
    ----------
    df : pandas.DataFrame
        Intraday bars for a single ticker, indexed by a DatetimeIndex
        (naive or timezone-aware) and containing a 'Close' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Prev_Close' added (or
        overwritten). Rows of the earliest date get NaN because no earlier
        close is available.
    """
    # Midnight-normalised timestamps serve as one key per calendar date
    session_dates = df.index.normalize()

    # Last non-NaN close of each date; a date whose closes are all NaN yields NaN
    daily_close = df['Close'].groupby(session_dates).last()

    # Carry the latest real close over empty dates, then move it one date forward
    prev_day_close = daily_close.ffill().shift(1)

    # Broadcast the per-date value back to every intraday row of that date
    df['Prev_Close'] = prev_day_close.reindex(session_dates).to_numpy()
    return df

# 3. Run add_prev_close over each ticker's full history (no per-day grouping here)
tickers = data.columns.get_level_values(0).unique()
processed_dfs = []

for ticker in tickers:
    # Work on a copy of this ticker's columns
    df = add_prev_close(data[ticker].copy())
    # Restore the (ticker, field) two-level column labels before re-assembly
    df.columns = pd.MultiIndex.from_product([[ticker], df.columns])
    processed_dfs.append(df)

# Stitch the per-ticker frames back together side by side
data = pd.concat(processed_dfs, axis=1)

# Spot-check AAPL; the earliest session has no prior close, so it shows NaN
print(data['AAPL'][['Open','Close', 'Prev_Close']].head(24))


Price                            Open       Close  Prev_Close
Datetime                                                     
2025-02-07 09:30:00-05:00  232.514999  230.389999         NaN
2025-02-07 10:30:00-05:00  230.339996  229.054993         NaN
2025-02-07 11:30:00-05:00  229.085007  228.400604         NaN
2025-02-07 12:30:00-05:00  228.389999  229.298904         NaN
2025-02-07 13:30:00-05:00  229.289993  227.880005         NaN
2025-02-07 14:30:00-05:00  227.889999  228.000000         NaN
2025-02-07 15:30:00-05:00  227.990005  227.710007         NaN
2025-02-10 09:30:00-05:00  229.570007  229.866196  227.710007
2025-02-10 10:30:00-05:00  229.889999  229.360001  227.710007
2025-02-10 11:30:00-05:00  229.370804  228.470703  227.710007
2025-02-10 12:30:00-05:00  228.470001  228.470001  227.710007
2025-02-10 13:30:00-05:00  228.485001  228.699905  227.710007
2025-02-10 14:30:00-05:00  228.699997  227.690002  227.710007
2025-02-10 15:30:00-05:00  227.695007  227.639999  227.710007
2025-02-

In [28]:
THRESHOLD_PCT = 0.05  # 5%

# Ticker symbols are the first level of the column MultiIndex
tickers = data.columns.get_level_values(0).unique()

# Per-ticker frames, collected for re-assembly after the loop
processed_dfs = []

for ticker in tickers:
    # Work on a copy of this ticker's columns
    df = data[ticker].copy()

    # Express timestamps in New York time so calendar dates match trading sessions
    df.index = df.index.tz_convert('America/New_York')

    # Every row starts as 'N'; rows of qualifying days are switched to 'Y' below
    df['Signal'] = 'N'

    # Walk the rows one calendar date at a time
    for session_date, session in df.groupby(df.index.date):
        prior_closes = session['Prev_Close']

        # Only days with at least 5 rows and no missing Prev_Close are evaluated
        if len(session) >= 5 and not prior_closes.isna().any():
            lows = session['Low']
            opening_low = lows.iloc[0]
            gap_level = prior_closes.iloc[0] * (1 + THRESHOLD_PCT)

            # Condition 1: the first low sits more than THRESHOLD_PCT above Prev_Close
            # Condition 2: each of the next 4 lows stays above that first low
            if opening_low > gap_level and (lows.iloc[1:5] > opening_low).all():
                # Flag every row of this day
                df.loc[session.index, 'Signal'] = 'Y'

    # Restore the (ticker, field) two-level column labels
    df.columns = pd.MultiIndex.from_product([[ticker], df.columns])
    processed_dfs.append(df)

# Re-assemble all tickers into one wide DataFrame
data = pd.concat(processed_dfs, axis=1)

# Spot-check the new column for AAPL
print(data['AAPL'][['Low', 'Prev_Close', 'Signal']].head(50))

Price                             Low  Prev_Close Signal
Datetime                                                
2025-02-07 09:30:00-05:00  229.880005         NaN      N
2025-02-07 10:30:00-05:00  228.899994         NaN      N
2025-02-07 11:30:00-05:00  228.195007         NaN      N
2025-02-07 12:30:00-05:00  228.363297         NaN      N
2025-02-07 13:30:00-05:00  227.830002         NaN      N
2025-02-07 14:30:00-05:00  227.789993         NaN      N
2025-02-07 15:30:00-05:00  227.259995         NaN      N
2025-02-10 09:30:00-05:00  228.800003  227.710007      N
2025-02-10 10:30:00-05:00  228.740005  227.710007      N
2025-02-10 11:30:00-05:00  228.239197  227.710007      N
2025-02-10 12:30:00-05:00  228.369995  227.710007      N
2025-02-10 13:30:00-05:00  228.404999  227.710007      N
2025-02-10 14:30:00-05:00  227.600006  227.710007      N
2025-02-10 15:30:00-05:00  227.199997  227.710007      N
2025-02-11 09:30:00-05:00  228.130005  227.639999      N
2025-02-11 10:30:00-05:00  232.