In [34]:
import json
import os
import statistics
import warnings
from datetime import datetime, timedelta, time

import pandas as pd
import pytz
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import ta_formulas as ta

# Suppress FutureWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=FutureWarning)

# API key appended to every Polygon request URL below. Prefer the
# POLYGON_API_KEY environment variable so the credential does not have to live
# in the notebook; the literal is kept only as a backward-compatible fallback
# and should be rotated and removed.
KEY = os.environ.get("POLYGON_API_KEY", "XpqF6xBLLrj6WALk4SS1UlkgphXmHQec")

In [35]:
def pull_alerts_data_local(path='/Users/charlesmiller/Documents/model_tester_data/BF/2015-01-01_2023-12-23BF3.csv'):
    """Load the alerts dataset from a local CSV file.

    Parameters:
    - path: Location of the alerts CSV. Defaults to the original hard-coded
      export, so existing zero-argument calls keep working.

    Returns:
    - A pandas DataFrame with the file's contents.
    """
    return pd.read_csv(path)

class CustomRetry(Retry):
    """Retry policy that reports every non-200 response as retryable."""

    def is_retry(self, method, status_code, has_retry_after=False):
        """Decide whether a response with ``status_code`` should be retried.

        Any status other than 200 answers True outright, before the inherited
        method / status_forcelist rules are consulted; a 200 defers to the
        parent implementation.
        """
        return status_code != 200 or super().is_retry(method, status_code, has_retry_after)
    
def setup_session_retries(
    retries: int = 3,
    backoff_factor: float = 0.05,
    status_forcelist: tuple = (500, 502, 504),
):
    """
    Build a requests Session whose HTTP and HTTPS adapters share one retry policy.

    Parameters:
    - retries: Total number of retries before giving up. Default is 3.
    - backoff_factor: Factor handed to the retry policy for backoff between attempts. Default is 0.05.
    - status_forcelist: HTTP status codes handed to the retry policy as forced-retry codes.
      Default is (500, 502, 504).

    Returns:
    - A requests Session object with the retrying adapter mounted on both schemes.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=CustomRetry(
            total=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
            allowed_methods=frozenset(["GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS"]),
            raise_on_status=False,
        )
    )

    session = requests.Session()
    # The same adapter instance serves plain and TLS traffic.
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)

    return session

def execute_polygon_call(url, timeout=30):
    """Issue a GET request to ``url`` through a retrying session.

    Parameters:
    - url: Fully-formed request URL (query string already attached).
    - timeout: Seconds to wait on the server, forwarded to requests. Defaults
      to 30 so a stalled connection cannot hang the run indefinitely.

    Returns:
    - The requests Response object. With the default non-streaming request the
      body is downloaded before the session closes, so ``response.text`` stays
      usable afterwards.

    Raises:
    - requests.exceptions.RequestException: if the request cannot be completed
      (connection failure, timeout, retries exhausted).
    """
    # A fresh session is built per call; the context manager closes it so its
    # connection pool is released instead of leaking one pool per request.
    with setup_session_retries() as session:
        return session.request("GET", url, headers={}, data={}, timeout=timeout)

def convert_timestamp_est(timestamp):
    """Convert a UNIX timestamp (seconds) to a timezone-aware US/Eastern datetime.

    Parameters:
    - timestamp: Seconds since the UNIX epoch.

    Returns:
    - A timezone-aware datetime in the 'US/Eastern' zone (the zone's own rules
      decide whether the offset is standard or daylight time for that instant).
    """
    # Build the aware UTC datetime directly: datetime.utcfromtimestamp() is
    # deprecated and yields a naive value that then has to be localized.
    dt_utc = datetime.fromtimestamp(timestamp, tz=pytz.utc)
    # Re-express the same instant in US/Eastern.
    return dt_utc.astimezone(pytz.timezone('US/Eastern'))


In [36]:
def call_polygon_D(symbol, date_stamp, timespan, multiplier):
    """Fetch aggregate bars for the 105 days ending the day before ``date_stamp``.

    Parameters:
    - symbol: Ticker inserted into the Polygon aggregates URL.
    - date_stamp: String whose first space-separated token is a ``YYYY-MM-DD`` date.
    - timespan: Aggregate timespan path segment (the caller in this file passes "day").
    - multiplier: Timespan multiplier path segment (the caller in this file passes "1").

    Returns:
    - A DataFrame of the returned bars with ``t`` divided down from milliseconds
      to seconds, plus ``date`` (US/Eastern), ``hour``, ``day`` and ``minute``
      columns. If anything fails, the error is printed and an empty DataFrame
      carrying the expected column names is returned instead.
    """
    # Pre-bind everything the except/return paths read, so a failure reports
    # the real error instead of being masked by an UnboundLocalError.
    from_str = to_str = None
    results_df = pd.DataFrame(
        columns=['v', 'vw', 'o', 'c', 'h', 'l', 't', 'n', 'date', 'hour', 'day', 'minute']
    )
    try:
        date_str = date_stamp.split(' ')[0]
        # The window ends the day before the alert date and spans 105 days.
        to_stamp = datetime.strptime(date_str, '%Y-%m-%d') - timedelta(days=1)
        from_stamp = to_stamp - timedelta(days=105)
        to_str = to_stamp.strftime('%Y-%m-%d')
        from_str = from_stamp.strftime('%Y-%m-%d')
        url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/{multiplier}/{timespan}/{from_str}/{to_str}?adjusted=true&sort=asc&limit=50000&apiKey={KEY}"
        response = execute_polygon_call(url)

        response_data = json.loads(response.text)
        bars_df = pd.DataFrame(response_data['results'])
        bars_df['t'] = bars_df['t'].apply(lambda x: int(x/1000))
        bars_df['date'] = bars_df['t'].apply(convert_timestamp_est)
        bars_df['hour'] = bars_df['date'].apply(lambda x: x.hour)
        bars_df['day'] = bars_df['date'].apply(lambda x: x.day)
        bars_df['minute'] = bars_df['date'].apply(lambda x: x.minute)
        # Publish only a fully built frame; a half-processed one never escapes.
        results_df = bars_df
    except Exception as e:
        print(f"call polygon D {e}")
        print(f"symbol {symbol}, dates {from_str} -  {to_str}, timespan {timespan}, multiplier {multiplier}")
    return results_df

def call_polygon_current_day(symbol, date_stamp, timespan, multiplier,hour):
    """Fetch the alert day's bars and keep those before ``hour``.

    Parameters:
    - symbol: Ticker inserted into the Polygon aggregates URL.
    - date_stamp: String whose first space-separated token is the date to query.
    - timespan: Aggregate timespan path segment (the caller in this file passes "minute").
    - multiplier: Timespan multiplier path segment (the caller in this file passes "1").
    - hour: Exclusive upper bound on the US/Eastern hour of the bars kept.

    Returns:
    - A DataFrame of bars whose US/Eastern hour is in 9..15, excluding
      9:00-9:29, and strictly below ``hour``; ``t`` is in seconds and ``date``,
      ``hour``, ``day``, ``minute`` columns are added. If anything fails, the
      error is printed and an empty DataFrame carrying the expected column
      names is returned instead.
    """
    # Pre-bind everything the except/return paths read, so a failure reports
    # the real error instead of being masked by an UnboundLocalError.
    date_str = None
    filtered_df = pd.DataFrame(
        columns=['v', 'vw', 'o', 'c', 'h', 'l', 't', 'n', 'date', 'hour', 'day', 'minute']
    )
    try:
        date_str = date_stamp.split(' ')[0]
        url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/{multiplier}/{timespan}/{date_str}/{date_str}?adjusted=true&sort=asc&limit=50000&apiKey={KEY}"
        response = execute_polygon_call(url)

        response_data = json.loads(response.text)
        bars_df = pd.DataFrame(response_data['results'])
        bars_df['t'] = bars_df['t'].apply(lambda x: int(x/1000))
        bars_df['date'] = bars_df['t'].apply(convert_timestamp_est)
        bars_df['hour'] = bars_df['date'].apply(lambda x: x.hour)
        bars_df['day'] = bars_df['date'].apply(lambda x: x.day)
        bars_df['minute'] = bars_df['date'].apply(lambda x: x.minute)
        # Keep hours 9 through 15, then drop the bars before 9:30.
        trimmed_df = bars_df.loc[bars_df['hour'].isin([9,10,11,12,13,14,15])]
        session_df = trimmed_df.loc[~((trimmed_df['hour'] == 9) & (trimmed_df['minute'] < 30))]
        # Only bars from hours strictly before the alert hour are kept.
        filtered_df = session_df.loc[session_df['hour'] < hour]
    except Exception as e:
        print(f"call polygon current day {e}")
        print(f"symbol {symbol}, date {date_str}, timespan {timespan}, multiplier {multiplier}, hour {hour}")
    return filtered_df


def combine_aggs(day_agg, hour_aggs, hour):
    """Append one bar, aggregated from ``hour_aggs``, as a new last row of ``day_agg``.

    Parameters:
    - day_agg: DataFrame of existing bars; it is modified in place. The new row
      is labelled ``len(day_agg)``, so a default 0..n-1 index is assumed.
    - hour_aggs: DataFrame of intraday bars with columns v, o, c, h, l, n, t,
      date, day and minute (and vw when available).
    - hour: Value stored in the new row's ``hour`` column.

    Returns:
    - ``day_agg`` itself. When aggregation fails (for example ``hour_aggs`` has
      no rows) the error is printed and ``day_agg`` is returned without a new row.
    """
    try:
        volume = hour_aggs.v.sum()
        # 'vw' is a price column, so it gets the volume-weighted average of the
        # intraday 'vw' values rather than the summed volume. NaN when it
        # cannot be computed (no 'vw' column, or zero total volume).
        if 'vw' in hour_aggs.columns and volume:
            vwap = (hour_aggs.vw * hour_aggs.v).sum() / volume
        else:
            vwap = float('nan')

        new_bar = {
            'v': volume,
            'vw': vwap,
            'o': hour_aggs.o.iloc[0],    # first bar's open
            'c': hour_aggs.c.iloc[-1],   # last bar's close
            'h': hour_aggs.h.max(),
            'l': hour_aggs.l.min(),
            't': hour_aggs.t.iloc[-1],
            'n': hour_aggs.n.sum(),
            'date': hour_aggs.date.iloc[-1],
            'hour': hour,
            'day': hour_aggs.day.iloc[-1],
            'minute': hour_aggs.minute.iloc[-1],
        }
        # Fill by column name so the row stays correct regardless of the
        # column order of day_agg; columns with no aggregate get NaN.
        day_agg.loc[len(day_agg)] = [new_bar.get(column, float('nan')) for column in day_agg.columns]
    except Exception as e:
        print(f"combine hour aggs {e}")
    return day_agg

def build_ts_features(aggregates_df):
    """Add the technical-analysis feature columns to ``aggregates_df`` and return it.

    The frame is modified in place. Every feature is derived from the ``c``
    column (or from columns computed earlier in this function) through the
    ``ta`` helper module; ``ta.calculate_pct_high`` / ``ta.calculate_pct_low``
    receive the whole frame.
    """
    frame = aggregates_df

    # Row-over-row change of the close, then 3- and 5-row changes of that series.
    # NOTE(review): close_diff3/close_diff5 are pct_change of close_diff, not of
    # the close itself; confirm this is intended.
    frame['close_diff'] = frame['c'].pct_change()
    for column, periods in (('close_diff3', 3), ('close_diff5', 5)):
        frame[column] = frame['close_diff'].pct_change(periods)

    # Moving averages of the close: simple first, then exponential.
    for column, window in (('sma_20', 20), ('sma_50', 50), ('sma_5', 5)):
        frame[column] = ta.calculate_sma(frame['c'], window)
    for column, window in (('ema_20', 20), ('ema_50', 50), ('ema_5', 5)):
        frame[column] = ta.calculate_ema(frame['c'], window)

    frame['rsi'] = ta.rsi(frame['c'], 14)
    frame['macd'] = ta.macd(frame['c'])
    frame['roc'] = ta.roc(frame['c'], 12)

    # ta.bbands' result is unpacked as (upper, lower, middle).
    frame['bbu'], frame['bbl'], frame['bbm'] = ta.bbands(frame['c'], window=20)
    frame['bb_spread'] = (frame['bbu'] - frame['bbl']) / frame['c']
    frame['bb_trend'] = (frame['c'] - frame['bbm']) / frame['bbm']
    frame['bb_category'] = frame.apply(
        lambda bar: ta.bbands_category(bar['c'], bar['bbu'], bar['bbl']),
        axis=1,
    )

    # Relative distance of the close from its 20- and 5-row simple averages.
    for column, baseline in (('sma_20_trend', 'sma_20'), ('sma_5_trend', 'sma_5')):
        frame[column] = (frame['c'] - frame[baseline]) / frame[baseline]

    frame['pct_5d_high'] = ta.calculate_pct_high(frame, 5)
    frame['pct_5d_low'] = ta.calculate_pct_low(frame, 5)
    for column, window in (('stddev_close_diff_5d', 5), ('stddev_close_diff_10d', 10)):
        frame[column] = ta.calculate_stddev(frame['close_diff'], window)

    return frame


def ts_feature_runner(row):
    """Build the feature frame for one alert row.

    Reads ``symbol``, ``date`` and ``hour`` from ``row``, pulls the daily bars
    and the alert day's one-minute bars, folds the minute bars into a single
    extra row on the daily frame, and returns that frame with the
    technical-analysis features added.
    """
    daily_bars = call_polygon_D(row['symbol'], row['date'], timespan="day", multiplier="1")
    intraday_bars = call_polygon_current_day(
        row['symbol'],
        row['date'],
        timespan="minute",
        multiplier="1",
        hour=row['hour'],
    )
    return build_ts_features(combine_aggs(daily_bars, intraday_bars, row['hour']))

In [39]:
# Build one feature frame per alert, tag it with the alert it belongs to, then
# stack every frame and write the result to disk.
alerts_data = pull_alerts_data_local().reset_index(drop=True)

ts_data_dfs = []
for index, row in alerts_data.iterrows():
    ts_data = ts_feature_runner(row)
    # Same identifier on every row of this alert's frame: symbol-date-hour.
    ts_data['alert_identifier'] = f"{row['symbol']}-{row['date']}-{row['hour']}"
    ts_data_dfs.append(ts_data)
    # Progress marker every 100 alerts.
    if not index % 100:
        print(f"index {index}")

full_ts_df = pd.concat(ts_data_dfs)
full_ts_df.to_csv(f'/Users/charlesmiller/Documents/ts_data/day_aggs/all.csv', index=False)


index 0
index 100
index 200
index 300
index 400
index 500
index 600
index 700
index 800
index 900
index 1000
index 1100
index 1200
index 1300
index 1400
index 1500
index 1600
index 1700
index 1800
index 1900
index 2000
index 2100
index 2200
index 2300
index 2400
index 2500
index 2600
index 2700
index 2800
index 2900
index 3000
index 3100
index 3200
index 3300
index 3400
index 3500
index 3600
index 3700
index 3800
index 3900
index 4000
index 4100
index 4200
index 4300
index 4400
index 4500
index 4600
index 4700
index 4800
index 4900
index 5000
index 5100
index 5200
index 5300
index 5400
index 5500
index 5600
index 5700
index 5800
index 5900
index 6000
index 6100
index 6200
index 6300
index 6400
index 6500
index 6600
index 6700
index 6800
index 6900
index 7000
index 7100
index 7200
index 7300
index 7400
index 7500
index 7600
index 7700
index 7800
index 7900
index 8000
index 8100
index 8200
index 8300
index 8400
index 8500
index 8600
index 8700
index 8800
index 8900
index 9000
index 9100


In [None]:
# Read the test_sample_25000.csv export and preview its first rows.
# NOTE(review): the recorded output for this cell is a FileNotFoundError, and
# the cell above writes all.csv rather than this file; confirm the intended path.
df = pd.read_csv('/Users/charlesmiller/Documents/ts_data/day_aggs/test_sample_25000.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/charlesmiller/Documents/ts_data/day_aggs/test_sample_25000.csv'

In [None]:
# Show the column labels of the frame loaded in the previous cell.
df.columns

Index(['v', 'vw', 'o', 'c', 'h', 'l', 't', 'n', 'date', 'hour', 'day',
       'minute', 'close_diff', 'sma_20', 'sma_50', 'sma_5', 'ema_20', 'ema_50',
       'ema_5', 'rsi', 'macd', 'roc', 'bbu', 'bbl', 'bbm', 'bb_spread',
       'bb_trend', 'bb_category', 'sma_20_trend', 'sma_5_trend', 'pct_5d_high',
       'pct_5d_low', 'stddev_close_diff_5d', 'stddev_close_diff_10d',
       'alert_identifier'],
      dtype='object')