In [4]:
import pandas as pd
import boto3 

In [11]:
## pull all csv files under a specific filepath (key prefix) in s3 and read each one into a dataframe
def pull_files_from_s3(filepath, bucket="inv-alerts", s3_client=None):
    """Read every CSV object stored under an S3 key prefix into DataFrames.

    Parameters
    ----------
    filepath : str
        Key prefix to list, e.g. ``'trend_alerts/2022/'``.
    bucket : str, optional
        Bucket to list and read from. Defaults to ``"inv-alerts"``, the
        bucket this function previously had hard-coded.
    s3_client : optional
        An existing S3 client to reuse. When omitted, a new
        ``boto3.client('s3')`` is created for this call.

    Returns
    -------
    list of pandas.DataFrame
        One frame per object read, in listing order. Empty when no object
        matches the prefix.
    """
    s3 = s3_client if s3_client is not None else boto3.client('s3')

    # Use paginator to handle listing of large number of objects
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=filepath)

    file_keys = [
        obj['Key']
        for page in page_iterator
        # a page carries no 'Contents' entry when nothing matched
        for obj in page.get('Contents', [])
        # keys ending in '/' are treated as folder placeholders: they hold no
        # CSV payload and would make pd.read_csv fail on an empty body
        if not obj['Key'].endswith('/')
    ]

    dfs = []
    for key in file_keys:
        response = s3.get_object(Bucket=bucket, Key=key)
        dfs.append(pd.read_csv(response['Body']))
    return dfs

In [13]:
## collect the per-file dataframes for each year of trend alerts
dfs = []
for year in range(2022, 2025):
    year_dfs = pull_files_from_s3(f'trend_alerts/{year}/')
    dfs += year_dfs

## stack every per-file dataframe into a single one
df = pd.concat(dfs)

In [14]:
### Study the per-hour distribution of the volume Z-score to build its filter level (data exists)
volume_z_by_hour = df.groupby('hour')['volume_cycle_z_scores']
volume_z_by_hour.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,124272.0,0.157687,0.693115,-3.467769,-0.298052,0.077995,0.536738,3.356276
11,124275.0,0.324996,0.724263,-3.467769,-0.154559,0.250081,0.73606,3.250109
12,124279.0,0.342333,0.719421,-3.467769,-0.138011,0.269234,0.762468,3.322981
13,124277.0,0.003157,0.747809,-3.467769,-0.502701,-0.073757,0.432943,3.172622
14,123848.0,-0.580549,0.602673,-3.474143,-0.988177,-0.624305,-0.243852,3.020244
15,123855.0,-0.596983,0.55209,-3.467769,-0.969028,-0.638257,-0.28749,3.144146


In [19]:
### Summary statistics of `return_vol_8H` for each value of `hour`
return_vol_8h_by_hour = df.groupby('hour')['return_vol_8H']
return_vol_8h_by_hour.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,139355.0,0.006505,0.004561,0.000324,0.003479,0.0053,0.008131,0.087921
11,139359.0,0.006064,0.004169,0.00037,0.003282,0.004964,0.007588,0.076815
12,139363.0,0.005902,0.004083,0.000364,0.003188,0.004824,0.007373,0.075575
13,139361.0,0.00583,0.004054,0.000339,0.003143,0.004753,0.007288,0.07521
14,139314.0,0.005821,0.004072,0.000353,0.00313,0.004737,0.007263,0.080192
15,139209.0,0.00581,0.004069,0.000274,0.003125,0.004734,0.007252,0.08109


In [25]:
### Summary statistics of `range_volatility` for each value of `hour`
range_vol_by_hour = df.groupby('hour')['range_volatility']
range_vol_by_hour.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,124272.0,0.013339,0.010206,0.000105,0.00672,0.010529,0.016699,0.230556
11,124275.0,0.007895,0.005524,0.000105,0.004287,0.006403,0.009778,0.127804
12,124279.0,0.006256,0.004339,0.0,0.00343,0.005107,0.00777,0.124392
13,124277.0,0.005587,0.004028,0.0,0.003033,0.004546,0.006927,0.097721
14,123848.0,0.005417,0.004072,0.0,0.002919,0.004351,0.006621,0.138334
15,123855.0,0.005511,0.004365,0.0,0.002912,0.004364,0.006691,0.112673


In [24]:
### Gap between `price_change_absolute_H` and `return_vol_8H`, expressed as a fraction of `return_vol_8H`
df['hour_vol_8hour_diff'] = (
    df['price_change_absolute_H']
    .sub(df['return_vol_8H'])
    .div(df['return_vol_8H'])
)
hour_vol_diff_by_hour = df.groupby('hour')['hour_vol_8hour_diff']
hour_vol_diff_by_hour.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,139355.0,0.322983,1.077284,-1.0,-0.496324,0.06155,0.866627,6.518175
11,139359.0,-0.069433,0.755257,-1.0,-0.644832,-0.249344,0.312449,5.228401
12,139363.0,-0.250957,0.622607,-1.0,-0.71711,-0.400395,0.049978,5.741276
13,139361.0,-0.336263,0.565428,-1.0,-0.754029,-0.474325,-0.074896,5.434529
14,139314.0,-0.34856,0.565045,-1.0,-0.760223,-0.48716,-0.097335,5.241799
15,139209.0,-0.357292,0.569771,-1.0,-0.767083,-0.501227,-0.111373,5.785447


In [20]:
### Ratio of `return_vol_8H` to `return_vol_5D`, rounded to 3 decimals
df['cd_vol'] = df['return_vol_8H'].div(df['return_vol_5D']).round(3)
cd_vol_by_hour = df.groupby('hour')['cd_vol']
cd_vol_by_hour.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,139348.0,0.406379,0.198436,0.022,0.276,0.37,0.492,5.246
11,139351.0,0.376397,0.190131,0.023,0.253,0.34,0.454,5.561
12,139355.0,0.363994,0.187456,0.029,0.243,0.328,0.439,5.58
13,139353.0,0.357962,0.187735,0.026,0.237,0.321,0.431,4.612
14,139306.0,0.355887,0.187665,0.028,0.234,0.318,0.429,4.967
15,139201.0,0.353223,0.188191,0.013,0.232,0.315,0.427,5.021
