In [8]:
import pandas as pd
import numpy as np
from  pandasql import sqldf
import plotly.express as px
import datetime

# Notebook display limits: at most 50 rows and 100 columns per rendered frame.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 100)


def pysqldf(q):
    '''Run the SQL query `q` through pandasql, passing this notebook's globals() as the table lookup environment.'''
    return sqldf(q, globals())

In [9]:
def load_format_df(path='spy_5min.csv', day_session=True):
    ''' Load the intraday bars CSV and format it for further processing.

    Parameters
    ----------
    path : str or path-like, default 'spy_5min.csv'
        CSV file to read. Its first column is dropped as redundant, and it
        must contain a `time_converted` column that `pd.to_datetime` can parse.
    day_session : bool, default True
        When True, keep only the rows whose time of day lies between
        09:30:00 and 16:00:00 (both bounds inclusive).

    Returns
    -------
    pandas.DataFrame
        The loaded rows, with `time_converted` converted to datetimes.
    '''
    df = pd.read_csv(path)
    df = df.iloc[:, 1:]  # trim first extra column (redundant)
    df['time_converted'] = pd.to_datetime(df['time_converted'])
    if day_session:
        begin_time = datetime.time(9, 30, 0)
        end_time = datetime.time(16, 0, 0)
        # Extract the time of day once and reuse it for both bounds.
        time_of_day = df['time_converted'].dt.time
        df = df[(time_of_day >= begin_time) & (time_of_day <= end_time)]
    return df
# Load the bars into the notebook-global `df` that the following cells transform.
df = load_format_df()


In [10]:
def cleanse_df(df):
    '''
    Enrich the intraday bars with per-day helper columns.

    Columns added to the input:
      * date                         - DATE(time_converted)
      * N_with_day                   - 1-based position of the bar within its date, ordered by time
      * date_open / date_close       - first `open` / last `close` of the date
      * time                         - time-of-day part of `time_converted`
      * high_pct_open_abs            - |high / date_open - 1| * 100
      * low_pct_open_abs             - |low / date_open - 1| * 100
      * weekday                      - day name of `time_converted`
      * high_pct_open_abs_max_daily  - maximum of high_pct_open_abs over the date
      * low_pct_open_abs_max_daily   - maximum of low_pct_open_abs over the date
      * abs_max_daily                - the larger of the two daily maxima

    Parameters
    ----------
    df : pandas.DataFrame
        Bars with at least `time_converted`, `open`, `high`, `low` and `close`.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the input columns plus the ones listed above.
    '''
    query = '''
                    WITH t as (
                        select *
                        ,DATE(time_converted) as date
                        ,ROW_NUMBER() OVER (PARTITION BY DATE(time_converted) ORDER BY DATETIME(time_converted) ASC) AS N_with_day
                   from df s 
                        ),
                        t1 as (
                        SELECT DISTINCT date,date_open,date_close from (
                            select date
                            ,FIRST_VALUE(open) OVER (PARTITION BY date ORDER BY DATETIME(time_converted) ASC) as date_open
                            ,LAST_VALUE(close) OVER (PARTITION BY date ORDER BY DATETIME(time_converted) ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING ) as date_close
                        from t))
                        select t.*, t1.date_open, t1.date_close from t left join t1 on t.date=t1.date
                   '''
    # Hand the argument to pandasql explicitly. Resolving the table name `df`
    # through globals() would query whatever the notebook-global `df` happens
    # to be and silently ignore the frame passed to this function.
    df = sqldf(query, {'df': df})

    # Re-parse the timestamps after the SQL round trip so the .dt accessors work.
    df['time_converted'] = pd.to_datetime(df['time_converted'])
    df['time'] = df['time_converted'].dt.time
    df['high_pct_open_abs'] = (df['high'] / df['date_open'] - 1).abs() * 100
    df['low_pct_open_abs'] = (df['low'] / df['date_open'] - 1).abs() * 100
    df['weekday'] = df['time_converted'].dt.day_name()

    # Daily extremes; the merge suffix renames the two overlapping columns to
    # *_max_daily, while abs_max_daily keeps its name because it does not clash.
    daily_max = df.groupby('date')[['high_pct_open_abs', 'low_pct_open_abs']].max()
    daily_max['abs_max_daily'] = daily_max[['high_pct_open_abs', 'low_pct_open_abs']].max(axis=1)
    df = df.merge(daily_max, on='date', suffixes=('', '_max_daily'))
    return df
# Rebind the notebook-global `df` to the enriched frame.
# NOTE(review): this overwrites the loaded frame, so re-run from the load cell to start over.
df = cleanse_df(df)


In [11]:
def check_N_with_day(df, dates_to_delete=None):
    '''
    Remove the trading days on which N_with_day does not map to its usual clock time.

    Background (from the original analysis notes): every N_with_day value maps
    to a single time of day except N = 44. It is normally 13:05, but on
    2023-07-03, 2023-11-24 and 2024-07-03 it falls on 16:00 - probably
    because these are sessions next to a holiday. Those dates are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `date` column holding 'YYYY-MM-DD' strings.
    dates_to_delete : iterable of str, optional
        Dates to remove. Defaults to the three dates listed above.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the rows of the listed dates; the input frame
        is left untouched and the remaining rows keep their index labels.
    '''
    # How the irregular dates were found (kept for reference):
    #   temp_df = df[['N_with_day', 'time', 'date']]
    #   temp_df.groupby(['N_with_day']).agg(N=('time', 'nunique')).sort_values(by='N', ascending=False)
    #   -> only N_with_day == 44 maps to more than one time
    #   comp_time = datetime.time(16, 0, 0)
    #   temp_df.query("N_with_day == 44 and time == @comp_time")   # lists the offending dates
    if dates_to_delete is None:
        dates_to_delete = ["2023-11-24", "2023-07-03", "2024-07-03"]

    # Filter with a boolean mask instead of drop(index=...): dropping by label
    # would also remove unrelated rows that share an index label with a
    # matching row.
    keep = ~df['date'].isin(list(dates_to_delete))
    return df.loc[keep].copy()
# Rebind the notebook-global `df` to the frame returned by the date filter.
df = check_N_with_day(df)

In [12]:
def candle_up_or_down(open,close):
    '''
    Classify a single candle by comparing its close with its open.

    Returns the label as a string: "1" when close > open (up), "-1" when
    close < open (down), and "0" otherwise (open equals close, or the two
    values cannot be ordered).
    '''
    if close < open:
        return "-1"
    if close > open:
        return "1"
    return "0"

# Label every bar "1" / "-1" / "0" by applying the scalar classifier element by element
# (np.vectorize is a convenience wrapper around a Python-level loop, not a true vectorised op).
df['candle_type'] = np.vectorize(candle_up_or_down)(df['open'],df['close'])

In [None]:
## create y variable (last hour direction)
def create_y(df, start_time=datetime.time(15,0,0), end_time=datetime.time(15,55,0)):
    '''
    Build the target frame: one aggregated candle per clock hour of the selected window.

    Bars whose `time` lies in [start_time, end_time] (inclusive) are resampled
    into hourly bins on `time_converted`. With the default 15:00-15:55 window
    every bar of a day falls into the 15:00 bin. A window that crosses an hour
    boundary produces more than one row per day.

    Parameters
    ----------
    df : pandas.DataFrame
        Enriched bars containing `time`, `time_converted`, the OHLC columns,
        'number of trades', 'volume' and the three *_daily columns.
    start_time, end_time : datetime.time
        Inclusive bounds of the window; default to 15:00:00 and 15:55:00.

    Returns
    -------
    pandas.DataFrame
        Indexed by the hourly bin timestamp, holding the aggregated columns
        plus `y`, the candle direction label from `candle_up_or_down`.
    '''
    # Chain set_index instead of mutating the filtered frame in place.
    window = (
        df.query("time >= @start_time and time <= @end_time")
          .set_index('time_converted')
    )
    hourly = window.resample('h').agg({
            'open' : 'first',
            'high' : 'max',
            'low'  : 'min',
            'close': 'last',
            'number of trades' : 'sum',
            'volume': 'sum',
            'high_pct_open_abs_max_daily' : 'first',
            'low_pct_open_abs_max_daily' : 'first',
            'abs_max_daily' : 'first'
    }).dropna()  # drop rows with missing values, e.g. hour bins that contain no bars
    hourly['y'] = np.vectorize(candle_up_or_down)(hourly['open'], hourly['close'])
    return hourly
# Target frame built from the 15:00-15:55 bars of `df`; `df` itself is not rebound here.
df_y= create_y(df)

In [19]:
# Correlate the numeric columns only. `df` also holds string columns such as
# `date`, which made the plain `df.corr()` call fail with
# "ValueError: could not convert string to float: '2023-01-03'".
df.corr(numeric_only=True)

# Minimal plotly bar chart on toy data, independent of `df`
# (presumably a check that plotly renders in this environment - confirm or remove).
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2])
fig.show()

In [None]:
# One box plot per calendar month: distribution of `abs_max_daily` by weekday.
# The GroupBy object gets its own name so the loop variable no longer
# overwrites the object that is being iterated.
# NOTE(review): `df` has one row per bar, so each day's `abs_max_daily` value
# appears once per bar in these plots - confirm whether one row per day was intended.
monthly_groups = df.groupby([df['time_converted'].dt.year, df['time_converted'].dt.month])
for (year, month), df_group in monthly_groups:
    fig = px.box(df_group, x='weekday', y='abs_max_daily', color='weekday',
                 category_orders={'weekday': ['Monday','Tuesday','Wednesday','Thursday','Friday']},
                 title=f"Year: {year}, Month: {month}")
    # "inclusive" selects the quartile algorithm (for odd sample sizes the median is
    # counted in both halves when computing Q1/Q3); it does not change which points are drawn.
    fig.update_traces(quartilemethod="inclusive")
    fig.show()
