# Labeling Function

In [None]:
def labeling(df):
    """Label every row by its forward price change three rows ahead.

    For each distinct ``Date`` the rows with a missing ``Close_2m`` are
    discarded and ``SixMinChange`` is computed as the close three rows later
    minus the current close.  The shift is applied per date, so a change never
    spans two different dates.  Labels are then assigned from the 33rd and
    67th percentiles of ``SixMinChange`` taken over the whole frame.

    Args:
        df: DataFrame with at least ``Date`` and ``Close_2m`` columns.  It is
            not modified.  (Column names suggest 2-minute bars, i.e. three
            rows = six minutes -- confirm against the data source.)

    Returns:
        A new DataFrame with rows grouped by date in order of first
        appearance, a fresh ``0..n-1`` index, and two added columns:
        ``SixMinChange`` and ``SixMinLabel`` (``'Bullish'`` above the 67th
        percentile, ``'Bearish'`` below the 33rd, otherwise ``'Neutral'``).
        Rows without a defined ``SixMinChange`` (the last three usable rows of
        each date) are dropped.  An input without any dates yields an empty
        frame that still carries both new columns.
    """
    # Build each day's frame on its own and concatenate once at the end;
    # growing a DataFrame with pd.concat inside the loop re-copies every
    # previously accumulated row on each iteration.
    per_day = [
        day.dropna(subset=['Close_2m']).assign(
            SixMinChange=lambda d: d['Close_2m'].shift(-3) - d['Close_2m']
        )
        for _, day in df.groupby('Date', sort=False)
    ]
    if per_day:
        labeled = pd.concat(per_day, ignore_index=True)
    else:
        # No dates at all: keep the input's columns so the steps below work
        # instead of failing on a missing 'SixMinChange' column.
        labeled = df.iloc[0:0].assign(SixMinChange=pd.Series(dtype='float64'))

    # Rows with no look-ahead value carry no label information.  Quantiles
    # ignore NaN, so dropping them first does not move the thresholds.
    labeled = labeled.dropna(subset=['SixMinChange']).reset_index(drop=True)

    # Thresholds: top third is bullish, bottom third is bearish.
    bull_condition = labeled['SixMinChange'].quantile(0.67)
    bear_condition = labeled['SixMinChange'].quantile(0.33)

    labeled['SixMinLabel'] = 'Neutral'
    labeled.loc[labeled['SixMinChange'] > bull_condition, 'SixMinLabel'] = 'Bullish'
    labeled.loc[labeled['SixMinChange'] < bear_condition, 'SixMinLabel'] = 'Bearish'
    return labeled

# Function 2: Momentum Indicator Creation

In [None]:
def _label_relative(frame, column, left, right):
    """Write 'Above'/'Below'/'Neutral' into ``frame[column]`` comparing left to right."""
    frame[column] = 'Neutral'
    frame.loc[left > right, column] = 'Above'
    frame.loc[left < right, column] = 'Below'


def momentum_columns(df):
    """Add moving-average and look-back momentum features, computed per date.

    For every distinct ``Date`` the following numeric columns are derived from
    ``Close_2m`` (windows never span two dates):

    * ``TenMinMovingAvg`` / ``TwentyMinMovingAvg`` / ``ThirtyMinMovingAvg`` --
      rolling means over 5, 10 and 15 rows.
    * ``HourChange`` -- the close 30 rows earlier minus the current close.

    Categorical ``'Above'`` / ``'Below'`` / ``'Neutral'`` columns are then
    derived: close vs. each moving average (``TenMinMALabel``,
    ``TwentyMinMALabel``, ``ThirtyMinMALabel``), the ten-row average vs. the
    other two (``TenCrossTwenty``, ``TenCrossThirty``) and the sign of
    ``HourChange`` (``HourChangeLabel``).

    Args:
        df: DataFrame with at least ``Date`` and ``Close_2m`` columns.  It is
            not modified.

    Returns:
        A new DataFrame, rows grouped by date in order of first appearance,
        containing only rows where ``HourChange`` is defined (from the 31st
        row of each date onward).  The result has a fresh ``0..n-1`` index
        plus a leading ``index`` column holding each row's position before
        the incomplete rows were dropped.  An input without any dates yields
        an empty frame that still carries every new column.
    """
    # Build each day's frame on its own and concatenate once at the end;
    # pd.concat inside the loop re-copies all accumulated rows every time.
    per_day = []
    for _, day in df.groupby('Date', sort=False):
        close = day['Close_2m']
        per_day.append(day.assign(
            TenMinMovingAvg=close.rolling(window=5).mean(),
            TwentyMinMovingAvg=close.rolling(window=10).mean(),
            ThirtyMinMovingAvg=close.rolling(window=15).mean(),
            # NOTE(review): this is past minus current, so a positive value
            # means the price is lower now than 30 rows ago -- confirm that
            # this sign convention is intended.
            HourChange=close.shift(30) - close,
        ))
    if per_day:
        out = pd.concat(per_day, ignore_index=True)
    else:
        # No dates at all: keep the input's columns and add empty feature
        # columns so the labelling below works instead of raising KeyError.
        out = df.iloc[0:0].assign(
            TenMinMovingAvg=pd.Series(dtype='float64'),
            TwentyMinMovingAvg=pd.Series(dtype='float64'),
            ThirtyMinMovingAvg=pd.Series(dtype='float64'),
            HourChange=pd.Series(dtype='float64'),
        )

    # Both sides keep the original x100 scaling so the comparisons round
    # exactly as before; the scaling is not otherwise meaningful.
    close = 100 * out['Close_2m']
    ten = 100 * out['TenMinMovingAvg']
    twenty = 100 * out['TwentyMinMovingAvg']
    thirty = 100 * out['ThirtyMinMovingAvg']

    # Close relative to each moving average.
    _label_relative(out, 'TenMinMALabel', close, ten)
    _label_relative(out, 'TwentyMinMALabel', close, twenty)
    _label_relative(out, 'ThirtyMinMALabel', close, thirty)

    # Ten-row average relative to the longer averages.
    _label_relative(out, 'TenCrossTwenty', ten, twenty)
    _label_relative(out, 'TenCrossThirty', ten, thirty)

    # Sign of the 30-row look-back change.
    _label_relative(out, 'HourChangeLabel', out['HourChange'], 0)

    # Rows without a full 30-row look-back have no usable features.
    # NOTE(review): reset_index() without drop=True keeps the old positions
    # in an 'index' column; preserved because callers may read it.
    return out.dropna(subset=['HourChange']).reset_index()

# Data Cleaning

In [None]:
def intraday_cleaning(data):
    """Replace the ``Datetime`` column with separate UTC ``Date`` and ``Time`` columns.

    The timestamps are parsed and converted to UTC, split into a calendar
    date and a time of day, and the ``Datetime`` and ``Adj Close`` columns
    are then removed.

    Args:
        data: DataFrame with ``Datetime`` and ``Adj Close`` columns.  It is
            modified in place.

    Returns:
        The same ``data`` object, for convenience.
    """
    utc_stamps = pd.to_datetime(data['Datetime'], utc=True)
    data['Datetime'] = utc_stamps

    # Date and time of day are both read from the UTC-normalised timestamps.
    data['Date'] = utc_stamps.dt.date
    data['Time'] = utc_stamps.dt.time

    # 'Adj Close' is unused downstream and 'Datetime' is now redundant.
    data.drop(columns=['Datetime', 'Adj Close'], inplace=True)
    return data

# Candlestick Creation

In [None]:
def candle_sticks(data, image_folder_base, ticker, candles, step):
    """Render sliding-window candlestick charts and save them as JPEG files.

    For every distinct ``Date`` in ``data`` a window of ``candles``
    consecutive rows is moved forward ``step`` rows at a time.  Each window is
    drawn as a candlestick chart and written to
    ``{image_folder_base}/{label}/``, where ``label`` is the ``SixMinLabel``
    of the window's last row.  Dates with fewer than ``candles`` rows produce
    no images.

    Args:
        data: DataFrame with ``Date``, ``Time``, ``Open_2m``, ``High_2m``,
            ``Low_2m``, ``Close_2m`` and ``SixMinLabel`` columns.
        image_folder_base: Directory under which one sub-folder per label is
            used; missing sub-folders are created.
        ticker: Ticker symbol, embedded in each file name.
        candles: Number of rows per chart; must be at least 1.
        step: Number of rows the window advances between charts; must be at
            least 1.

    Raises:
        ValueError: If ``candles`` or ``step`` is smaller than 1.
    """
    import os  # function-scope import: the module's import block is not visible here

    if candles < 1:
        raise ValueError(f"candles must be at least 1, got {candles}")
    if step < 1:
        # A non-positive step would never advance the window (endless loop).
        raise ValueError(f"step must be at least 1, got {step}")

    prepared_folders = set()
    for date in data['Date'].unique():
        day_df = data.loc[data['Date'] == date]

        # Start offsets of every window that fits entirely inside the day.
        for start_candle in range(0, len(day_df) - candles + 1, step):
            end_candle = start_candle + candles
            sequence = day_df.iloc[start_candle:end_candle]

            fig = go.Figure(data=[go.Candlestick(x=sequence['Time'],
                                                 open=sequence['Open_2m'],
                                                 high=sequence['High_2m'],
                                                 low=sequence['Low_2m'],
                                                 close=sequence['Close_2m'])])

            # The chart is filed under the label of its most recent candle.
            label = sequence['SixMinLabel'].iloc[-1]
            output_folder = f'{image_folder_base}/{label}'
            if output_folder not in prepared_folders:
                os.makedirs(output_folder, exist_ok=True)
                prepared_folders.add(output_folder)

            image_file = f"{output_folder}/candles_{date}_{ticker}_{start_candle}_{end_candle}_label_{label}.jpeg"
            fig.write_image(image_file)