In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load data
df = pd.read_csv("/home/paddy/Documents/linc_hackathon/linc_hackathon_2025/given_resources/stockPrices_hourly.csv")
df["gmtTime"] = pd.to_datetime(df["gmtTime"])

# Dictionary to store processed data for each stock
stock_dfs = {}

# Feature engineering for each stock
for symbol in df["symbol"].unique():
    df_stock = df[df["symbol"] == symbol].copy()

    # Round numerical columns
    cols_to_round = [col for col in df_stock.columns if col not in ["gmtTime", "symbol"]]
    df_stock[cols_to_round] = df_stock[cols_to_round].round(2)

    # Time-based features
    df_stock['hour'] = df_stock['gmtTime'].dt.hour
    df_stock['day_of_week'] = df_stock['gmtTime'].dt.dayofweek

    # Rolling statistics
    df_stock['askMedian_rolling_mean_3h'] = df_stock['askMedian'].rolling(window=3, min_periods=1).mean()
    df_stock['bidMedian_rolling_mean_3h'] = df_stock['bidMedian'].rolling(window=3, min_periods=1).mean()
    df_stock['askMedian_rolling_std_3h'] = df_stock['askMedian'].rolling(window=3, min_periods=1).std()
    df_stock['bidMedian_rolling_std_3h'] = df_stock['bidMedian'].rolling(window=3, min_periods=1).std()

    # Percentage changes
    df_stock['askMedian_pct_change'] = df_stock['askMedian'].pct_change()
    df_stock['bidMedian_pct_change'] = df_stock['bidMedian'].pct_change()

    # Spread-related features
    df_stock['spread_ratio'] = df_stock['spreadMedian'] / (df_stock['askMedian'] + df_stock['bidMedian'])
    df_stock['spread_pct_change'] = df_stock['spreadMedian'].pct_change()

    # Volume-related features
    df_stock['askVolume_relative'] = df_stock['askVolume'] / df_stock['askVolume'].rolling(window=5, min_periods=1).mean()
    df_stock['bidVolume_relative'] = df_stock['bidVolume'] / df_stock['bidVolume'].rolling(window=5, min_periods=1).mean()
    df_stock['volume_imbalance'] = (df_stock['askVolume'] - df_stock['bidVolume']) / (df_stock['askVolume'] + df_stock['bidVolume'])

    # Lagged features (e.g., previous hour's values)
    for lag in range(1, 50):  # Add lags for the last 3 hours
        df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].shift(lag)
        df_stock[f'bidMedian_lag_{lag}'] = df_stock['bidMedian'].shift(lag)
        df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)

    # Target variable: Direction of price movement (1 if bidMedian increases next hour, 0 otherwise)
    df_stock['target'] = (df_stock['bidMedian'].shift(-1) > df_stock['bidMedian']).astype(int)

    # Drop rows with missing values (due to lags and rolling features)
    df_stock = df_stock.dropna()

    # Store processed dataframe
    stock_dfs[symbol] = df_stock

# Example: View processed data for one stock
print(stock_dfs['STOCK1'].head())

# Save processed data to CSV (optional)
for symbol, df_stock in stock_dfs.items():
    df_stock.to_csv(f"{symbol}_processed.csv", index=False)

  df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)
  df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].shift(lag)
  df_stock[f'bidMedian_lag_{lag}'] = df_stock['bidMedian'].shift(lag)
  df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)
  df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].shift(lag)
  df_stock[f'bidMedian_lag_{lag}'] = df_stock['bidMedian'].shift(lag)
  df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)
  df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].shift(lag)
  df_stock[f'bidMedian_lag_{lag}'] = df_stock['bidMedian'].shift(lag)
  df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)
  df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].shift(lag)
  df_stock[f'bidMedian_lag_{lag}'] = df_stock['bidMedian'].shift(lag)
  df_stock[f'spreadMedian_lag_{lag}'] = df_stock['spreadMedian'].shift(lag)
  df_stock[f'askMedian_lag_{lag}'] = df_stock['askMedian'].s

                gmtTime  askMedian  bidMedian  askVolume  bidVolume  \
542 2015-04-30 07:00:00      73.19      73.12     197.10     228.58   
551 2015-04-30 08:00:00      73.38      73.34     370.73     341.57   
564 2015-04-30 09:00:00      73.36      73.32     371.82     268.50   
567 2015-04-30 10:00:00      73.60      73.57     322.02     367.43   
575 2015-04-30 11:00:00      73.61      73.57     255.77     251.45   

     spreadMedian  symbol  hour  day_of_week  askMedian_rolling_mean_3h  ...  \
542          0.07  STOCK1     7            3                  73.693333  ...   
551          0.04  STOCK1     8            3                  73.380000  ...   
564          0.04  STOCK1     9            3                  73.310000  ...   
567          0.03  STOCK1    10            3                  73.446667  ...   
575          0.03  STOCK1    11            3                  73.523333  ...   

     askMedian_lag_47  bidMedian_lag_47  spreadMedian_lag_47  \
542             74.51       