<div class="list-group" id="list-tab" role="tablist">
  <h3 class="list-group-item list-group-item-action active" data-toggle="list"  role="tab" aria-controls="home"  >Table of Contents</h3>
  <a class="list-group-item list-group-item-action" data-toggle="list" href="#data" role="tab" aria-controls="profile">DATA LOADING AND PREPROCESSING<span class="badge badge-primary badge-pill "></span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#eda" role="tab" aria-controls="messages">TIME SERIES ANALYSIS AND EDA<span class="badge badge-primary badge-pill"></span></a>
    <a class="list-group-item list-group-item-action"  data-toggle="list" href="#model" role="tab" aria-controls="settings">MODELLING<span class="badge badge-primary badge-pill"></span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#sub" role="tab" aria-controls="settings">SUB TO SERVER<span class="badge badge-primary badge-pill"></span></a>
</div>


# <div id='data' style="color:white;   font-weight:bold; font-size:120%; text-align:center;padding:12.0px; background:black"> DATA LOADING AND PREPROCESSING</div>

<a href="#list-tab" class="btn btn-success btn-lg active" role="button" aria-pressed="true" style="color:Blue; font-size:140%; background:lightgrey;  font-weight:bold; " data-toggle="popover" title="go to Colors">GO BACK</a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv, pd.read_parquet )
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter

import os, gc
from tqdm.auto import tqdm
import pickle # module to serialize and deserialize objects
import re # for Regular expression operations 

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data  import Dataset, DataLoader
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import kaggle_evaluation.jane_street_inference_server

In [None]:
# Shared grid-line colour referenced by several plotting cells in this notebook.
gridColor = 'lightgrey'

In [None]:
# Root folder of the competition data; other cells build file paths from `path`.
path = "/kaggle/input/jane-street-real-time-market-data-forecasting"

# Work on a single partition (partition_id=9) and keep only rows with no NaN in any column.
sample_df = pd.read_parquet(f"{path}/train.parquet/partition_id=9/part-0.parquet")
sample_df = sample_df.dropna()
# The previous version also called `sample_df.round(1)` here and threw the result
# away (neither assigned nor displayed), so it had no effect and has been removed.

print(sample_df.describe())

print(sample_df.head())

In [None]:
# Cut `sample_df` into ten consecutive row slices (each about a tenth of the
# rows), stored in a dict keyed 0..9.
n_rows = len(sample_df)
sample_dfs = {
    part: sample_df.iloc[int(part * n_rows * .1):int((part + 1) * n_rows * .1)]
    for part in range(10)
}

In [None]:
# Preview the first rows of the first slice.
sample_dfs[0].head()

In [None]:
# Number of rows in the first slice.
print(len(sample_dfs[0]))

In [None]:
# Total number of rows over all slices held in `sample_dfs`.
sum_length = sum(map(len, sample_dfs.values()))
print(sum_length)


In [None]:
# Replace each of the ten slices with a copy that has rows containing NaN removed.
sample_dfs.update({part: sample_dfs[part].dropna() for part in range(10)})

In [None]:
# Summary statistics of the first slice.
sample_dfs[0].describe()

In [None]:
# 250-row window (rows 500..749) of the first slice; `sample` is reused by the
# Spearman heatmap cell further down.
sample = sample_dfs[0].iloc[500:750]

# Create the figure *before* drawing so that figsize applies to the box plot.
# Previously plt.figure(figsize=...) was called after plotting, which only
# produced an additional empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
sample['responder_6'].plot.box(ax=ax)
ax.set_title('Box and Whiskers')
ax.set_ylabel('Responder 6')
plt.show()

# <div id='eda'  style="color:white;   font-weight:bold; font-size:120%; text-align:center;padding:12.0px; background:black"> TIME SERIES ANALYSIS AND EDA</div>

<a href="#list-tab" class="btn btn-success btn-lg active" role="button" aria-pressed="true" style="color:Blue; font-size:140%; background:lightgrey;  font-weight:bold; " data-toggle="popover" title="go to Colors">GO BACK</a>

Let us take a look at the target values over time (for  `symbol_id`=1)

In [None]:
def train(i):
    """Plot responder_6 against time_id for symbol_id 1 in slice ``i`` of ``sample_dfs``.

    Despite its name this function does not train anything; it only draws a figure.
    """
    chunk = sample_dfs[i]

    # Keep only the rows belonging to symbol_id == 1
    symbol_rows = chunk[chunk.symbol_id == 1]
    times = symbol_rows['time_id']
    returns = symbol_rows['responder_6']

    # Draw the series with a red zero reference line
    plt.figure(figsize=(16, 5))
    plt.plot(times, returns, color='black', linewidth=0.05)
    plt.axhline(0, color='red', linestyle='-', linewidth=1.2)
    plt.grid(color='lightgray', linewidth=0.8)
    plt.suptitle('Returns, responder_6', weight='bold', fontsize=16)
    plt.xlabel("Time", fontsize=12)
    plt.ylabel("Returns", fontsize=12)
    plt.show()

train(0)

Let us take a look at the response values over time in each of the ten slices of the sample

In [None]:
def train(i):
    """Return the (time_id, responder_6) Series for symbol_id 1 in slice ``i`` of ``sample_dfs``.

    Note: this redefines the plotting helper of the same name from the previous cell.
    """
    chunk = sample_dfs[i]

    # Rows belonging to symbol_id == 1
    is_symbol_one = chunk.symbol_id == 1
    return chunk.loc[is_symbol_one, 'time_id'], chunk.loc[is_symbol_one, 'responder_6']

# One panel per slice of `sample_dfs` (i = 0..9), laid out on a 5 x 2 grid.
fig, axes = plt.subplots(5, 2, figsize=(20, 25))

for i, ax in zip(range(10), axes.ravel()):
    times, returns = train(i)
    ax.plot(times, returns, color='black', linewidth=0.05)
    ax.axhline(0, color='red', linestyle='-', linewidth=1.2)
    ax.grid(color='lightgray', linewidth=0.8)
    ax.set_title(f'Returns, responder_6 (i={i})', fontsize=14, weight='bold')
    ax.set_xlabel("Time", fontsize=12)
    ax.set_ylabel("Returns", fontsize=12)

plt.tight_layout()  # avoid overlapping titles and labels between panels
plt.show()


Now let's compare this responder (6) with other responders

In [None]:
# Cumulative sum of the per-date mean of every responder column.
# No symbol filter is applied here: all rows of `sample_df` are pooled.
plt.figure(figsize=(18, 7))
predictor_cols = [name for name in sample_df.columns if 'responder' in name]
by_date = sample_df.groupby(['date_id'])
for name in predictor_cols:
    # responder_6 is highlighted; the others use the default colour cycle
    if name == 'responder_6':
        line_style = {'linewidth': 2.5, 'color': 'red'}
    else:
        line_style = {'linewidth': 1}
    plt.plot(by_date[name].mean().cumsum(), **line_style)

plt.axhline(0, color='blue', linestyle='-', linewidth=1)
plt.grid(visible=True, color = gridColor, linewidth = 0.7)
plt.title('Response time series over trade days  \n Responder 6 (red) and other responders', weight='bold')
plt.xlabel('Trade days')
plt.ylabel('Cumulative response')
plt.legend(predictor_cols)
sns.despine()

- We can see that `resp6` (red) most closely follows `resp0` and `resp3`

Let's build a correlation matrix and see it numerically.

In [None]:
# Correlation between the rows of responders.csv, computed from the tag_0..tag_4 columns.
responders = pd.read_csv(f"{path}/responders.csv")
tag_columns = [f"tag_{no}" for no in range(5)]
matrix = responders[tag_columns].T.corr()

plt.figure(figsize=(6, 6))
sns.heatmap(
    matrix,
    square=True,
    cmap="coolwarm",
    alpha=0.9,
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
    linecolor='white',
    annot=True,
    fmt='.2f',
)
plt.xlabel("Responder_0 - Responder_8")
plt.ylabel("Responder_0 - Responder_8")
plt.show()

In [None]:
# Seaborn heatmap of the Spearman rank correlation between the responder
# columns of `sample` (the row window defined in the box-plot cell).
responder_sample = [name for name in sample.columns if name.startswith('responder')]
responder_df = sample[responder_sample]

# Rounded to one decimal so the annotations stay readable
responder_correlation = responder_df.corr(method='spearman').round(1)

sns.heatmap(
    responder_correlation,
    square=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    linewidth=0.5,
    annot=True,
)

plt.show()

Let us take a look at the returns, the cumulative returns, and the distribution of returns for all responders

In [None]:
# For one symbol in the first slice, draw three panels per responder:
# cumulative sum, raw series, and histogram. responder_6 is drawn in red.
df_train = sample_df
s_id = 0                        # Change params to take a look at other symbols
res_columns = [col for col in df_train.columns if re.match("responder_", col)]
row = len(res_columns)          # one grid row per responder instead of a hard-coded 9
b = 1000                        # histogram bins

# Build the mask from the slice that is being filtered. The previous version
# indexed `sample_dfs[0]` with a mask computed on the full `sample_df`, which
# only worked through pandas' implicit reindexing of the boolean Series.
first_slice = sample_dfs[0]
symbol_rows = first_slice[first_slice.symbol_id == s_id]
xx = symbol_rows['time_id']

# Create the whole grid in one call; a bare plt.subplots(figsize=...) followed
# by plt.subplot(...) leaves an extra full-figure axes behind the panels.
fig, axs = plt.subplots(row, 3, figsize=(18, 4 * row), squeeze=False)
for (ax1, ax2, ax3), responder in zip(axs, res_columns):
    yy = symbol_rows[responder]
    c = 'red' if responder == 'responder_6' else 'black'

    # Left: cumulative sum over the slice
    ax1.plot(xx, yy.cumsum(), color=c, linewidth=0.8)
    ax1.axhline(0, color='blue', linestyle='-', linewidth=0.9)
    ax1.grid(color=gridColor)

    # Middle: raw values
    ax2.plot(xx, yy, color=c, linewidth=0.05)
    ax2.axhline(0, color='blue', linestyle='-', linewidth=1.2)
    ax2.set_title(responder, fontsize=14)
    ax2.grid(color=gridColor)

    # Right: density histogram (outline first, grey fill drawn on top)
    ax3.hist(yy, bins=b, color=c, density=True, histtype="step")
    ax3.hist(yy, bins=b, color='lightgrey', density=True)
    ax3.grid(color=gridColor)
    ax3.set_ylim([0, 3.5])
    ax3.set_xlim([-2.5, 2.5])

fig.patch.set_linewidth(3)
fig.patch.set_edgecolor('#000000')
fig.patch.set_facecolor('#eeeeee')
plt.show()

We can see that responders have different behavior and distributions.

Let us now study the behavior of `responder 6`  for different `symbol_id`

In [None]:
# For symbol_id 0..9, draw three panels of responder_6: cumulative sum,
# raw series, and histogram.
res_columns = [col for col in df_train.columns if re.match("responder_", col)]
row = 10                        # number of symbols shown (symbol_id 0..row-1)
b = 300                         # histogram bins
c = 'black'

# Create the whole grid in one call; a bare plt.subplots(figsize=...) followed
# by plt.subplot(...) leaves an extra full-figure axes behind the panels.
fig, axs = plt.subplots(row, 3, figsize=(18, 5 * row), squeeze=False)
for j, (ax1, ax2, ax3) in enumerate(axs):
    # Filter once per symbol instead of once per column
    symbol_rows = sample_df[sample_df.symbol_id == j]
    xx = symbol_rows['time_id']
    yy = symbol_rows['responder_6']

    # Left: cumulative sum
    ax1.plot(xx, yy.cumsum(), color=c, linewidth=0.8)
    ax1.axhline(0, color='red', linestyle='-', linewidth=0.7)
    ax1.grid(color=gridColor)
    ax1.set_xlabel('Time')

    # Middle: raw values
    ax2.plot(xx, yy, color=c, linewidth=0.05)
    ax2.axhline(0, color='red', linestyle='-', linewidth=0.7)
    ax2.set_title(f"symbol_id={j}", fontsize=14)
    ax2.grid(color=gridColor)
    ax2.set_xlabel('Time')

    # Right: density histogram (outline first, grey fill drawn on top).
    # The x axis shows responder values, not time, so it is labelled accordingly.
    ax3.hist(yy, bins=b, color=c, density=True, histtype="step")
    ax3.hist(yy, bins=b, color='lightgrey', density=True)
    ax3.grid(color=gridColor)
    ax3.set_xlim([-2.5, 2.5])
    ax3.set_ylim([0, 1.5])
    ax3.set_xlabel('responder_6')

fig.patch.set_linewidth(3)
fig.patch.set_edgecolor('#000000')
fig.patch.set_facecolor('#eeeeee')
plt.show()

- We see that the behavior and distribution of `responder_6` differ across `symbol_id` values

Now let's study the data in more detail and then continue diving into time series analysis


## Files and variables overview

### Features.csv
features.csv - metadata pertaining to the anonymized features

#### Features have many missing values.

In [None]:
df_train = sample_df

# `sample_df` had every row containing a NaN removed when it was loaded, so
# counting NaNs on it always yields zero for every column. Count the missing
# values on the raw partition instead, then release the raw frame again.
raw_partition = pd.read_parquet(f"{path}/train.parquet/partition_id=9/part-0.parquet")
missing_counts = raw_partition.isna().sum()   # computed once, reused for x and height
n_raw_rows = len(raw_partition)
del raw_partition

plt.figure(figsize=(20, 3))    # Plot missing values
plt.bar(x=missing_counts.index, height=missing_counts.values, color="red", label='missing')   # analog: using missingno
plt.xticks(rotation=90)
plt.title(f'Missing values over the {n_raw_rows} samples which have a target')
plt.grid()
plt.legend()
plt.show()

- Some columns are not very useful in our sample (either Null or show the partition number).

#### Structure of features:

In [None]:
# Load the feature metadata table and display it as the cell output.
features = pd.read_csv(f"{path}/features.csv")
features

#### Tags visualizing:

In [None]:
# Show the tag table as an image: the first column of `features` is skipped and
# the remainder is transposed, so rows of the image are tags and columns are features.
tag_matrix = features.iloc[:, 1:].T.values

plt.figure(figsize=(18, 6))
plt.imshow(tag_matrix, cmap="gray_r")
plt.xticks(np.arange(79))
plt.yticks(np.arange(17))
plt.xlabel("feature_00 - feature_78")
plt.ylabel("tag_0 - tag_16")
plt.grid(color = 'lightgrey')
plt.show()

#### Correlation matrix between feature_XX and feature_YY

In [None]:
# Correlation between the rows of features.csv, computed from the tag_0..tag_16 columns.
tag_columns = [f"tag_{no}" for no in range(17)]
matrix = features[tag_columns].T.corr()

plt.figure(figsize=(11, 11))
sns.heatmap(
    matrix,
    square=True,
    cmap="coolwarm",
    alpha=0.9,
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=0.5,
    linecolor='white',
)
plt.show()

### Responders.csv
responders.csv - metadata pertaining to the anonymized responders
#### Structure of responders:

In [None]:
# Load the responder metadata table and display it as the cell output.
# (The same file is also read in the responder correlation cell earlier on.)
responders = pd.read_csv(f"{path}/responders.csv")
responders

### Weights
#### Basic stats:

In [None]:
# Summary statistics of the `weight` column.
sample_df['weight'].describe()

> 

In [None]:
# Density histogram of the sample weights with a red vertical marker.
fig, ax = plt.subplots(figsize=(8, 3))
ax.hist(sample_df['weight'], bins=30, color='grey', edgecolor='white', density=True)
# NOTE(review): 2.37399 is a hard-coded marker position; presumably a statistic
# of `weight` taken from an earlier run - confirm and compute it from the data.
ax.axvline(2.37399, color='red', linestyle='-', linewidth=0.7)
ax.grid(color='lightgrey', linewidth=0.5)
ax.set_title('Distribution of weights')
plt.show()

### Sample submission.csv
sample_submission.csv - This file illustrates the format of the predictions your model should make.

In [None]:
# Load the sample submission, report its shape, and preview the first ten rows.
sub = pd.read_csv(f"{path}/sample_submission.csv")
print( f"shape = {sub.shape}" )
sub.head(10)

### Train.parquet

- **train.parquet** - The training set, contains historical data and returns. For convenience, the training set has been partitioned into ten parts.
  - `date_id` and `time_id` - Integer values that are ordinally sorted, providing a chronological structure to the data, although the actual time intervals between `time_id` values may vary.
  - `symbol_id` - Identifies a unique financial instrument.
  - `weight` - The weighting used for calculating the scoring function.
  - `feature_{00...78}` - Anonymized market data.
  - `responder_{0...8}` - Anonymized responders clipped between -5 and 5. The `responder_6` field is what you are trying to predict.
  
  
Each row in the `{train/test}.parquet` dataset corresponds to a unique combination of a symbol (identified by `symbol_id`) and a timestamp (represented by `date_id` and `time_id`). You will be provided with multiple responders, with `responder_6` being the only responder used for scoring. The date_id column is an integer which represents the day of the event, while `time_id` represents a time ordering. It's important to note that the real time differences between each time_id are not guaranteed to be consistent.

- The `symbol_id` column contains encrypted identifiers. Each `symbol_id` is not guaranteed to appear in all `time_id` and `date_id` combinations.
- Additionally, new `symbol_id` values **may appear in future** test sets.

## Responders: analysis, statistics and distributions

In [None]:
# Names responder_0 .. responder_8, then their summary statistics rounded to one decimal.
col = [f"responder_{idx}" for idx in range(9)]

sample_df[col].describe().round(1)

#### Interesting fact:
- The values of all responders are strictly within the range of `[-5, 5]`

#### Responders-Responder distributions
Let's dive deeper into the relationships between responders and plot joint distributions of `responder_6` against the other responders

In [None]:
# Hexbin density of responder_6 (y) against every other responder (x).
numerical_features = sample_df.filter(regex='^responder_').columns.tolist()  # Separate responders
numerical_features.remove('responder_6')

gs = 600                                              # hexbin grid size
col = 3                                               # panels per row
row = int(np.ceil(len(numerical_features) / col))     # rows needed for all panels
fig, axs = plt.subplots(row, col, figsize=(5 * col, 5 * row), squeeze=False)
panels = axs.ravel()

# Draw on the axes returned by plt.subplots. The previous version re-selected
# them with plt.subplot(col, row, k), i.e. with rows and columns swapped, which
# only worked because the grid happened to be square.
for ax, name in zip(panels, numerical_features):
    ax.hexbin(sample_df[name], sample_df['responder_6'], gridsize=gs, cmap='CMRmap', bins='log', alpha=0.2)
    ax.set_xlabel(f'{name}', fontsize=12)
    ax.set_ylabel('responder_6', fontsize=12)
    ax.tick_params(axis='x', labelsize=6)
    ax.tick_params(axis='y', labelsize=6)

# Hide grid cells that have no responder to show
for ax in panels[len(numerical_features):]:
    ax.set_visible(False)

fig.patch.set_linewidth(3)
fig.patch.set_edgecolor('#000000')
fig.patch.set_facecolor('#eeeeee')

plt.show()

#### Responder6-Features distributions
Now let's plot joint distributions of `responder_6` against some features

In [None]:
# Hexbin density of responder_6 (y) against a hand-picked set of features (x).
numerical_features = [
    f'feature_{i}'
    for i in ['05', '06', '07', '08', '12', '15', '19', '32', '38', '39', '50', '51', '65', '66', '67']
]

gs = 600                                              # hexbin grid size
col = 3                                               # panels per row
row = int(np.ceil(len(numerical_features) / col))     # rows needed for all panels
sz = 5                                                # side length of one panel
w = sz * col
h = w / col * row

# A single plt.subplots call creates the figure and the full grid. The previous
# version called plt.figure and then plt.subplots(figsize=...), which produced
# an extra empty figure plus a stray full-figure axes behind the panels.
fig, axs = plt.subplots(row, col, figsize=(w, h), squeeze=False)
panels = axs.ravel()

for ax, name in zip(panels, numerical_features):
    # Feature on x and responder_6 on y, so the data matches the axis labels
    # (previously the two series were passed in the opposite order).
    ax.hexbin(sample_df[name], sample_df['responder_6'], gridsize=gs, cmap='CMRmap', bins='log', alpha=0.3)
    ax.set_xlabel(f'{name}')
    ax.set_ylabel('responder_6')
    ax.tick_params(axis='x', labelsize=6)
    ax.tick_params(axis='y', labelsize=6)

# Hide grid cells that have no feature to show
for ax in panels[len(numerical_features):]:
    ax.set_visible(False)

fig.patch.set_linewidth(3)
fig.patch.set_edgecolor('#000000')
fig.patch.set_facecolor('#eeeeee')
plt.show()

In [None]:
# Hexbin density for every unordered pair of the selected features,
# drawn three panels per figure.
numerical_features = (
    [f'feature_0{i}' for i in range(5, 9)]
    + [f'feature_{i}' for i in range(15, 20)]
)

n = 3   # panels per figure

# All pairs (first, second) with `first` appearing before `second` in the list,
# in the same order as the original nested loops produced them.
feature_pairs = [
    (first, second)
    for pos, first in enumerate(numerical_features)
    for second in numerical_features[pos + 1:]
]

# One figure per chunk of `n` pairs. Creating each figure right before it is
# filled avoids the stray full-figure axes on the first figure and the empty
# trailing figure the previous version opened after the last full row.
for start in range(0, len(feature_pairs), n):
    chunk = feature_pairs[start:start + n]
    fig, axs = plt.subplots(1, n, figsize=(15, 4), squeeze=False)
    for ax, (x_name, y_name) in zip(axs[0], chunk):
        ax.hexbin(sample_df[x_name], sample_df[y_name], gridsize=200, cmap='CMRmap', bins='log', alpha=1)
        ax.grid()
        ax.set_xlabel(f'{x_name}', fontsize=14)
        ax.set_ylabel(f'{y_name}', fontsize=14)
        ax.tick_params(axis='x', labelsize=6)
        ax.tick_params(axis='y', labelsize=6)
    # Hide unused panels when the last chunk is shorter than `n`
    for ax in axs[0][len(chunk):]:
        ax.set_visible(False)
    plt.show()
    plt.close(fig)   # release the figure; many are created in this loop

- There are many nonlinear and non-trivial distributions

# <div id='sub'  style="color:white;   font-weight:bold; font-size:120%; text-align:center;padding:12.0px; background:black">SUB TO SERVER</div>

<a href="#list-tab" class="btn btn-success btn-lg active" role="button" aria-pressed="true" style="color:Blue; font-size:140%; background:lightgrey;  font-weight:bold; " data-toggle="popover" title="go to Colors">GO BACK</a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import polars as pl
import kaggle_evaluation
import kaggle_evaluation.jane_street_inference_server as inference_server_module

# Define global variables
# Most recent non-None lags frame passed to predict(); it is stored there but
# not read anywhere else in this cell.
lags_: pd.DataFrame | None = None
# Keras model created by initialize_lstm(); stays None until that has run.
lstm_model = None
# MinMaxScaler fitted by train_lstm_model(); stays None until that has run.
scaler = None

# Initialize the LSTM model
def initialize_lstm(input_shape):
    """Build and compile the module-level ``lstm_model`` if it does not exist yet.

    Args:
        input_shape: Shape of one sample, forwarded to the first LSTM layer.

    Calling this again once ``lstm_model`` is set does nothing.
    """
    global lstm_model
    if lstm_model is not None:
        return

    keras_layers = tf.keras.layers
    # Two stacked LSTM layers (32 then 16 units) followed by a single-unit output
    stack = [
        keras_layers.LSTM(32, return_sequences=True, input_shape=input_shape),
        keras_layers.LSTM(16, return_sequences=False),
        keras_layers.Dense(1),
    ]
    network = tf.keras.Sequential(stack)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    lstm_model = network

def train_lstm_model(df, epochs=1, batch_size=32):
    """Fit the module-level scaler and LSTM model on ``df``.

    Args:
        df: Frame containing ``feature_*`` columns and a ``responder_6`` column.
        epochs: Number of training epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.

    Side effects: assigns the global ``scaler`` and, through
    ``initialize_lstm``, the global ``lstm_model``.
    """
    global scaler, lstm_model

    # Only rows without any NaN are used
    complete_rows = df.dropna()

    # Inputs are the columns whose name contains "feature_"; target is responder_6
    feature_frame = complete_rows.filter(like="feature_", axis=1)
    target = complete_rows['responder_6']

    # Scale every feature with a MinMaxScaler fitted on all complete rows
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(feature_frame)

    # First split: 20% is held out; the held-out part is not used afterwards
    X_fit, _X_holdout, y_fit, _y_holdout = train_test_split(
        scaled, target, test_size=0.2, random_state=42
    )

    # Second split: 20% of the remainder becomes the validation set
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_fit, y_fit, test_size=0.2, random_state=42
    )

    # Add a trailing axis of length 1 so each sample is 2-D, as the LSTM layer expects
    X_fit = X_fit[..., np.newaxis]
    X_val = X_val[..., np.newaxis]

    initialize_lstm(input_shape=X_fit.shape[1:])

    lstm_model.fit(
        X_fit,
        y_fit,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pd.DataFrame:
    """Return one ``responder_6`` prediction per row of ``test``.

    Args:
        test: Batch to predict; must contain ``row_id`` and the ``feature_*``
            columns. Polars input is converted to pandas.
        lags: Optional lags frame; when given it is stored in the global
            ``lags_`` (it is not used for the prediction itself).

    Returns:
        pandas DataFrame with the columns ``row_id`` and ``responder_6`` and
        exactly ``len(test)`` rows.

    If the model is missing, or feature preparation or inference fails, the
    error is printed and a constant prediction of 0.0 is returned. The previous
    version returned NaN in those cases, which is not a usable prediction
    value; 0.0 is the neutral fallback. Non-finite model outputs are replaced
    with 0.0 for the same reason.
    """
    global lags_, lstm_model, scaler

    # Convert test and lag data to Pandas
    if isinstance(test, pl.DataFrame):
        test = test.to_pandas()
    if lags is not None and isinstance(lags, pl.DataFrame):
        lags = lags.to_pandas()

    # Update lags for subsequent batches
    if lags is not None:
        lags_ = lags

    def _fallback() -> pd.DataFrame:
        """Constant-zero prediction used whenever the model cannot be applied."""
        return pd.DataFrame({'row_id': test['row_id'], 'responder_6': np.zeros(len(test))})

    # Ensure the model is trained
    if lstm_model is None:
        print("Error: LSTM model is not initialized or trained.")
        return _fallback()

    # Prepare test features
    try:
        test_features = test.filter(like="feature_", axis=1)  # Keep only feature columns
        if scaler is not None:
            test_features = scaler.transform(test_features)
        test_features = np.expand_dims(test_features, axis=-1)  # Reshape for LSTM
        # Training dropped NaN rows, so the model never saw NaN; feed zeros instead
        test_features = np.nan_to_num(test_features)
    except Exception as e:
        print("Error during test feature preparation:", e)
        return _fallback()

    # Make predictions. verbose=0 suppresses the per-batch progress bar, and the
    # earlier debug dumps of the full input/output arrays were removed because
    # this function is called once per batch.
    try:
        predictions = lstm_model.predict(test_features, verbose=0)
    except Exception as e:
        print("Error during prediction:", e)
        return _fallback()

    # Format predictions for Kaggle submission
    result = pd.DataFrame({
        'row_id': test['row_id'],
        'responder_6': np.nan_to_num(np.asarray(predictions).flatten())
    })

    return result



# Main function for Kaggle submission
def main():
    """Train the LSTM on partition 9, then start the Kaggle inference server around ``predict``."""
    # Fit the scaler and model on a single training partition
    train_data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=9/part-0.parquet'
    training_frame = pd.read_parquet(train_data_path)
    train_lstm_model(training_frame, epochs=3, batch_size=32)

    # Wrap `predict` in the competition's inference server
    inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

    # Without the rerun environment variable, run against the local test files
    if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        local_files = [
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet'
        ]
        inference_server.run_local_gateway(local_files)
        return

    inference_server.serve()

# Entry point: only runs when this code is executed as the main module
if __name__ == "__main__":
    main()
