In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Root of the partitioned training set; each partition is read from its own
# `partition_id=<n>/part-0.parquet` file underneath this directory.
base_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Maps partition id (0-9) -> DataFrame read from that partition's parquet file.
train_df = {}
for partition_id in range(10):
    train_df[partition_id] = pd.read_parquet(
        f'{base_path}/partition_id={partition_id}/part-0.parquet'
    )

In [None]:
# Replace every partition with a copy from which all rows containing a missing value are removed.
# NOTE(review): dropna() without `subset` drops a row if ANY column is missing —
# confirm this does not discard far more rows than intended.
train_df.update(
    (partition_id, train_df[partition_id].dropna())
    for partition_id in range(10)
)

In [None]:
# The full partitions are too large to run statistical analysis on in this kernel,
# so keep only a fixed-size slice (the last `sample_size` rows) of each one.
# NOTE(review): the original note says each partition has about 5.7 million rows, which would
# make 50,000 rows roughly 1% — confirm against the real partition sizes after dropna().
sample_size = 50_000
sample_train_df = {}  # partition id -> last `sample_size` rows of that partition

for partition_id in range(10):
    partition = train_df[partition_id]
    if not isinstance(partition, pd.DataFrame):
        # Report the unexpected entry and leave it out of the sample dict.
        print(f"train_df[{partition_id}] is not a DataFrame!")
        continue
    sample_train_df[partition_id] = partition.tail(sample_size)

# Sanity-check the sample taken from partition 9.
print(sample_train_df[9].describe())


In [None]:
# Summary statistics of responder_6 over all of partition 9, to get a feel for what it might be.
train_df[9].loc[:, 'responder_6'].describe()

In [None]:
import matplotlib.pyplot as plt

# Line plot of responder_6 against time_id for partition 9.
plt.figure(figsize=(10, 6))
plt.plot(train_df[9]['time_id'], train_df[9]['responder_6'])
plt.title('responder 6 over time')
plt.xlabel('time_id')
plt.ylabel('responder 6')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()


In [None]:
# The time_id plot was not readable, so plot responder_6 against date_id instead (partition 9).

plt.figure(figsize=(10, 6))
plt.plot(train_df[9]['date_id'], train_df[9]['responder_6'])
plt.title('responder 6 over dates')
plt.xlabel('date_id')
plt.ylabel('responder_6')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()


In [None]:
# First real breakthrough: plotted by date_id, responder_6 appears to cluster back towards a
# mean of 0, which suggests a volatility-like quantity, so test for stationarity.
# The Augmented Dickey-Fuller (ADF) test has the null hypothesis H0: the series is
# non-stationary (has a unit root). An ADF statistic below the critical value rejects H0.
# Observed on this sample (per the original note): a strongly negative ADF statistic and a
# p-value of almost 0. The p-value is the probability of a statistic at least this extreme
# if H0 were true; it is not the probability that H0 itself is true.
from statsmodels.tsa.stattools import adfuller

result = adfuller(sample_train_df[9]['responder_6'].dropna())
adf_statistic, adf_p_value = result[0], result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", adf_p_value)

# A p-value below 0.05 is treated as grounds to reject H0 and call the series stationary.
verdict = (
    "Responder_6 is stationary (volatility-like behavior)."
    if adf_p_value < 0.05
    else "Responder_6 is non-stationary (unlikely volatility)."
)
print(verdict)

In [None]:
# Two candidates for what responder_6 represents: volatility or log-returns.
# A roughly normal-looking histogram would favour log-returns over volatility.
# The negative spikes in the earlier plot already argue against volatility, which cannot be negative.

import seaborn as sns

hist_ax = sns.histplot(sample_train_df[9]['responder_6'], kde=True)
hist_ax.set_title("Distribution of Responder_6")
plt.show()


In [None]:
# Distribution of responder_6 in the sample taken from partition 5.
# TODO: use .loc to find where the -5 occurrence takes place and inspect roughly
# 100 responder_6 values around it.

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.histplot(sample_train_df[5]['responder_6'], kde=True)
plt.title('Sample histogram plot of responder_6')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()


In [None]:
# Second sample, taken from the head of each partition, to check whether the distribution
# seen in the tail sample really shows up everywhere.

sample_train_df2 = {}

# The loop variable must be the one used in the body. Previously the loop iterated over
# `parameter_id` while indexing with a stale `partition_id` left over from an earlier cell,
# so the same single key was overwritten ten times and the other partitions were never sampled.
for partition_id in range(10):
    sample_train_df2[partition_id] = train_df[partition_id].head(sample_size)

sample_train_df2[9].describe()

In [None]:
# Histogram of responder_6 for the head sample of partition 9.

sns.histplot(sample_train_df2[9]['responder_6'], kde=True)
plt.title('responder_6 distribution when from second sample')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

In [None]:
# Sampling from the head and from the tail appeared to give the same distribution, which
# raised the question of whether something was being done wrong. As a further check, take a
# third slice from the middle of each partition: positions 300,000 up to (not including)
# 350,000. This is a fixed window, not a random draw, despite the variable name.
# NOTE(review): the original note calls this 1% of the total frame — confirm against partition sizes.
random_sample_train_df = {
    pid: train_df[pid].iloc[300000:350000]
    for pid in range(10)
}

random_sample_train_df[9].describe()

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of responder_6 for the mid-frame slice of partition 9.
sns.histplot(random_sample_train_df[9]['responder_6'], kde=True)
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

In [None]:
# Summary statistics of responder_6 in the mid-frame slice of partition 9.
random_sample_train_df[9].loc[:, 'responder_6'].describe()

In [None]:
# The distribution looked the same again, so take a very small window (100 rows by position)
# from each partition to rule out user error.

final_sample_train_df = {}

for parameter_id in range(10):
    final_sample_train_df[parameter_id] = train_df[parameter_id].iloc[750000:750100]

# Only the last expression of a cell is rendered, so the full-frame summary has to be
# printed explicitly; otherwise it is computed and silently discarded.
print(final_sample_train_df[9].describe())
final_sample_train_df[9]['responder_6'].describe()

In [None]:
# Histogram of responder_6 for the 100-row window of partition 9.
sns.histplot(final_sample_train_df[9]['responder_6'], kde=True)
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

In [None]:
#ok so yay, this is real data!
#lets create a function to generate a sample dataframe
import numpy as np

def current_sample_df(train_df, sample_size):
    """Draw one contiguous window of ``sample_size`` rows from each of the ten partitions.

    The window start is chosen with ``np.random.randint``, so results vary between
    calls unless the global NumPy random state is seeded by the caller.

    Args:
        train_df: Container indexable by the partition ids 0-9 whose values
            support ``len()`` and positional ``.iloc`` slicing.
        sample_size: Number of consecutive rows to take from each partition.

    Returns:
        dict: partition id -> sampled slice, or ``None`` for a partition that has
        fewer than ``sample_size`` rows (a warning is printed for each of those).
    """
    samples = {}

    for parameter_id in range(10):
        partition = train_df[parameter_id]
        total_rows = len(partition)

        if total_rows < sample_size:
            print(f"Warning: Partition {parameter_id} has only {total_rows} rows, less than {sample_size}. Skipping.")
            samples[parameter_id] = None  # Mark as None so callers can tell it was skipped
            continue

        # randint's upper bound is exclusive, so `+ 1` is needed to allow the last valid
        # start position. Without it, a partition with exactly `sample_size` rows
        # raised ValueError (empty range) instead of returning the whole partition.
        start_idx = np.random.randint(0, total_rows - sample_size + 1)
        samples[parameter_id] = partition.iloc[start_idx:start_idx + sample_size]

    return samples

# Smoke-test the sampler: print the dict of per-partition samples for a window of 30,000 rows.
print(current_sample_df(train_df, 30000))

In [None]:
#created a function to count how often a given value occurs in a dataframe column (I think it will help me get a grasp on outliers)
#There are 1525 values of -5 and 8539 values of +5 in train_df[9]; that is a ratio of about 5.6x, so our graph is heavily skewed to the right, indicating log-returns imo
def count_values(df, column_name, test_value):
    """Count how many entries of ``df[column_name]`` are equal to ``test_value``.

    Args:
        df: DataFrame holding the column to inspect.
        column_name: Name of the column whose values are compared.
        test_value: Value to look for; compared with ``==``, so missing values
            (NaN) never match.

    Returns:
        int: Number of matching entries (0 for an empty column).
    """
    # One vectorised comparison instead of a Python-level loop over every row,
    # which is very slow on frames with millions of rows.
    return int((df[column_name] == test_value).sum())


# Count how often responder_6 takes the values -5 and +5 in partition 9.
for boundary_value in (-5, 5):
    print(count_values(train_df[9], 'responder_6', boundary_value))

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt

# Autocorrelation (up to lag 40) of the raw series and of its square, each in its own
# figure with the y-axis clipped to [-0.1, 0.1].
responder_6_sample = sample_train_df[9]['responder_6']
acf_inputs = (
    (responder_6_sample, "ACF of Responder_6"),
    (responder_6_sample**2, "ACF of Squared Responder_6"),
)

for acf_series, acf_title in acf_inputs:
    plot_acf(acf_series, lags=40)
    plt.ylim(-0.1, 0.1)
    plt.title(acf_title)
    plt.show()


In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Ljung-Box test for autocorrelation at lags 10, 20 and 30, run on the raw responder_6
# sample of partition 9 and on its square.
ljung_box_lags = [10, 20, 30]
responder_6_clean = sample_train_df[9]['responder_6'].dropna()

lb_raw = acorr_ljungbox(responder_6_clean, lags=ljung_box_lags, return_df=True)
lb_squared = acorr_ljungbox(responder_6_clean**2, lags=ljung_box_lags, return_df=True)

print("Ljung-Box Test for Raw Responder_6:")
print(lb_raw)
print("\nLjung-Box Test for Squared Responder_6:")
print(lb_squared)

In [None]:
# Examination of candidate distributions, starting with the Laplace distribution:
# draw synthetic Laplace data and overlay the theoretical density.

import numpy as np
import matplotlib.pyplot as plt

# Location parameter: median of the responder_6 sample from partition 9.
mu = np.median(sample_train_df[9]['responder_6'])
# Scale parameter: sqrt(s**2 / 2) with s = 0.752648.
# NOTE(review): 0.752648 is hard-coded — presumably a standard deviation copied from an
# earlier describe() output; confirm it still matches the data.
b = np.sqrt((0.752648**2)/2)

# As many synthetic draws as there are rows in the sample (unseeded, so they vary per run).
n_draws = len(sample_train_df[9]['responder_6'])
laplace_draws = np.random.laplace(mu, b, n_draws)

fig, ax = plt.subplots()

# Normalised histogram of the synthetic draws.
ax.hist(laplace_draws, bins=50, density=True, alpha=0.6, color='blue')

# Theoretical Laplace density evaluated on a fine grid over [-6, 6].
grid = np.linspace(-6, 6, 100000)
normaliser = 1 / (2 * b)
pdf = normaliser * np.exp(-np.abs(grid - mu) / b)
ax.plot(grid, pdf, 'r', linewidth=2, label="Laplace PDF")

ax.set_title("Laplace Distribution")
ax.set_xlabel("Value")
ax.set_ylabel("Density")
ax.legend()
plt.show()


In [None]:
from scipy.stats import ks_1samp, laplace

# Same location and scale as used for the Laplace simulation.
mu = np.median(sample_train_df[9]['responder_6'])
b = np.sqrt((0.752648**2) / 2)  # Scale parameter
x = sample_train_df[9]['responder_6']

# Frozen Laplace distribution; its CDF is the reference for the test.
reference_laplace = laplace(loc=mu, scale=b)

# One-sample Kolmogorov-Smirnov test of the sample against that CDF.
ks_result = ks_1samp(x, reference_laplace.cdf)
ks_stat, p_value = ks_result
print(f"KS Statistic: {ks_stat}, p-value: {p_value}")


In [None]:
# Examination of the weight column: the values appear to follow a pattern with a period of
# about 40 index units.
#
# Recorded observations: weight has a high of 5.162702, a low of 0.728737, a mean of 2.101985
# and a standard deviation of 1.001273.
#
# The mean is around 2 and the distribution is heavily skewed to the right, so the median
# should be below the mean; it is 2.03, which is indeed less than the mean.

# First 150 rows (by position) of each partition's tail sample.
df = {}

for parameter_id in range(10):
    df[parameter_id] = sample_train_df[parameter_id].iloc[0:150]

# Only the last expression of a cell is rendered, so print the preview explicitly;
# as a bare mid-cell expression it was computed and silently discarded.
print(df[9].head())

import matplotlib.pyplot as plt

plt.plot(df[9].index, df[9]['weight'])
plt.ylabel('weight')
plt.xlabel('time')
plt.title('The weight values over time')
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

In [None]:
# Plot the weights against the responder_6 values for the first 150 sampled rows of partition 9.
# NOTE(review): the original note ("it seems to me that the weight is not very meaningful as")
# breaks off mid-sentence — complete or remove the conclusion.
import matplotlib.pyplot as plt

# Plot both series against the same x values. Passing them as `plt.plot(x, weight, responder)`
# made matplotlib treat the third argument as a separate y-only series drawn against
# 0..N-1, so the two lines ended up in different x ranges.
plt.plot(df[9].index, df[9]['weight'], label='weight')
plt.plot(df[9].index, df[9]['responder_6'], label='responder_6')
plt.ylabel('weight')
plt.xlabel('time')
plt.title('The weight values over time')
plt.legend()
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

# No visible cell creates a `responder_weight_ratio` column, so indexing it directly fails
# with KeyError unless the raw data already provides it. Use the column when present,
# otherwise derive it.
# NOTE(review): the derived value assumes the name means responder_6 / weight — confirm.
if 'responder_weight_ratio' in df[9].columns:
    responder_weight_ratio = df[9]['responder_weight_ratio']
else:
    responder_weight_ratio = df[9]['responder_6'] / df[9]['weight']

fig, ax1 = plt.subplots()

# Plot weight on the primary y-axis
ax1.plot(df[9].index, df[9]['weight'], label='Weight', color='blue')
ax1.set_ylabel('Weight', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Create a second y-axis for the responder/weight ratio
ax2 = ax1.twinx()  # Instantiate a second y-axis that shares the same x-axis
ax2.plot(df[9].index, responder_weight_ratio, label='Responder/Weight Ratio', color='orange')
ax2.set_ylabel('Responder/Weight Ratio', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')

# Add title and show plot
plt.title('Weight and Responder/Weight Ratio Over Time')
plt.show()

In [None]:
#Final function for running test

def predict(test: pd.DataFrame, lags: pd.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags

    # Replace this section with your own predictions
    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )

    
    if isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions