##### Final Project Collaboration CS 5665

The purpose of this document is to share code and work on the project simultaneously. Please use the sections and text features to explain what you are doing and how it applies to the project.

## Imports ...


In [None]:
import os
import re
# import io
# import zipfile
# import kaggle
import numpy as np
import pandas as pd
# from kaggle.api.kaggle_api_extended import KaggleApi
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# from sklearn.naive_bayes import GaussianNB
# import datetime as dt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# import optiver2023
# env = optiver2023.make_env()

In [None]:
# Read the competition training set from the Kaggle input mount.
train_csv_path = "/kaggle/input/optiver-trading-at-the-close/train.csv"
data_df = pd.read_csv(train_csv_path)

# downloads_path = str(Path.home() / "Downloads")
# api = KaggleApi()
# api.authenticate()

# api.competition_download_files("optiver-trading-at-the-close",
#                                path=os.getcwd())
# api.competition_download_file("optiver-trading-at-the-close",
#                               "train.csv",
#                                path=os.getcwd())

# with zipfile.ZipFile("train.csv.zip", 'r') as zip_ref:
#     zip_ref.extractall(path=os.getcwd())

# with zipfile.ZipFile("optiver-trading-at-the-close.zip", 'r') as zip_ref:
#     zip_ref.extractall(path=os.getcwd())

# data_df = pd.read_csv("train.csv")

# **DATA CLEANSING**

In [None]:

# Summarise dtypes and non-null counts. `info()` writes its report itself and
# returns None, so it is called directly instead of being wrapped in print().
data_df.info()
# train_df = train_df.drop(columns=['row_id'])

# Handle missing values in the training data: each gap is filled with the mean
# of that column over all rows sharing the same stock_id.
columns_to_fill = ['imbalance_size', 'reference_price', 'matched_size', 'bid_price', 'ask_price']
grouped_means = data_df.groupby('stock_id')[columns_to_fill].transform('mean')
data_df[columns_to_fill] = data_df[columns_to_fill].fillna(grouped_means)

# Candidate replacement for a missing 'wap': bid and ask prices weighted by the
# size quoted on the opposite side of the book.
wap_formula = (data_df['bid_price'] * data_df['ask_size'] + data_df['ask_price'] * data_df['bid_size']) / (data_df['bid_size'] + data_df['ask_size'])

# Keep the recorded 'wap' where present, fall back to the formula otherwise.
data_df['wap'] = np.where(data_df['wap'].isna(), wap_formula, data_df['wap'])

# Drop rows that are still incomplete. near_price / far_price are deliberately
# left out of the check: every model below removes them from its feature set,
# so a gap in those two columns is no reason to throw the whole row away.
unused_price_columns = ['near_price', 'far_price']
required_columns = [col for col in data_df.columns if col not in unused_price_columns]
data_df = data_df.dropna(subset=required_columns)
print(data_df.shape)

# **TRAIN TEST SPLIT**

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label and the identifier / auction
# price columns that the models do not use.
X = data_df.drop(columns=['target', 'row_id', 'time_id', 'near_price', 'far_price'])
y = data_df[['target']]

# Fixed seed so that Restart & Run All reproduces the same partitions.
SPLIT_SEED = 0

# First split the data into training (80%) and temporary hold-out (20%) sets.
# (Previously the 80% side was the one being split again, which left only 40%
# of the rows for training.)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

# Then split the hold-out set in half: validation (10%) and test (10%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED)

# Convert the single-column target frames to float Series.
y_train = y_train['target'].astype('float')
y_val = y_val['target'].astype('float')
y_test = y_test['target'].astype('float')

# **Error Testing**

In [None]:
from sklearn.metrics import mean_absolute_error

# Calculate the MAE
# mae = mean_absolute_error(true_values, predictions)

# print('The Mean Absolute Error of our forecasts is {}'.format(round(mae, 2)))


# **Linear Regression**

In [None]:
# Linear baseline: fit on the training split, then measure the mean absolute
# error of its forecasts on the held-out test split.
lr = LinearRegression()
lr_model = lr.fit(X=X_train, y=y_train)
lr_predict = lr_model.predict(X_test)

lr_mae = mean_absolute_error(y_true=y_test, y_pred=lr_predict)
mae_report = 'The Mean Absolute Error of our forecasts is {}'
print(mae_report.format(round(lr_mae, 2)))

# **MLP MODEL**

In [None]:
from sklearn.neural_network import MLPRegressor

# Three hidden layers of five units each.
# NOTE(review): activation='identity' applies no non-linearity between layers,
# so this network can only represent a linear map of the inputs - confirm that
# is intended rather than e.g. 'relu'.
# random_state pins the weight initialisation so reruns give the same model.
mlp = MLPRegressor(hidden_layer_sizes=(5,5,5), activation='identity', max_iter=500, random_state=0)

mlp_model = mlp.fit(X_train, y_train)
mlp_predict = mlp_model.predict(X_test)

# The score used to be computed and thrown away (it was not the last
# expression of the cell, so nothing was displayed); report it explicitly.
mlp_r2 = mlp_model.score(X_test, y_test)
print('The R^2 score of our forecasts is {}'.format(round(mlp_r2, 4)))

mlp_mae = mean_absolute_error(y_test, mlp_predict)
print('The Mean Absolute Error of our forecasts is {}'.format(round(mlp_mae, 2)))

# **Cat Boosting**

In [None]:
from catboost import CatBoostRegressor as CBR

# Gradient-boosted trees trained directly on the MAE objective.
cbr = CBR(objective='MAE', iterations=300, depth=4, learning_rate=.01, l2_leaf_reg=1.4, early_stopping_rounds=15)

# early_stopping_rounds needs a validation set to monitor; without eval_set it
# has nothing to watch. The validation split produced by the train/test split
# cell is otherwise unused, so it is passed here.
cbr_model = cbr.fit(X_train, y_train, eval_set=(X_val, y_val))

# Report MAE on the test split in the same form as the other models.
cbr_predict = cbr_model.predict(X_test)
cbr_mae = mean_absolute_error(y_test, cbr_predict)
print('The Mean Absolute Error of our forecasts is {}'.format(round(cbr_mae, 2)))

# Last expression of the cell: displays the model's own score on the test split.
cbr_model.score(X_test, y_test)

# **Tabnet**

In [None]:
from sklearn.preprocessing import LabelEncoder

target = "target"

# Remove identifier / auction-price columns that are not used as features.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
data_df = data_df.drop(columns=['row_id', 'time_id', 'near_price', 'far_price'], errors='ignore')
n_total = len(data_df)

# The splits below are *positional* row numbers (0 .. n_total-1), not index
# labels: data_df's index has gaps after the earlier dropna().
train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
# 0.2 / 0.6 sends one third of the remaining 80% to validation, i.e. roughly
# 53% train / 27% validation / 20% test overall.
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

# Label-encode every object-typed column and remember its cardinality.
categorical_columns = []
categorical_dims =  {}
for col in data_df.columns[data_df.dtypes == object]:
    print(col, data_df[col].nunique())
    l_enc = LabelEncoder()
    data_df[col] = data_df[col].fillna("VV_likely")
    data_df[col] = l_enc.fit_transform(data_df[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

# Fill remaining gaps in each float column with that column's mean over the
# training rows only. Two fixes versus the earlier version:
#   * rows are selected with .iloc, because train_indices are positions;
#   * only `col` is filled - data_df.fillna(...) used to fill every column of
#     the frame with the mean of whichever float column came first.
for col in data_df.columns[data_df.dtypes == 'float64']:
    train_mean = data_df[col].iloc[train_indices].mean()
    data_df[col] = data_df[col].fillna(train_mean)

unused_feat = []

# Every column except the target (and anything listed in unused_feat).
features = [ col for col in data_df.columns if col not in unused_feat+[target]]

# Positions and cardinalities of the categorical features, in feature order.
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

In [None]:
!pip install torch
!pip install pytorch-tabnet wget
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import KFold
# from sklearn.preprocessing import LabelEncoder, MinMaxScalar
from sklearn.metrics import accuracy_score

clf = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15

if os.getenv("CI", False):
# Take only a subsample to run CI
    X_train = train[features].values[train_indices][:1000,:]
    y_train = train[target].values[train_indices][:1000]
else:
    X_train = train[features].values[train_indices]
    y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]
    
max_epochs = 5 if not os.getenv("CI", False) else 2
    
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs, patience=100,
    batch_size=16384, virtual_batch_size=256
) 

In [None]:
# Training loss per epoch.
# NOTE(review): written against the pytorch-tabnet >= 2.0 history layout, where
# the training loss is stored under 'loss' and each evaluation metric under
# '<eval_name>_<metric>' - confirm against the installed version.
fig, ax = plt.subplots()
ax.plot(clf.history['loss'], label='train loss')
ax.set(title='TabNet training loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

# One curve per evaluation metric recorded for the 'train' and 'valid' sets.
# (The earlier `plt.pltot` calls were a typo for `plt.plot`.)
metric_curves = {
    name: values
    for name, values in clf.history.history.items()
    if name.startswith(('train_', 'valid_'))
}
fig, ax = plt.subplots()
for name, values in metric_curves.items():
    ax.plot(values, label=name)
ax.set(title='TabNet evaluation metrics', xlabel='epoch', ylabel='metric')
ax.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error

# The target is continuous, so the held-out score is MAE (as for the other
# models) rather than ROC AUC on class probabilities; roc_auc_score was never
# imported either. Both arrays are flattened so the comparison works whether
# the model returns shape (n,) or (n, 1).
preds = clf.predict(X_test)
test_mae = mean_absolute_error(np.ravel(y_test), np.ravel(preds))
print('The Mean Absolute Error of our forecasts is {}'.format(round(test_mae, 2)))

# Optiver Dummy Test from Optiver

**Baseline**

In [None]:
# import optiver2023
# env = optiver2023.make_env()
# iter_test = env.iter_test()
# counter = 0
# for (test, revealed_targets, sample_prediction) in iter_test:
#     if counter == 0:
#         print(test.head(3))
#         print(revealed_targets.head(3))
#         print(sample_prediction.head(3))
#     sample_prediction['target'] = 0
#     env.predict(sample_prediction)
#     counter += 1

**Linear Regression Submit**

In [None]:
# import optiver2023
# env = optiver2023.make_env()
# iter_test = env.iter_test()
# counter = 0
# for (test, revealed_targets, sample_prediction) in iter_test:
#     print(test.head(3))
#     print(revealed_targets.head(3))
#     print(sample_prediction.head(3))
#     test = test.drop(columns=['row_id', 'near_price', 'far_price'])
#     columns_to_fill = ['imbalance_size', 'reference_price', 'matched_size', 'bid_price', 'ask_price']
#     grouped_means = test.groupby('stock_id')[columns_to_fill].transform('mean')

#     # Fill NaN values with the grouped means
#     test[columns_to_fill] = test[columns_to_fill].fillna(grouped_means)

#     # Define the formula for wap
#     wap_formula = (test['bid_price'] * test['ask_size'] + test['ask_price'] * test['bid_size']) / (test['bid_size'] + test['ask_size'])

#     # Fill missing values in 'wap' using np.where
#     test['wap'] = np.where(test['wap'].isna(), wap_formula, test['wap'])
#     print(test.isna())
#     sample_prediction['target'] = lr_model.predict(test)
#     env.predict(sample_prediction)

https://www.kaggle.com/code/sohier/optiver-2023-basic-submission-demo?cellIds=1&kernelSessionId=144492283

**MLP Submit**

In [None]:
# import optiver2023
# env = optiver2023.make_env()
# iter_test = env.iter_test()
# counter = 0
# for (test, revealed_targets, sample_prediction) in iter_test:
#     print(test.head(3))
#     print(revealed_targets.head(3))
#     print(sample_prediction.head(3))
#     test = test.drop(columns=['row_id', 'near_price', 'far_price'])
#     columns_to_fill = ['imbalance_size', 'reference_price', 'matched_size', 'bid_price', 'ask_price']
#     grouped_means = test.groupby('stock_id')[columns_to_fill].transform('mean')

#     # Fill NaN values with the grouped means
#     test[columns_to_fill] = test[columns_to_fill].fillna(grouped_means)

#     # Define the formula for wap
#     wap_formula = (test['bid_price'] * test['ask_size'] + test['ask_price'] * test['bid_size']) / (test['bid_size'] + test['ask_size'])

#     # Fill missing values in 'wap' using np.where
#     test['wap'] = np.where(test['wap'].isna(), wap_formula, test['wap'])
#     print(test.isna())
#     sample_prediction['target'] = mlp.predict(test)
#     env.predict(sample_prediction)

**Cat Boosting Submit**

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0

# Same columns that are mean-imputed in the training data.
columns_to_fill = ['imbalance_size', 'reference_price', 'matched_size', 'bid_price', 'ask_price']

for (test, revealed_targets, sample_prediction) in iter_test:
    # Only preview the first batch; printing every batch floods the output.
    if counter == 0:
        print(test.head(3))
        print(revealed_targets.head(3))
        print(sample_prediction.head(3))

    test = test.drop(columns=['row_id','near_price','far_price'])

    # Fill NaN values with the per-stock mean of the current batch
    grouped_means = test.groupby('stock_id')[columns_to_fill].transform('mean')
    test[columns_to_fill] = test[columns_to_fill].fillna(grouped_means)

    # Define the formula for wap
    wap_formula = (test['bid_price'] * test['ask_size'] + test['ask_price'] * test['bid_size']) / (test['bid_size'] + test['ask_size'])

    # Fill missing values in 'wap' using np.where
    test['wap'] = np.where(test['wap'].isna(), wap_formula, test['wap'])

    if counter == 0:
        # Per-column count of values still missing after imputation.
        print(test.isna().sum())

    # Hand the model exactly the columns it was trained on, in training order.
    # Training drops 'time_id' while this cell does not, so the served frame
    # may contain columns the model has never seen.
    model_inputs = test[cbr.feature_names_]
    sample_prediction['target'] = cbr.predict(model_inputs)
    env.predict(sample_prediction)
    counter += 1