In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random as python_random
import tensorflow_addons as tfa

from keras import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Dense, LSTM, Dropout
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTMCell, RNN, Dense, Dropout, Input, Lambda, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam

In [None]:
# Seed written to PYTHONHASHSEED and passed to tf.keras.utils.set_random_seed.
SEED = 42

# Number of consecutive past time steps in each model input window
# (used by windowed_dataset, build_model and the inference loop).
N_LAGS = 55

# Training hyper-parameters.
# BATCH_SIZE / BUFFER_SIZE: batch size and shuffle-buffer size used by windowed_dataset.
# EPOCHS: upper bound on epochs; the training cell attaches an EarlyStopping callback.
# PATIENCE: NOTE(review) - not referenced by any visible cell; the EarlyStopping
#           callback in the training cell hard-codes patience=10. Confirm which is intended.
# DROPOUT: default of build_model's `dropout` argument, whose Dropout layers are commented out.
# LEARNING_RATE: Adam learning rate used in build_model.
BATCH_SIZE = 32
BUFFER_SIZE = 100000
EPOCHS = 10000
PATIENCE = 25
DROPOUT = 0.5
LEARNING_RATE = 1e-4

# Rows with date_id <= SPLIT_DAY form the training set; later dates form the validation set.
SPLIT_DAY = 390

# Dimensions of the dense grid the data is reindexed onto:
# stock_id in range(N_STOCKS), date_id in range(N_DATES),
# seconds_in_bucket in {0, 10, ..., 10 * (N_SECONDS - 1)}.
N_STOCKS = 200
N_DATES = 481
N_SECONDS = 55

# Execution switches: RUN_TRAINING gates the fit/evaluate cell,
# RUN_FOR_SUBMISSION gates model loading and the inference loop.
RUN_TRAINING = False
RUN_FOR_SUBMISSION = True

In [None]:
# NOTE: CPython reads PYTHONHASHSEED at interpreter start-up, so assigning it here does not
# change string hashing in the already-running kernel; only child processes inherit it.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seeds Python's `random`, NumPy and TensorFlow in a single call.
tf.keras.utils.set_random_seed(SEED)

In [None]:
# Read only the four columns this notebook uses. `usecols` skips parsing the unused columns
# and yields a standalone frame instead of a slice of a wider one, so later column
# assignments (reduce_mem_usage) cannot trigger pandas' SettingWithCopyWarning.
# Columns come back in file order; everything downstream addresses them by name.
df = pd.read_csv(
    "/kaggle/input/optiver-trading-at-the-close/train.csv",
    usecols=["stock_id", "date_id", "seconds_in_bucket", "target"],
)
df.shape

In [None]:
def reduce_mem_usage(df, verbose=0):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    The frame is modified in place and also returned.

    * Integer columns (signed or unsigned NumPy dtypes) are cast to the first of
      int8 / int16 / int32 / int64 whose range strictly contains the column's
      min and max; if none does, the column is left unchanged.
    * Floating columns are cast to float16 when their range fits, else float32.
    * Every other dtype (object, bool, datetime, categorical, pandas extension
      dtypes) is left untouched.

    Caveat: the float16 choice is made on range alone. float16 carries roughly
    three significant decimal digits, so values are rounded when downcast.

    Args:
        df: DataFrame to shrink.
        verbose: when truthy, print the memory footprint before and after and
            the percentage saved; when falsy, print nothing.

    Returns:
        The same DataFrame object, with its numeric columns downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable Int64, string, ...) are not np.dtype
        # instances; casting them to plain NumPy types could fail or lose information.
        if not isinstance(col_type, np.dtype):
            continue

        is_int = np.issubdtype(col_type, np.integer)
        is_float = np.issubdtype(col_type, np.floating)
        if not (is_int or is_float):
            # bool / datetime / timedelta / object: nothing sensible to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            half = np.finfo(np.float16)
            # An all-NaN column fails both comparisons and therefore lands on float32.
            if c_min > half.min and c_max < half.max:
                df[col] = df[col].astype(np.float16)
            else:
                df[col] = df[col].astype(np.float32)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio so a zero-sized footprint cannot raise ZeroDivisionError.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        print(f"Decreased by {decrease:.2f}%")

    return df

# Downcast the loaded columns in place to cut memory before the dense-grid reindex below.
df = reduce_mem_usage(df, verbose=1)

In [None]:
# Build the dense (stock_id, date_id, seconds_in_bucket) grid: every combination gets a
# row, and combinations absent from `df` receive a target of 0.
all_stock_ids = range(N_STOCKS)
all_date_ids = range(N_DATES)
all_seconds = list(range(0, 10 * N_SECONDS, 10))

multi_index = pd.MultiIndex.from_product(
    [all_stock_ids, all_date_ids, all_seconds],
    names=['stock_id', 'date_id', 'seconds_in_bucket'],
)

df_full = (
    df.set_index(list(multi_index.names))
    .reindex(multi_index)
    .fillna(0)
    .reset_index()
)

# One row per grid cell, nothing more and nothing less.
assert len(df_full) == N_STOCKS * N_DATES * N_SECONDS

df_full

In [None]:
def windowed_dataset(dataset, shuffle=True):
    """Turn a 2-D array of time steps into batched (inputs, target) training pairs.

    A window of ``N_LAGS + 1`` consecutive rows slides over ``dataset`` one row at a
    time. The first ``N_LAGS`` rows of each window are the inputs and the final row is
    the target. Incomplete trailing windows are dropped.

    Args:
        dataset: array-like sliced along its first axis into time steps.
        shuffle: when true, shuffle the windows with a buffer of ``BUFFER_SIZE``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` pairs, prefetching one batch.
    """
    window_len = N_LAGS + 1

    windows = (
        tf.data.Dataset.from_tensor_slices(dataset)
        .window(window_len, shift=1, drop_remainder=True)
        # Each window is itself a dataset; batching it whole yields a single tensor.
        .flat_map(lambda rows: rows.batch(window_len))
        .map(lambda rows: (rows[:-1], rows[-1]))
    )

    if shuffle:
        windows = windows.shuffle(BUFFER_SIZE)

    return windows.batch(BATCH_SIZE).prefetch(1)

In [None]:
def build_features(df):
    """Pivot long-format targets into a wide (time step x stock) matrix.

    ``df`` is first expanded onto the full grid of ``range(N_STOCKS)`` x the dates
    present in ``df`` x the ``N_SECONDS`` ten-second buckets, with missing targets set
    to 0. The result has one row per (date_id, seconds_in_bucket) pair and one column
    per stock_id, a fresh 0..n-1 row index and unnamed columns.

    Args:
        df: DataFrame with columns stock_id, date_id, seconds_in_bucket and target.

    Returns:
        The pivoted DataFrame of targets.
    """
    key_cols = ['stock_id', 'date_id', 'seconds_in_bucket']

    grid = pd.MultiIndex.from_product(
        [range(N_STOCKS), df["date_id"].unique(), [10 * k for k in range(N_SECONDS)]],
        names=key_cols,
    )

    dense = df.set_index(key_cols).reindex(grid).fillna(0).reset_index()

    # Every grid key is unique after the reindex, so pivot_table's default mean
    # aggregation sees exactly one value per cell.
    wide = dense.pivot_table(
        values='target',
        index=['date_id', 'seconds_in_bucket'],
        columns='stock_id',
    ).reset_index(drop=True)
    wide.columns.name = None

    return wide

# Sanity check: display the pivoted target matrix (one column per stock) for the full grid.
build_features(df_full)

In [None]:
# import warnings
# from keras.initializers import GlorotUniform

# warnings.filterwarnings('ignore', category=FutureWarning)
# warnings.filterwarnings('ignore', message="The initializer GlorotUniform is unseeded")

# initializer = GlorotUniform(seed=1004)

In [None]:
def build_model(dropout=DROPOUT):
    """Build and compile the stacked bidirectional peephole-LSTM forecaster.

    Input windows have shape ``(N_LAGS, N_STOCKS)``; the output is one value per stock.
    The network is compiled with MAE loss and Adam at ``LEARNING_RATE``.

    Args:
        dropout: currently unused; the Dropout layers that consumed it are disabled.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Earlier single-layer variant, kept for reference:
    #     model = Sequential()
    #     model.add(Input(shape=(N_LAGS, N_STOCKS)))
    #     model.add(Dropout(dropout))
    #     # enable peephole connections via PeepholeLSTMCell
    #     lstm_cell = tfa.rnn.PeepholeLSTMCell(25)
    #     # wrap the cell in an RNN layer
    #     model.add(tf.keras.layers.RNN(lstm_cell, return_sequences=True))
    #     model.add(Dropout(dropout))
    #     # use a Lambda layer to select the last time step of the sequence
    #     model.add(Lambda(lambda x: x[:, -1, :]))
    #     model.add(Dense(N_STOCKS))
    #     model.compile(loss='mae', optimizer=Adam(learning_rate=LEARNING_RATE))
    l2_weight = 0.0001

    def peephole_block(units, **rnn_kwargs):
        # Bidirectional RNN around an L2-regularised peephole LSTM cell.
        cell = tfa.rnn.PeepholeLSTMCell(units, kernel_regularizer=l2(l2_weight))
        return Bidirectional(RNN(cell, **rnn_kwargs))

    model = Sequential([
        Input(shape=(N_LAGS, N_STOCKS)),
        peephole_block(55, return_sequences=True),  # the earlier variant used 25 units
        BatchNormalization(),
        # Dropout(dropout) disabled here
        peephole_block(25),
        # Dropout(dropout) disabled here
        Dense(N_STOCKS, kernel_regularizer=l2(l2_weight)),
    ])
    model.compile(loss='mae', optimizer=Adam(learning_rate=LEARNING_RATE))

    return model

In [None]:
%%time 
if RUN_TRAINING:

    # Chronological split: dates up to SPLIT_DAY train the model, later dates validate it.
    split = df_full['date_id'] > SPLIT_DAY
    df_train = df_full[~split]
    df_valid = df_full[split]

    df_train_features = build_features(df_train)
    df_valid_features = build_features(df_valid)

    # Fit the scaler on training data only so no validation statistics leak into training.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(df_train_features)
    valid_features = scaler.transform(df_valid_features)

    train_dataset = windowed_dataset(train_features)
    # Validation stays unshuffled so predictions line up with df_valid_features rows below.
    valid_dataset = windowed_dataset(valid_features, shuffle=False)

    model = build_model()

    # NOTE(review): patience is hard-coded to 10 while the config cell defines PATIENCE = 25.
    # Left unchanged to preserve training behaviour; confirm which value is intended.
    early_stopping = EarlyStopping(monitor='val_loss',
                      mode='min',
                      patience=10,
                      restore_best_weights=True,
                      verbose=True)

    # Both datasets are already batched by windowed_dataset. Keras documents that
    # `batch_size` must not be specified for tf.data.Dataset inputs, so it is not passed.
    history = model.fit(train_dataset,
                        validation_data=valid_dataset,
                        epochs=EPOCHS,
                        callbacks=[early_stopping],
                        verbose=True)

    ## Evaluate ##
    y_pred = model.predict(valid_dataset)

    # Back to the original target scale. Each window consumes N_LAGS rows as input,
    # so the first predicted row corresponds to validation row N_LAGS.
    y_pred = scaler.inverse_transform(y_pred)
    y_true = df_valid_features[N_LAGS:]

    mae = mean_absolute_error(y_true, y_pred)
    print(f"MAE score: {mae}")

    ## Plots ##
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model train vs validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

In [None]:
# model.save('peephole-version-55-25_ep17-5.73.h5')

In [None]:
from tensorflow.keras.models import load_model

if RUN_FOR_SUBMISSION:
    # Alternative checkpoint (disabled):
    # model = load_model('/kaggle/input/model-version-chan-peephole/my_peephole_model.h5', custom_objects={'PeepholeLSTMCell': tfa.rnn.PeepholeLSTMCell})
    # Adjust the path below to match the name of the dataset that holds the saved model.
    model = load_model(
        '/kaggle/input/peephole-version-55-25-ep17-5-73/peephole-version-55-25_ep17-5.73.h5',
        custom_objects={'PeepholeLSTMCell': tfa.rnn.PeepholeLSTMCell},
    )

    # Rebuild the same chronological split and feature matrices as the training cell.
    split = df_full['date_id'] > SPLIT_DAY
    df_train, df_valid = df_full[~split], df_full[split]

    df_train_features = build_features(df_train)
    df_valid_features = build_features(df_valid)

    # The scaler is not stored with the model, so refit it on the training features.
    scaler = StandardScaler().fit(df_train_features)
    train_features = scaler.transform(df_train_features)
    valid_features = scaler.transform(df_valid_features)

In [None]:
if RUN_FOR_SUBMISSION:

    # Imported lazily, only when a submission run is requested
    # (presumably the package is only available in the competition runtime - confirm).
    import optiver2023
    # Clears the '__called__' flag before make_env(); presumably this lets the cell
    # be re-run in the same kernel - confirm against the optiver2023 package.
    optiver2023.make_env.func_dict['__called__'] = False
    env = optiver2023.make_env()
    iter_test = env.iter_test()

    counter = 0

    # Fallback history: if the first scored batch does not start at seconds_in_bucket == 0,
    # no revealed targets have been loaded yet. Seed with the most recent scaled
    # training-era rows so the first predict() call never hits an undefined name.
    history_scaled = valid_features

    for test, revealed_targets, sample_prediction in iter_test:

        # Unscored batches still require a submission; answer with zeros and move on.
        if not test['currently_scored'].iloc[0]:
            sample_prediction['target'] = 0
            env.predict(sample_prediction)
            counter += 1
            continue

        # At the first bucket of the batch sequence, rebuild the input history
        # from the targets revealed with this batch.
        if test['seconds_in_bucket'].iloc[0] == 0:

            df_revealed_targets = (
                revealed_targets[["stock_id", "revealed_date_id", "seconds_in_bucket", "revealed_target"]]
                .rename(columns={'revealed_date_id': 'date_id', 'revealed_target': 'target'})
            )

            df_features = build_features(df_revealed_targets)

            history_scaled = scaler.transform(df_features)

        # verbose=0: a progress bar per time step would flood the notebook output.
        y_pred_scaled = model.predict(
            history_scaled[-N_LAGS:][np.newaxis, :, :],
            verbose=0)

        y_pred = scaler.inverse_transform(y_pred_scaled)

        # The model emits one value per stock_id in column order 0..N_STOCKS-1. Index by the
        # stock ids actually present so a batch with missing stocks neither raises a length
        # mismatch nor shifts predictions onto the wrong rows.
        # Assumes `test` and `sample_prediction` share row order - TODO confirm.
        stock_ids = test['stock_id'].to_numpy().astype(int)
        sample_prediction['target'] = y_pred[0][stock_ids]
        env.predict(sample_prediction)
        counter += 1

        # Feed the prediction back as the newest time step; only the last N_LAGS rows are
        # ever read, so trim to keep the concatenation cheap.
        history_scaled = np.concatenate([history_scaled, y_pred_scaled])[-N_LAGS:]

else:
    print("Run for submission skipped")