In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install datatable

Collecting datatable
  Downloading datatable-1.1.0-cp310-cp310-manylinux_2_35_x86_64.whl (82.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datatable
Successfully installed datatable-1.1.0


In [4]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append("./drive/MyDrive/millennium_project")

import glob
import os, gc
import numpy as numpy
import pandas as pd
import scipy as sp
import datatable as dt
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from numba import njit
from utils import *

from IPython.display import clear_output

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# TF
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [5]:
ticker_name = "GOOG"

In [6]:
N_FOLD = 5
N_MINS = 5
MIN_SIZE = 600 // N_MINS

SOL_NAME = ticker_name+'-unet'
DATA_NAME = ticker_name
mkdir(f'/content/drive/MyDrive/millennium_project/models/{SOL_NAME}/')

# Functions

In [7]:
def rmse_keras(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_true - y_pred)))

def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, learning_rate):
    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
    # Output layer
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation('linear')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=[mse_loss],
    )
    return model

In [14]:
def cbr(x, out_layer, kernel, stride, dilation):
    x = tf.keras.layers.Conv1D(out_layer, kernel_size=kernel, dilation_rate=dilation, strides=stride, padding="same")(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation("relu")(x)
    return x

def resblock(x_in, layer_n, kernel, dilation, use_se=True):
    x = cbr(x_in, layer_n, kernel, 1, dilation)
    x = cbr(x, layer_n, kernel, 1, dilation)
    if use_se:
        x = se_block(x, layer_n)
    x = tf.keras.layers.Add()([x_in, x])
    return x

def se_block(x_in, layer_n):
    x = tf.keras.layers.GlobalAveragePooling1D()(x_in)
    x = tf.keras.layers.Dense(layer_n//8, activation="relu")(x)
    x = tf.keras.layers.Dense(layer_n, activation="sigmoid")(x)
    x_out = tf.keras.layers.Multiply()([x_in, x])
    return x_out

def create_unet(num_columns, num_labels, learning_rate, layer_n, kernel_size, depth):
    # input
    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)

    # 1dcnn
    x = tf.keras.layers.Dense(4096, activation='relu')(x)
    x = tf.keras.layers.Reshape((64, 64))(x)

    input_layer_1 = tf.keras.layers.AveragePooling1D(16)(x)
    input_layer_2 = tf.keras.layers.AveragePooling1D(64)(x)

    ###########  Encoder  ###########
    x = cbr(x, layer_n, kernel_size, 4, 1)
    for i in range(depth):
        x = resblock(x, layer_n, kernel_size, 1)
    out_0 = x

    x = cbr(x, layer_n*2, kernel_size, 5, 1)
    for i in range(depth):
        x = resblock(x, layer_n*2, kernel_size, 1)
    out_1 = x

    x = tf.keras.layers.Concatenate()([x, input_layer_1])

    x = cbr(x, layer_n*3, kernel_size, 5, 1)
    for i in range(depth):
        x = resblock(x, layer_n*3, kernel_size, 1)
    out_2 = x

    x = tf.keras.layers.Concatenate()([x, input_layer_2])
    x = cbr(x, layer_n*4, kernel_size, 5, 1)
    for i in range(depth):
        x = resblock(x, layer_n*4, kernel_size, 1)

    ########### Decoder  ###########
    x = tf.keras.layers.UpSampling1D(1)(x)
    x = tf.keras.layers.Concatenate()([x, out_2])
    x = cbr(x, layer_n*3, kernel_size, 1, 1)

    x = tf.keras.layers.UpSampling1D(4)(x)
    x = tf.keras.layers.Concatenate()([x, out_1])
    x = cbr(x, layer_n*2, kernel_size, 1, 1)

    x = tf.keras.layers.UpSampling1D(4)(x)
    x = tf.keras.layers.Concatenate()([x, out_0])
    x = cbr(x, layer_n, kernel_size, 1, 1)

    #x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    x = tf.keras.layers.Flatten()(x)

    x = tf.keras.layers.Dense(num_labels)(x)

    model = tf.keras.models.Model(inputs=inp, outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam( learning_rate=5e-3),
        loss=rmse_keras,
    )
    return model

# Loading data

In [9]:
# train
df_train = dt.fread(f'./drive/MyDrive/millennium_project/dataset/train_{DATA_NAME}_LGB.csv').to_pandas()
fea_cols = fea_cols_GOOG
df_train["name"] = ticker_name
# result
df_result = dt.fread('./drive/MyDrive/millennium_project/dataset/train.csv').to_pandas()
df_result = gen_row_id(df_result)

In [10]:
df_train = gen_row_id(df_train)
df_train = add_time_fold(df_train, N_FOLD)

In [11]:
df_train.index = df_train.row_id
df_result.index = df_result.row_id

# Evaluation

In [12]:
batch_size = 1024
layer_n = 16
kernel_size = 4
depth = 2
learning_rate = 6e-3
epochs = 1000

list_seeds = [0, 11, 42]

In [16]:
list_rmse = []
for i_seed, seed in enumerate(list_seeds):
    df_train = add_time_fold(df_train, N_FOLD, seed=seed)
    list_rmse += [[]]
    for i_fold in range(N_FOLD):
        gc.collect()
        df_tr = df_train.loc[df_train.fold!=i_fold]
        df_te = df_train.loc[df_train.fold==i_fold]

        X_train = df_tr[fea_cols].values
        y_train = df_tr[['value_to_predict']].values
        X_test = df_te[fea_cols].values
        y_test = df_te[['value_to_predict']].values
        idx_test = df_train.loc[df_train.fold==i_fold].index
        print(f'Fold {i_seed+1}/{len(list_seeds)} | {i_fold+1}/{N_FOLD}', X_train.shape, X_test.shape)


        # Callbacks
        ckp_path = f'./models/{SOL_NAME}/model_{i_seed}_{i_fold}.hdf5'
        rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_delta=1e-5, verbose=2)
        es = EarlyStopping(monitor='val_loss', min_delta=1e-5, patience=12, restore_best_weights=True, verbose=2)

        model = create_unet(X_train.shape[1], 1, learning_rate, layer_n, kernel_size, depth)
        history = model.fit(X_train, y_train,
            epochs=epochs,
            validation_data=(X_test, y_test),
            validation_batch_size=len(y_test),
            batch_size=batch_size,
            verbose=2,
            callbacks=[rlr, es]
        )
        # model = tf.keras.models.load_model(ckp_path, custom_objects={'mspe_loss': mspe_loss})

        y_pred = model.predict(X_test, batch_size=len(y_test))
        curr_rmse = rmse_keras(y_test, y_pred)
        list_rmse[-1] += [curr_rmse]
        model.save(ckp_path)
        # generate and save preds
        df_result.loc[idx_test, f'pred_{i_seed}'] = y_pred
        clear_output()
        print(list_rmse)

[[<tf.Tensor: shape=(), dtype=float64, numpy=0.04650916719038975>, <tf.Tensor: shape=(), dtype=float64, numpy=0.044729229491769454>, <tf.Tensor: shape=(), dtype=float64, numpy=0.04300638803282923>, <tf.Tensor: shape=(), dtype=float64, numpy=0.03577393568970782>, <tf.Tensor: shape=(), dtype=float64, numpy=0.03802542473266218>], [<tf.Tensor: shape=(), dtype=float64, numpy=0.04956552910051011>, <tf.Tensor: shape=(), dtype=float64, numpy=0.043476426332601306>, <tf.Tensor: shape=(), dtype=float64, numpy=0.04209218293566483>, <tf.Tensor: shape=(), dtype=float64, numpy=0.044814574107493226>, <tf.Tensor: shape=(), dtype=float64, numpy=0.3252149954757106>], [<tf.Tensor: shape=(), dtype=float64, numpy=0.35264281613429194>, <tf.Tensor: shape=(), dtype=float64, numpy=0.3135018346089511>, <tf.Tensor: shape=(), dtype=float64, numpy=0.046013823818307875>, <tf.Tensor: shape=(), dtype=float64, numpy=0.05886154765534021>, <tf.Tensor: shape=(), dtype=float64, numpy=0.048496733808998646>]]


In [19]:
df_result.to_csv(f'./drive/MyDrive/millennium_project/results/{SOL_NAME}.csv', index=False)

In [23]:
df_result = df_result.dropna()

In [24]:
for i in range(len(list_seeds)):
    print(i, rmse(df_result['value_to_predict'], df_result[f'pred_{i}']))
print('All: ', rmse(df_result['value_to_predict'], df_result[[f'pred_{i}' for i in range(len(list_seeds))]].mean(axis=1)))

0 0.04180882437211766
1 0.1508808269251388
2 0.21478682514873496
All:  0.10288502871812916


In [25]:
list_rmse

[[<tf.Tensor: shape=(), dtype=float64, numpy=0.04650916719038975>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.044729229491769454>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.04300638803282923>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.03577393568970782>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.03802542473266218>],
 [<tf.Tensor: shape=(), dtype=float64, numpy=0.04956552910051011>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.043476426332601306>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.04209218293566483>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.044814574107493226>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.3252149954757106>],
 [<tf.Tensor: shape=(), dtype=float64, numpy=0.35264281613429194>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.3135018346089511>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.046013823818307875>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.05886154765534021>,
  <tf.Tensor: shape=(), dtype=float64, numpy=0.04849673380

In [27]:
df_result.tail(100)

Unnamed: 0_level_0,C0,name,Date,value_to_predict,row_id,pred_0,pred_1,pred_2
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GOOG-2023-05-09 00:00:00,10387,GOOG,2023-05-09,0.190725,GOOG-2023-05-09 00:00:00,0.273192,0.351463,0.291763
GOOG-2023-05-10 00:00:00,10388,GOOG,2023-05-10,0.106384,GOOG-2023-05-10 00:00:00,0.230603,0.175166,0.628111
GOOG-2023-05-11 00:00:00,10389,GOOG,2023-05-11,0.048198,GOOG-2023-05-11 00:00:00,0.035711,0.511716,-0.043458
GOOG-2023-05-12 00:00:00,10390,GOOG,2023-05-12,0.013207,GOOG-2023-05-12 00:00:00,-0.001646,-0.012881,0.603193
GOOG-2023-05-15 00:00:00,10391,GOOG,2023-05-15,-0.032371,GOOG-2023-05-15 00:00:00,0.009066,0.538488,-0.094845
...,...,...,...,...,...,...,...,...
GOOG-2023-09-25 00:00:00,10482,GOOG,2023-09-25,0.554258,GOOG-2023-09-25 00:00:00,0.501009,0.492445,0.534981
GOOG-2023-09-26 00:00:00,10483,GOOG,2023-09-26,0.559539,GOOG-2023-09-26 00:00:00,0.556976,0.567357,0.573067
GOOG-2023-09-27 00:00:00,10484,GOOG,2023-09-27,0.561397,GOOG-2023-09-27 00:00:00,0.527642,0.476380,0.582250
GOOG-2023-09-28 00:00:00,10485,GOOG,2023-09-28,0.550145,GOOG-2023-09-28 00:00:00,0.536846,0.537095,0.559421
