# TRAIN

In [None]:
import tensorflow as tf
import datetime
import pandas as pd
import numpy as np
import os
import gc

# Report how many GPU devices TensorFlow can see before any training starts.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

## CONFIGs

In [None]:
# Segment length fed to the model. Chosen with the data distribution in mind:
# a reasonable length that, given the model's layers, does not trigger
# zero-padding in the middle of the network.
# (499200 is a multiple of 10*8*6*4 = 1920, the product of the pooling sizes
# passed to get_test_model further down.)
MAX_SEQ_LEN = 499200
N_CLASSES = 1    # single output channel per step
N_CHANNELS = 2   # number of input feature columns per step

K_SPLITS = 5   # Number of folds for cross validation.

EPOCHS=100
BATCH_SIZE = 4
LEARNING_RATE = 1e-3

# Prediction threshold: values above this are considered sleeping steps.
# It is also passed as the threshold of the Keras metrics (maybe, needs to be checked).
THRESHOLD = 0.5

In [None]:
# Fix the TensorFlow and NumPy random seeds for reproducibility.
tf.random.set_seed(42)
np.random.seed(42)

## DATA PREPARATION

In [None]:
def series_cleaner(series, ID, clean_type=1):
    """
    Helper for 'series_segmentation_preprocess_cleaning': trim one id's series.

    INPUTS
        - series: the rows belonging to a single id (anything that supports
          positional slicing, e.g. a DataFrame)
        - ID: integer id_index of that series
        - clean_type
            1: keep only the hand-picked row range for the ids listed below.
            anything else: no rule is defined, the series is returned untouched.

    OUTPUTS
        - the sliced series, or the very same object when no rule applies
    """
    if clean_type != 1:
        # Previously the function fell off the end here and returned None,
        # which made the caller crash on len(None).
        return series

    # id_index -> (first kept row, one past the last kept row); None = open end.
    # id 65 used to be listed explicitly as "train whole", i.e. no trimming,
    # so it simply has no entry here.
    keep_ranges = {
        165: (None, 230000),
        200: (150000, 400000),  # event_cleaner shifts this id's steps by the same 150000
        4: (None, 140000),
        17: (None, 75000),
        24: (None, 90000),
        53: (None, 200000),
        100: (None, 300000),
    }

    bounds = keep_ranges.get(ID)
    if bounds is None:
        return series
    first, last = bounds
    return series[first:last]

# NOTE(review): the original header said "Premise: y_true is one hot vector";
# nothing in this function touches labels.
def series_segmentation_preprocess_cleaning(series, clean_type=1, max_seq_len=None):
    """
    Split every id's series into zero-padded segments of equal length.

    INPUTS
        - series: preprocessed series; must contain an 'id_index' column, all
          other columns are used as features. The number of ids is taken from
          the last row's 'id_index', as before.
        - clean_type
            0: do nothing.
            1: drop unlabeled area which don't have pattern. (suspicious ones)
        - max_seq_len: segment length; defaults to the module-level MAX_SEQ_LEN.

    OUTPUTS
        - X: (N, max_seq_len, n_features) truncated, padded segments
        - X_len_type: one entry per id. k means the id was split into k+1
          segments (k in 0..3); 100 marks an id that was skipped.

    RAISES
        - AssertionError when an id needs more than 4 segments. It is raised
          explicitly: the previous `assert False` is stripped under `python -O`,
          which silently left X and X_len_type out of sync.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    # The label builder only understands len types 0..3.
    max_segments = 4

    X = []
    X_len_type = []   # for event preprocessing
    total_id_num = int(series['id_index'].iloc[-1]) + 1
    for i in range(total_id_num):
        if i == 163:
            # do not train id==163. too noisy?
            # TODO : need check if model is good to have this data.
            X_len_type.append(100)  # 100 means pass.
            continue
        series_per_id = series.loc[series['id_index'] == i].drop(['id_index'], axis=1)

        # if using data cleaning
        if clean_type != 0:
            series_per_id = series_cleaner(series_per_id, i, clean_type)

        values = series_per_id.to_numpy()
        # Ceiling division; an empty series still yields one all-zero segment,
        # exactly like the old "len <= MAX_SEQ_LEN" branch did.
        n_segments = max(1, -(-len(values) // max_seq_len))
        if n_segments > max_segments:
            raise AssertionError(
                f"id_index {i} has {len(values)} rows, more than "
                f"{max_segments} * max_seq_len = {max_segments * max_seq_len}; "
                "splitting into more segments is not implemented."
            )

        for k in range(n_segments):
            segment = values[k * max_seq_len:(k + 1) * max_seq_len]
            pad_amount = max_seq_len - len(segment)
            X.append(np.pad(segment, ((0, pad_amount), (0, 0)),
                            'constant', constant_values=0))
        X_len_type.append(n_segments - 1)

    return np.array(X), np.array(X_len_type)

In [None]:
def event_cleaner(events, ID, clean_type=1):
    """
    Keep event steps aligned with a series that was trimmed by series_cleaner.

    Labels are built from events['step']. If the series was modified so that
    its first row no longer corresponds to step 0, events['step'] has to be
    shifted by the same amount. Only id 200 is cut at the front
    (series[150000:400000]), so only that id needs a shift.

    INPUTS
        - events: event rows of a single id, with a 'step' column
        - ID: integer id_index of those events
        - clean_type: same meaning as in series_cleaner

    OUTPUTS
        - events with adjusted steps. The input frame is never modified: a
          shifted copy is returned for id 200, the same object otherwise.
    """
    front_cut_by_id = {200: 150000}   # rows removed from the start of the series

    if clean_type == 1 and ID in front_cut_by_id:
        # Work on a copy so the caller's DataFrame is not mutated in place.
        events = events.copy()
        events['step'] = events['step'] - front_cut_by_id[ID]
    return events

def events_segmentation_preprocess_cleaning(events, X_len_type, clean_type=1, max_seq_len=None):
    """
    Build per-step sleep masks that line up with the segmented series.

    INPUTS
        - events: preprocessed events data with 'id_index', 'step' and 'event'
          columns. A row with event == 0 is an onset; the row right after it
          is taken as the matching wakeup.
        - X_len_type: from series_preprocess output. Entry k for an id means
          the series was split into k+1 segments; 100 means the id was skipped.
        - clean_type: same as 'series_segmentation_preprocess'.
        - max_seq_len: segment length; defaults to the module-level MAX_SEQ_LEN.

    OUTPUTS
        - array of shape (N, max_seq_len): 1 between onset (inclusive) and
          wakeup (exclusive), 0 elsewhere, one row per series segment.

    Fixes relative to the previous hand-unrolled version:
        - sleep periods lying in the 3rd / 4th segment were swallowed by the
          "start>=MAX_SEQ_LEN and end>2*MAX_SEQ_LEN" branch and never labelled;
        - the spill-over into the 3rd segment was a bare expression without
          "=1" and used a 3*MAX_SEQ_LEN offset;
        - the skip assertion compared against the string 'pass' although the
          marker is the integer 100, so it could never fire;
        - steps are clipped to the valid range, so a negative step (possible
          after event_cleaner shifts them) no longer wraps around via negative
          indexing;
        - an onset in the last row (no wakeup after it) is skipped instead of
          raising KeyError.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    skipped_marker = 100   # value written into X_len_type for ids that are not trained on

    df_to_gt = []
    for ID, ID_len_type in enumerate(X_len_type):
        if ID == 163 or ID_len_type == skipped_marker:
            # do not train id==163. too noisy?
            # TODO : need check if model is good to have this data.
            continue
        labels = events.loc[events['id_index'] == ID].reset_index(drop=False)

        # If use cleaning,
        if clean_type != 0:
            labels = event_cleaner(labels, ID, clean_type)

        n_segments = int(ID_len_type) + 1
        total_len = n_segments * max_seq_len
        # Paint on one flat mask covering the whole series, then cut it into
        # segments: boundary-crossing periods need no special cases this way.
        mask = np.zeros(total_len)

        event_codes = labels['event'].to_numpy()
        steps = labels['step'].to_numpy()
        for row in np.flatnonzero(event_codes == 0):
            if row + 1 >= len(steps):
                continue   # trailing onset without a wakeup row
            start = int(np.clip(steps[row], 0, total_len))       # onset
            end = int(np.clip(steps[row + 1], 0, total_len))     # wakeup
            mask[start:end] = 1

        df_to_gt.extend(mask.reshape(n_segments, max_seq_len))

    return np.array(df_to_gt)

In [None]:
# Root directory of the preprocessed dataset; the parquet/csv paths below are joined onto it.
DATA_PATH = '/kaggle/input/zzz-utime-preprocessing-version1/'

In [None]:
# Load the preprocessed series (the 'step' column is not used as a feature),
# split it into fixed-length padded segments, then release the DataFrame.
train_series = pd.read_parquet(os.path.join(DATA_PATH, 'data/preprocessed/preprocessed_series.parquet')).drop(['step'], axis=1)
X, X_len_type = series_segmentation_preprocess_cleaning(train_series, clean_type=1)
del train_series

In [None]:
# Load the preprocessed events and turn them into per-step masks, using
# X_len_type so that the masks line up with the segments in X.
train_events = pd.read_csv(os.path.join(DATA_PATH, 'data/preprocessed/preprocessed_events.csv'), index_col=0)  # events length min-2 max-70 avg-36
Y_gt = events_segmentation_preprocess_cleaning(train_events, X_len_type, clean_type=1)
del train_events

### Train Valid Split

In [None]:
# # Training Data
# x_train = X[:int(len(X)*(1-VALID_PORTION))]
# y_train = Y_gt[:int(len(Y_gt)*(1-VALID_PORTION))]

# # Valid Data
# x_val = X[int(len(X)*(1-VALID_PORTION)):]
# y_val = Y_gt[int(len(Y_gt)*(1-VALID_PORTION)):]

In [None]:
# train all data!
# No validation split: x_train / y_train are just additional names for the
# full X / Y_gt arrays (no copy is made).
x_train = X
y_train = Y_gt

## MODEL

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Input, BatchNormalization, Cropping2D, \
                                    Concatenate, MaxPooling2D, Dense, \
                                    UpSampling2D, ZeroPadding2D, Lambda, Conv2D, \
                                    AveragePooling2D, DepthwiseConv2D

In [None]:
# Intermediate layer that crops the front/back of node1 so that its shape
# matches node2.
def crop_nodes_to_match(node1, node2, n_crops):
    """
    Crop node1 along its sequence axis so that it matches node2.

    INPUTS
        - node1: tensor to crop
        - node2: tensor whose sequence length is the target
        - n_crops: running count of crops performed so far. When the length
          difference is odd, its parity decides whether the extra step is
          removed from the front or from the back.

    OUTPUTS
        - (cropped_node1, n_crops): node1 itself when the shapes already
          match, otherwise the output of a Cropping2D layer, together with
          the updated crop counter.
    """
    # Drop the batch axis and the last two axes, leaving the sequence length.
    # Assumes 4-D tensors, so exactly one entry remains - TODO confirm for other callers.
    s1 = np.array(node1.get_shape().as_list())[1:-2]
    s2 = np.array(node2.get_shape().as_list())[1:-2]

    # Nothing to do when the shapes already agree.
    if not np.any(s1 != s2):
        return node1, n_crops

    n_crops += 1
    # How much longer node1 is. `np.int` was removed in NumPy 1.24, use the builtin.
    c = (s1 - s2).astype(int)
    cr = np.array([c // 2, c // 2]).flatten()
    # Put the odd leftover on alternating sides. `.item()` avoids assigning a
    # size-1 array into a scalar slot, which newer NumPy deprecates.
    cr[n_crops % 2] += (c % 2).item()
    cropped_node1 = Cropping2D([list(cr), [0, 0]])(node1)
    return cropped_node1, n_crops

def get_test_model(n_classes, seq_len, channels, depth,
                   pools, filters, kernel_size, activation, dilation, padding, kernel_reg,
                   dense_classifier_activation="tanh",
                   transition_window=1):
    """
    Build an encoder/decoder Keras model that emits one prediction per step.

    The (seq_len, channels) input is reshaped to (seq_len, 1, channels) so
    that 2-D layers with (k, 1) kernels act along the sequence axis only.

    INPUTS
        - n_classes: number of output channels per step
        - seq_len: input sequence length
        - channels: number of input features per step
        - depth: number of encoder blocks (and of decoder blocks)
        - pools: per-encoder-block pooling sizes, indexed 0..depth-1; used in
          reverse order as up-sampling sizes in the decoder
        - filters: filter count of the first encoder block; doubled after
          every encoder block and halved before every decoder block
        - kernel_size, activation, padding, kernel_reg: passed to the Conv2D layers
        - dilation: dilation rate of the encoder convolutions (the bottom
          block uses 1, the decoder convolutions do not set it)
        - dense_classifier_activation: activation of the 1x1 classifier conv
        - transition_window: kernel length of the final sigmoid convolution

    OUTPUTS
        - a Model mapping (batch, seq_len, channels) to (batch, seq_len, n_classes)
    """
    inputs = Input(shape=(seq_len,channels))
    reshaped = [-1, seq_len, 1, channels]
    inp = Lambda(lambda x: tf.reshape(x, reshaped))(inputs)
    tmp_inp = inp   # kept so the output can be cropped back to the input length

    """
    1. Encoding path
    """
    residual_connections = []
    for i in range(depth):
        l_name = "encoder" + f"_L{i}"
        conv = Conv2D(filters, (kernel_size,1),
                      activation=activation, padding=padding,
                      kernel_regularizer=kernel_reg,
                      dilation_rate=dilation,
                      name=l_name+"_conv1")(inp)
        bn = BatchNormalization(name=l_name+"_BN1")(conv)
        conv = Conv2D(filters, (kernel_size,1),
                activation=activation, padding=padding,
                kernel_regularizer=kernel_reg,
                dilation_rate=dilation,
                name=l_name+"_conv2")(bn)
        bn = BatchNormalization(name=l_name+"_BN2")(conv)
        inp = MaxPooling2D(pool_size=(pools[i],1),
                           name=l_name + "_pool")(bn)
        # The pre-pooling activation is what the decoder concatenates later.
        residual_connections.append(bn)
        filters = int(filters * 2)

    # Encoding path - Bottom
    # NOTE: `i` is the last index of the encoder loop above, so this block
    # requires depth >= 1.
    l_name = "bottom" + f"_L{i}"
    conv = Conv2D(filters, (kernel_size,1),
                  activation=activation, padding=padding,
                  kernel_regularizer=kernel_reg,
                  dilation_rate=1,
                  name=l_name+"_conv1")(inp)
    bn = BatchNormalization(name=l_name+"_BN1")(conv)
    conv = Conv2D(filters, (kernel_size, 1),
                  activation=activation, padding=padding,
                  kernel_regularizer=kernel_reg,
                  dilation_rate=1,
                  name=l_name+"_conv2")(bn)
    x = BatchNormalization(name=l_name+"_BN2")(conv)

    """
    2. Decoding path
    """
    n_crops = 0
    # Deepest skip connection first, matching the order of the decoder blocks.
    residual_connections.reverse()
    for i in range(depth):
        filters = int(filters/2)
        l_name = "decoder" + f"_L{i}"

        # Up-sampling block
        fs = pools[::-1][i]
        up = UpSampling2D(size=(fs,1),
                          name=l_name + "_up")(x)
        conv = Conv2D(filters, (fs,1),
                      activation=activation,
                      padding=padding, kernel_regularizer=kernel_reg,
                      name=l_name + "_conv1")(up)
        bn = BatchNormalization(name=l_name+"_BN1")(conv)

        # Crop and concatenate
        cropped_res,n_crops = crop_nodes_to_match(residual_connections[i], bn, n_crops)
        # cropped_res = residual_connections[i]
        merge = Concatenate(axis=-1,
                            name=l_name + "_concat")([cropped_res, bn])
        conv = Conv2D(filters, (kernel_size, 1),
                        activation=activation, padding=padding,
                        kernel_regularizer=kernel_reg,
                        name=l_name + "_conv2")(merge)
        bn = BatchNormalization(name=l_name + "_BN2")(conv)
        conv = Conv2D(filters, (kernel_size, 1),
                        activation=activation, padding=padding,
                        kernel_regularizer=kernel_reg,
                        name=l_name + "_conv3")(bn)
        x = BatchNormalization(name=l_name + "_BN3")(conv)

    """
    3. Dense class modeling
    """
    x = Conv2D(filters=n_classes,
                 kernel_size=(1,1),
                 activation=dense_classifier_activation,    # tanh
                 name="dense_classifier_out")(x)
    s = seq_len - x.get_shape().as_list()[1]
    x = ZeroPadding2D(padding=[[s//2, s//2 + s%2],[0,0]])(x)
    # Zero-pad feature x at the front and back by however much it falls short of seq_len.

    # Crop the front/back so the shape matches the original input.
    x, n_crops = crop_nodes_to_match(
        node1=x,
        node2=tmp_inp,
        n_crops=n_crops
    )

    """
    Sequence modeling
    """
    # x = AveragePooling2D((1,1),
    #                        name="average_pool")(x)
    x = Conv2D(filters=n_classes,
                 kernel_size=(transition_window, 1),
                 activation='sigmoid',  # HERE!!!
                 kernel_regularizer=regularizers.l2(1e-5),
                 padding='same',
                 name='sequence_conv_out')(x)
    # Open question: how exactly is the computation carried out when a sigmoid is attached to Conv2D?
    s = [-1,seq_len,n_classes]
    out = Lambda(lambda x:tf.reshape(x,s),
                 name="sequence_classification_reshaped")(x)

    return Model(inputs, out)

## Dice Loss

from https://www.kaggle.com/code/bigironsphere/loss-function-library-keras-pytorch

In [None]:
import numpy
import tensorflow as tf
import keras
import keras.backend as K

In [None]:
def dice_loss(y_true, y_pred, smooth=1e-6):
    """
    Soft Dice loss computed over every element of the batch at once.

    INPUTS
        - y_true: ground-truth mask; any dtype/shape holding the same number
          of elements as y_pred
        - y_pred: predicted probabilities
        - smooth: small constant added to numerator and denominator so the
          ratio stays defined when both masks are empty

    OUTPUTS
        - scalar tensor: 1 - (2*|A.B| + smooth) / (|A| + |B| + smooth)
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # The labels built in this notebook are float64 arrays of shape (N, L),
    # while the model emits float32 of shape (N, L, 1). Cast and flatten so a
    # direct call does not fail on the dtype mismatch or broadcast (N, L)
    # against (N, L, 1).
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    y_pred = tf.reshape(y_pred, [-1])

    intersection = tf.math.reduce_sum(tf.math.multiply(y_true, y_pred))
    denominator = tf.math.reduce_sum(y_true) + tf.math.reduce_sum(y_pred) + smooth
    return 1 - (2 * intersection + smooth) / denominator
# K.sum(x) corresponds to tf.math.reduce_sum(x) [sum over all dimensions.]
# The intersection is the sum of the element-wise product of the two flat vectors.

## TRAINING

In [None]:
# model
# depth must equal len(pools): every encoder block consumes one pooling size.
model = get_test_model(n_classes=N_CLASSES,
                       seq_len=MAX_SEQ_LEN,
                       channels=N_CHANNELS,
                       depth=4,
                       pools=(10,8,6,4),
                       filters=int(8*2),
                       kernel_size=5,
                       activation='elu',
                       dilation=2,
                       padding="same",
                       kernel_reg=None)

# Use the LEARNING_RATE config constant (1e-3, the value that was hard-coded
# here before) so the CONFIGs cell is the single place to change it.
model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
              loss=dice_loss,
              metrics=[tf.keras.metrics.Recall(thresholds=THRESHOLD),
                       tf.keras.metrics.Precision(thresholds=THRESHOLD),
                       tf.keras.metrics.BinaryAccuracy(threshold=THRESHOLD)
                    ],
)

# log_dir = OUT_PATH + 'logs/fit/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, 
#                                                       histogram_freq=1)

In [None]:
# TRAIN!
# Fits on the full data set; no validation data is passed, so only training
# loss/metrics are reported (verbose=2 prints one line per epoch).
model.fit(x=x_train,
          y=y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=2,
)

In [None]:
# # Temporary
# CKPT_DIR_PATH = os.path.join(OUT_PATH, 'models/ckpt')

# model.save_weights(CKPT_DIR_PATH+'/baseline_bs1_epoch50')

# Inference

In [None]:
# loss, acc = model.evaluate(x_val, y_val, verbose=2)
# print(f'acc: {acc}')
# print(f'loss: {loss}')

TODO: plot the loss and accuracy curves here, along with other metrics.

In [None]:
# idx = 10

# sample_x = tf.reshape(x_val[idx],(1,-1,2))
# sample_y = y_val[idx]

# loc=[]
# for i in range(len(sample_y)-1):
#     if sample_y[i]==0 and sample_y[i+1]==1:
#         loc.append(i+1)
# print(len(loc))

# sample_pred = model.predict(sample_x)
# sample_pred = sample_pred.reshape(-1)

# for i in loc:
#     print(sample_pred[i-100:i])
#     break

In [None]:
# Removes the names X and Y_gt only. x_train and y_train were bound to the
# same objects earlier, so the arrays stay alive as long as those names exist.
del X, Y_gt

# Let's submit this lame DL approach.

In [None]:
# Prediction threshold: values above this are considered sleeping steps.
# It is equal to the model's metric threshold (maybe, needs to be checked).
# Re-declared here with the same value as in the CONFIGs cell.
THRESHOLD = 0.5

In [None]:
# NOTE(review): the original header said "Premise: y_true is one hot vector";
# nothing in this function touches labels.
def series_segmentation_preprocess_test(series, max_seq_len=None):
    """
    Split every id's test series into zero-padded segments of equal length.

    INPUTS
        - series: preprocessed series; must contain an 'id_index' column, all
          other columns are used as features. The number of ids is taken from
          the last row's 'id_index', as before.
        - max_seq_len: segment length; defaults to the module-level MAX_SEQ_LEN.

    OUTPUTS
        - X: (N, max_seq_len, n_features) truncated, padded segments
        - X_len_type: one entry per id; k means the id was split into k+1
          segments (k in 0..3).

    RAISES
        - AssertionError when an id needs more than 4 segments. It is raised
          explicitly: the previous `assert False` is stripped under `python -O`,
          which silently dropped the id and shifted every later id's segments.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    max_segments = 4   # same limit as the training-time segmentation

    X = []
    X_len_type = []
    # how many people?
    total_id_num = int(series['id_index'].iloc[-1]) + 1

    for i in range(total_id_num):
        values = (series.loc[series['id_index'] == i]
                  .drop(['id_index'], axis=1)
                  .to_numpy())

        # Ceiling division; an empty series still yields one all-zero segment,
        # exactly like the old "len <= MAX_SEQ_LEN" branch did.
        n_segments = max(1, -(-len(values) // max_seq_len))
        if n_segments > max_segments:
            raise AssertionError(
                f"id_index {i} has {len(values)} rows, more than "
                f"{max_segments} * max_seq_len = {max_segments * max_seq_len}; "
                "splitting into more segments is not implemented."
            )

        for k in range(n_segments):
            segment = values[k * max_seq_len:(k + 1) * max_seq_len]
            pad_amount = max_seq_len - len(segment)
            X.append(np.pad(segment, ((0, pad_amount), (0, 0)),
                            'constant', constant_values=0))
        X_len_type.append(n_segments - 1)

    return np.array(X), np.array(X_len_type)

In [None]:
# test data processing
# Load the raw test series; 'timestamp' and 'step' are not used as features.
series = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet').drop(['timestamp','step'], axis=1)

# Give every series_id a dense integer id_index (0..n-1), then replace the
# string id by that index. id_map is kept to translate indices back into
# series_id strings when the submission is written.
id_map = pd.DataFrame({'series_id':series.series_id.unique(),
                       'id_index': [i for i in range(len(series.series_id.unique()))]})
id_map.id_index = id_map.id_index.astype(np.uint16)
series = series.merge(right=id_map, on='series_id').drop(columns='series_id')

# Normalize anglez, enmo
# NOTE(review): the mean/std constants are presumably the statistics used when
# the training data was preprocessed - confirm against that notebook.
mean_enmo = 0.041315034
std_enmo = 0.09743800759315491
series['enmo'] = (series['enmo'] - mean_enmo)/std_enmo
mean_anglez = -8.810453
std_anglez = 30.157093048095703
series['anglez'] = (series['anglez'] - mean_anglez)/std_anglez

In [None]:
def index_to_id(ID_int, id_map):
    """Return the series_id stored at positional row ID_int of id_map."""
    return id_map["series_id"].iloc[ID_int]

In [None]:
# Naive Prediction
test_inputs, test_len_types = series_segmentation_preprocess_test(series)
test_lens = np.add(test_len_types, 1)   # number of segments per id
del series

sub_ids=[]
sub_steps=[]
sub_events=[]
sub_scores=[]

segment_offset = 0   # index into test_inputs of the current id's first segment
for ID, test_len in enumerate(test_lens):
    # make prediction
    # All segments of one id go through a single predict call. batch_size=1
    # keeps the per-step memory footprint of the old one-segment-at-a-time
    # loop, and the name `input` no longer shadows the builtin.
    segment_batch = test_inputs[segment_offset:segment_offset + test_len]
    segment_offset += test_len
    pred = model.predict(segment_batch, batch_size=1).reshape(-1)

    # make submission from prediction
    # Inference algorithm here !
    # Vectorized form of the original state machine: an 'onset' is the first
    # step above THRESHOLD while awake, a 'wakeup' the first step below
    # THRESHOLD while asleep. Steps exactly equal to THRESHOLD never change
    # the state, so they are dropped before looking for changes.
    signal = np.where(pred > THRESHOLD, 1, np.where(pred < THRESHOLD, -1, 0))
    decisive_steps = np.flatnonzero(signal)
    decisive_states = signal[decisive_steps]
    # State before each decisive step; the series starts awake (-1).
    previous_states = np.concatenate(([-1], decisive_states[:-1]))
    changed = decisive_states != previous_states
    event_steps = decisive_steps[changed]
    event_states = decisive_states[changed]

    series_id = index_to_id(ID, id_map)   # looked up once per id instead of once per event
    sub_ids.extend([series_id] * len(event_steps))
    sub_steps.extend(event_steps.tolist())
    sub_events.extend(['onset' if state == 1 else 'wakeup' for state in event_states])
    sub_scores.extend(pred[event_steps])

submission = pd.DataFrame(data={'series_id':sub_ids, 'step':sub_steps, 'event':sub_events, 'score':sub_scores})
submission.insert(0, "row_id", list(range(len(submission))))
submission.to_csv('submission.csv', index=False)
# display(submission)