In [1]:
from __future__ import absolute_import, division, print_function

import numpy as np
import os
from sklearn import model_selection 

In [2]:
# Seed NumPy's legacy global RNG so that every random draw in the cells below
# is reproducible when the notebook is run top to bottom.
np.random.seed(seed=42)

In [3]:
# Dimensions of the synthetic dataset generated by this notebook.
n_sample = 100        # number of generated samples (one time series each)
n_dim = 7             # number of feature channels per time step
n_max_timestamp = 17  # upper bound on the number of time steps per sample
n_class = 3           # number of binary label columns per sample

In [4]:
# Pre-allocate one object slot per sample. Each slot is filled later with a
# per-sample array, so samples may have different numbers of time steps.
# NOTE: `input` shadows Python's builtin of the same name; the name is kept
# because the rest of the notebook refers to it.
input, masking, timestamp = (
    np.empty(shape=(n_sample,), dtype=object) for _slot in range(3)
)
# Placeholder label matrix; np.empty leaves its contents uninitialized.
label_taskname = np.empty(shape=(n_sample, n_class), dtype=int)
print(label_taskname.shape)

(100, 3)


In [5]:

# Draw one independent Bernoulli label column per entry of `positive_rates`;
# column k is 1 with probability positive_rates[k]. The columns are drawn in
# order, so the random stream is consumed as three consecutive binomial calls.
positive_rates = (0.3, 0.6, 0.2)
label_columns = [
    np.random.binomial(1, rate, size=n_sample) for rate in positive_rates
]
# Stack the 1-D columns side by side -> shape (n_sample, len(positive_rates)).
label_taskname = np.stack(label_columns, axis=-1)
print(label_taskname.shape)

(100, 3)


In [6]:
# Generate an irregularly spaced, increasing time axis for every sample.
for i in range(n_sample):
    # Sequence length is drawn from [5, n_max_timestamp). randint's upper
    # bound is exclusive, so the longest series has n_max_timestamp - 1 steps.
    # Using the constant (instead of a repeated literal) keeps this cell in
    # sync with the configuration cell.
    len_t_i = np.random.randint(5, n_max_timestamp)
    # Gaps between consecutive observations are uniform in [1, 11).
    timestamp_i = np.random.random(size=(len_t_i)) * 10 + 1
    # Accumulate the gaps and shift so that every series starts at time 0.
    timestamp_i = np.cumsum(timestamp_i) - timestamp_i[0]
    timestamp[i] = timestamp_i
print(timestamp.shape)
print(timestamp[0].shape)

(100,)
(8,)


In [7]:
# Synthesize the feature series. Each active label adds one cosine component
# to every channel: label 0 -> cos(x), label 1 -> cos(2x) + 1,
# label 2 -> cos(5x) + 2, where x = per-channel phase + time.
for i in range(n_sample):
    active = label_taskname[i]
    # Random starting phase per channel, in [0, 2*pi).
    channel_phase = np.random.random(size=n_dim)*np.pi*2
    # Broadcast (n_dim, 1) + (1, n_steps) -> (n_dim, n_steps); computed once
    # and shared by all three components.
    phase_grid = channel_phase[:, np.newaxis] + timestamp[i][np.newaxis, :]
    signal = np.zeros(shape=(n_dim, len(timestamp[i])), dtype=float)
    if active[0]:
        signal += np.cos(phase_grid)
    if active[1]:
        signal += np.cos(2 * phase_grid) + 1
    if active[2]:
        signal += np.cos(5 * phase_grid) + 2
    # Store time-major: (n_steps, n_dim).
    input[i] = signal.T

print(input.shape)
print(input[0].shape)

(100,)
(8, 7)


In [8]:
# Simulate missing observations. An entry is marked observed (mask == 1) when
# its uniform draw exceeds 0.7; every other entry of the input is overwritten
# with NaN in place.
for i in range(n_sample):
    n_steps = len(timestamp[i])
    observed = np.random.random(size=(n_steps, n_dim)) > 0.7
    masking[i] = observed.astype(int)
    input[i][~observed] = np.nan

print(masking.shape)
print(masking[0].shape)
print(masking[0][0])
print(input[0][0])

(100,)
(8, 7)
[1 0 1 0 1 0 0]
[0.32611525        nan 1.86661421        nan 1.98591935        nan
        nan]


In [9]:
# Number of cross-validation splits built in the following cells.
n_split = 5

In [10]:
# fold_taskname[k] holds three index arrays for split k:
#   [0] training indices, [1] validation indices, [2] testing indices.
fold_taskname = np.empty(shape=(n_split, 3), dtype=object)

# Use the notebook constants rather than repeated literals so the folds stay
# consistent with n_sample / n_split if either is changed.
idx_all = list(range(n_sample))
kfold = model_selection.KFold(n_splits=n_split, shuffle=False)
for i_split, (train_val_idx, test_idx) in enumerate(kfold.split(idx_all)):
    fold_taskname[i_split][2] = test_idx  # indices for testing
for i_split in range(n_split):
    # Validation reuses the test block of the next split (wrapping around), so
    # within one split the validation and test indices never overlap.
    val_idx = fold_taskname[(i_split + 1) % n_split][2]
    fold_taskname[i_split][1] = val_idx  # indices for validation
    print(val_idx)
    # Training is everything that is neither validation nor test, kept sorted.
    held_out = set(val_idx).union(fold_taskname[i_split][2])
    fold_taskname[i_split][0] = np.asarray(sorted(set(idx_all) - held_out))  # indices for training

print(fold_taskname[0][0].shape, fold_taskname[0][1].shape, fold_taskname[0][2].shape)

[20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
[40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59]
[60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79]
[80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
(60,) (20,) (20,)


In [11]:
# Per-split, per-feature normalization statistics, initialized to NaN.
# Only slot 0 of the middle axis is filled below, using the training indices
# of each split; slots 1 and 2 are left as NaN.
stats_shape = (n_split, 3, n_dim)
mean_taskname = np.full(stats_shape, np.nan)
std_taskname = np.full(stats_shape, np.nan)
for i_split in range(n_split):
    train_idx = fold_taskname[i_split][0]
    # Stack all training series along the time axis -> (total_steps, n_dim).
    x_tr = np.concatenate(input[train_idx], axis=0)
    # NaN-aware reduction over axis 0 yields one value per feature column,
    # ignoring the masked-out (NaN) entries.
    mean_taskname[i_split][0] = np.nanmean(x_tr, axis=0)
    std_taskname[i_split][0] = np.nanstd(x_tr, axis=0)

print(mean_taskname[0][0])
print(std_taskname[0][0])

[1.19445546 1.12043461 1.1152174  1.2170161  1.23047909 1.12230011
 1.1178312 ]
[1.28396076 1.23046311 1.21772513 1.31423306 1.34789895 1.36239225
 1.3876464 ]


In [12]:
# Persist the generated dataset and the fold definitions under ./data/sample.
sample_dir = os.path.join('.', 'data', 'sample')
os.makedirs(sample_dir, exist_ok=True)

# NOTE: input, masking, timestamp and fold_taskname are object-dtype arrays,
# which np.savez stores via pickle; reading them back requires
# np.load(..., allow_pickle=True).
np.savez(
    os.path.join(sample_dir, 'data.npz'),
    input=input,
    masking=masking,
    timestamp=timestamp,
    label_taskname=label_taskname,
)
np.savez(
    os.path.join(sample_dir, 'fold.npz'),
    fold_taskname=fold_taskname,
    mean_taskname=mean_taskname,
    std_taskname=std_taskname,
)