In [1]:
import h5py
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.layers import BatchNormalization, Dense, LSTM, LayerNormalization, Normalization

In [2]:
def load_embeddings(source):
    """Read every array stored in an embeddings HDF5 file into memory.

    Args:
        source: Path to an HDF5 file containing the datasets
            'att', 'x', 't', 'y', 'id' and 'm'.

    Returns:
        A tuple ``(att, y, ids, m, x, t)`` holding the full contents of the
        'att', 'y', 'id', 'm', 'x' and 't' datasets respectively. Note the
        order differs from the order in which the datasets are read.

    Raises:
        OSError: If the file cannot be opened as HDF5.
        KeyError: If any of the expected datasets is missing.
    """
    # The context manager guarantees the HDF5 handle is released even when a
    # dataset is missing. `[()]` copies each dataset fully into memory, so the
    # returned values stay valid after the file is closed.
    with h5py.File(source, 'r') as hf:
        att = hf['att'][()]
        x = hf['x'][()]
        t = hf['t'][()]
        y = hf['y'][()]
        ids = hf['id'][()]
        m = hf['m'][()]
    return att, y, ids, m, x, t

In [6]:
%%time
# Re-split the pooled embeddings of each dataset into train/val/test files,
# one output folder per fold.

# HDF5 dataset names written to every split file, in write order.
SPLIT_KEYS = ('att', 'x', 't', 'y', 'id', 'm')
# Fraction of the pooled samples assigned to the test split; the validation
# split gets the same number of samples and the remainder goes to training.
HOLDOUT_FRACTION = 0.2
# Base seed so the split written to disk can be regenerated exactly; the fold
# index is added so that each fold gets a different (but reproducible) shuffle.
SPLIT_SEED = 42

for fold_n in range(1):
    for dataset in ['alcock_50']:
        # Use the fold index in the path so additional folds do not overwrite
        # fold 0.
        folder = '../../encoded/{}/fold_{}'.format(dataset, fold_n)
        os.makedirs(folder, exist_ok=True)

        # Pool the existing train/val/test embeddings before re-splitting.
        # load_embeddings returns its arrays in the order (att, y, id, m, x, t).
        loaded = [
            load_embeddings('../../embeddings/{}/{}.h5'.format(dataset, part))
            for part in ('train', 'val', 'test')
        ]
        att, y, i, m, x, t = (np.concatenate(group) for group in zip(*loaded))

        # Seeded permutation of the sample positions (per fold).
        rng = np.random.default_rng(SPLIT_SEED + fold_n)
        indices = rng.permutation(len(att))
        n_test = int(len(att) * HOLDOUT_FRACTION)

        test_idx = indices[:n_test]
        val_idx = indices[n_test:2 * n_test]
        train_idx = indices[2 * n_test:]

        # Same order as SPLIT_KEYS so arrays and dataset names can be zipped.
        pooled = (att, x, t, y, i, m)
        att_test, x_test, t_test, y_test, i_test, m_test = (a[test_idx] for a in pooled)
        att_val, x_val, t_val, y_val, i_val, m_val = (a[val_idx] for a in pooled)
        att_train, x_train, t_train, y_train, i_train, m_train = (a[train_idx] for a in pooled)

        splits = (
            ('train', (att_train, x_train, t_train, y_train, i_train, m_train)),
            ('val', (att_val, x_val, t_val, y_val, i_val, m_val)),
            ('test', (att_test, x_test, t_test, y_test, i_test, m_test)),
        )
        for split_name, arrays in splits:
            with h5py.File('{}/{}.h5'.format(folder, split_name), 'w') as hf:
                for key, values in zip(SPLIT_KEYS, arrays):
                    # Identifiers are stored as fixed-width byte strings.
                    if key == 'id':
                        values = values.astype('S')
                    hf.create_dataset(key, data=values)

CPU times: user 39.8 ms, sys: 280 ms, total: 320 ms
Wall time: 346 ms
