In [1]:
import numpy as np
import pandas as pd
from utils.dataloaders.InstanceDataset import InstanceDataset
from utils.plot import plot_trace_prediction
import torch
import math
import h5py
from torchinfo import summary 
from nn.eq_transformer import DataGenerator

In [2]:
# event_hdf5_file = "data/instance_samples/Instance_events_counts_10k.hdf5"
# event_metadata_file = "data/instance_samples/metadata_Instance_events_10k.csv"
# noise_hdf5_file = "data/instance_samples/Instance_noise_1k.hdf5"
# noise_metadata_file = "data/instance_samples/metadata_Instance_noise_1k.csv"

event_hdf5_file = "data/instance_samples/earthquake_20k.hdf5"
event_metadata_file = "data/instance_samples/earthquake_20k.csv"
noise_hdf5_file = "data/instance_samples/noise_20k.hdf5"
noise_metadata_file = "data/instance_samples/noise_20k.csv"

In [3]:

def _shuffle_events(event_metadata_file, noise_metadata_file, random_state, output_event_metadata_file, output_noise_metadata_file, balance=False, remove_empty_phase=False):
    
    """ 
    
    Split the list of input data into training, validation, and test set.

    Parameters
    ----------
    event_metadata_file: str
        path to event metadata file 
    
    noise_metadata_file: str
        path to event metadata file 

    random_state: int
        random state for reproducibility

    output_event_metadata_file: bool
       Path to the output event metadata file

    output_noise_metadata_file: str
       Path to the output noise metadata file

    balance: bool
       Should have equal noise and events 

    remove_empty_phase: bool
       Remove events that do not have both phases
              
    Returns
    -------   
    (output_event_metadata_file, output_noise_metadata_file)
    """       

    noise_metadata = pd.read_csv(noise_metadata_file)
    event_metadata = pd.read_csv(event_metadata_file, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

    if remove_empty_phase:
        event_metadata = event_metadata[event_metadata["trace_P_arrival_sample"].notna() & event_metadata["trace_S_arrival_sample"].notna()]
    
    event_metadata = event_metadata.sample(frac=1, random_state=random_state) 
    noise_metadata = noise_metadata.sample(frac=1, random_state=random_state) 

    if balance:
        min_count = min(event_metadata.shape[0], noise_metadata.shape[0])
        event_metadata = event_metadata.head(min_count)
        noise_metadata = noise_metadata.head(min_count)

    event_metadata.to_csv(output_event_metadata_file, index=False)
    noise_metadata.to_csv(output_noise_metadata_file, index=False)

    return output_event_metadata_file, output_noise_metadata_file 


shuffled_event_metadata_file, shuffled_noise_metadata_file = _shuffle_events(
    event_metadata_file=event_metadata_file,
    noise_metadata_file=noise_metadata_file,
    random_state=9,
    output_event_metadata_file='temp/event_metadata.csv',
    output_noise_metadata_file='temp/noise_metadata.csv',
    balance=True,
    remove_empty_phase=True
)

In [4]:
event_metadata = pd.read_csv(shuffled_event_metadata_file, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

event_metadata = event_metadata[event_metadata["trace_P_arrival_sample"].notna() & event_metadata["trace_S_arrival_sample"].notna()]

event_metadata.head(10)
# event_metadata.info(verbose=True)
# event_metadata.describe()
# event_metadata.shape

# test = event_metadata['trace_S_arrival_sample'] - event_metadata['trace_P_arrival_sample']
# test.describe()

# print(event_metadata["trace_name"].iloc[4969])
# print(event_metadata["trace_P_arrival_sample"][3])


Unnamed: 0,trace_start_time,trace_P_arrival_time,trace_S_arrival_time,trace_P_arrival_sample,trace_S_arrival_sample,trace_name,trace_GPD_P_number,trace_GPD_S_number,trace_EQT_number_detections,trace_EQT_P_number,trace_EQT_S_number
0,2016-11-19T17:07:47.00Z,2016-11-19T17:08:03.75Z,2016-11-19T17:08:10.47Z,1675,2347.0,10220691.IV.LNSS..HH,0.0,3.0,3.0,3.0,2.0
1,2016-11-20T01:41:35.53Z,2016-11-20T01:41:55.05Z,2016-11-20T01:41:56.60Z,1952,2107.0,10239411.IV.T1299..EH,2.0,2.0,1.0,1.0,1.0
2,2016-11-21T03:45:47.84Z,2016-11-21T03:46:06.14Z,2016-11-21T03:46:14.25Z,1830,2641.0,10299921.IV.CESX..HH,1.0,1.0,1.0,1.0,1.0
3,2016-11-18T00:09:43.93Z,2016-11-18T00:10:02.27Z,2016-11-18T00:10:11.94Z,1834,2801.0,10128831.IV.FIAM..HN,,,,,
4,2016-11-19T21:53:23.98Z,2016-11-19T21:53:44.94Z,2016-11-19T21:53:48.33Z,2096,2435.0,10230551.IV.LNSS..HH,1.0,4.0,1.0,1.0,1.0
5,2016-11-16T02:59:59.93Z,2016-11-16T03:00:17.15Z,2016-11-16T03:00:20.15Z,1721,2021.0,10004521.IV.SSM1..HN,,,,,
6,2016-11-22T01:28:01.85Z,2016-11-22T01:28:21.34Z,2016-11-22T01:28:27.65Z,1948,2580.0,10350381.IV.T1256..HN,,,,,
7,2016-11-18T17:55:03.37Z,2016-11-18T17:55:22.75Z,2016-11-18T17:55:26.55Z,1937,2318.0,10171861.IV.GUMA..HH,1.0,3.0,1.0,1.0,1.0
8,2016-11-16T15:07:58.35Z,2016-11-16T15:08:16.67Z,2016-11-16T15:08:21.85Z,1832,2350.0,10037661.IV.T1247..HH,1.0,1.0,1.0,1.0,1.0
9,2016-11-20T19:21:16.26Z,2016-11-20T19:21:34.69Z,2016-11-20T19:21:41.30Z,1843,2504.0,10280601.IV.LNSS..HH,0.0,3.0,1.0,1.0,1.0


In [5]:
noise_metadata = pd.read_csv(shuffled_noise_metadata_file)

noise_metadata.head()

# print(noise_metadata["trace_name"].iloc[170])

Unnamed: 0,source_id,station_network_code,station_code,station_location_code,station_channels,station_latitude_deg,station_longitude_deg,station_elevation_m,station_vs_30_mps,station_vs_30_detail,...,trace_Z_upper_quartile_counts,trace_E_spikes,trace_N_spikes,trace_Z_spikes,trace_name,trace_GPD_P_number,trace_GPD_S_number,trace_EQT_number_detections,trace_EQT_P_number,trace_EQT_S_number
0,20060912T030438,IV,VVLD,,EH,41.86965,13.62324,1051.0,505.0,Vs30 extracted from ShakeMap,...,21.0,0.0,0.0,0.0,20060912T030438.IV.VVLD..EH,0,0,0.0,0.0,0.0
1,20090520T062013,IV,T0104,,EH,42.3599,13.3382,754.0,473.0,Vs30 extracted from ShakeMap,...,42.0,0.0,0.0,0.0,20090520T062013.IV.T0104..EH,0,1,0.0,0.0,0.0
2,20090317T025034,IV,VVLD,,HH,41.86965,13.62324,1051.0,505.0,Vs30 extracted from ShakeMap,...,143.0,0.0,0.0,0.0,20090317T025034.IV.VVLD..HH,0,0,0.0,0.0,0.0
3,20090613T122035,IV,CSNT,,HH,43.47311,11.29017,636.0,580.0,Vs30 extracted from ShakeMap,...,29.0,0.0,0.0,0.0,20090613T122035.IV.CSNT..HH,1,3,0.0,0.0,0.0
4,20080930T112359,IV,IVPL,,HH,38.3763,14.9801,486.0,664.0,Vs30 extracted from ShakeMap,...,35.0,0.0,0.0,0.0,20080930T112359.IV.IVPL..HH,0,1,0.0,0.0,0.0


In [6]:
#Pass split percentage = [1] to not split and access the whole data

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", split_index=0, split_percentage=[1], padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=True, phase_padding=20)

print(f"Dataset size = {len(dataset)}")

print("################## Accessing first event ##################")
index = 0
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")

print("################## Accessing last event ##################")
index = event_metadata.shape[0] - 1
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")


print("################## Accessing first noise ##################")
index = event_metadata.shape[0]
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")


print("################## Accessing last noise ##################")
index = noise_metadata.shape[0] + event_metadata.shape[0] - 1
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")

print("################## Out of Index ##################")
index = noise_metadata.shape[0] + event_metadata.shape[0]
print(f"index = {index}")
f1, t1, p1, s1, trace_name = dataset[index]
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")

Dataset size = 28014
################## Accessing first event ##################
index = 0
Detection target = 1
input shape = torch.Size([3, 12000])
trace name = 10220691.IV.LNSS..HH
################## Accessing last event ##################
index = 14006
Detection target = 1
input shape = torch.Size([3, 12000])
trace name = 10243491.IV.T1245..HN
################## Accessing first noise ##################
index = 14007
Detection target = 0
input shape = torch.Size([3, 12000])
trace name = 20060912T030438.IV.VVLD..EH
################## Accessing last noise ##################
index = 28013
Detection target = 0
input shape = torch.Size([3, 12000])
trace name = 20080914T010641.IV.MODR..HH
################## Out of Index ##################
index = 28014
Detection target = 0
input shape = torch.Size([3, 12000])
trace name = 20060912T030438.IV.VVLD..EH


In [8]:
def get_3c_wave_form(filename, trace_name):
    with h5py.File(filename, 'r') as f:
        return torch.FloatTensor(f['data'][trace_name][:])

In [9]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]

train_dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", split_index=0, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)
val_dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", split_index=1, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)
test_dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", split_index=2, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)

#Check dataset sizes
train_events_count = math.floor(event_metadata.shape[0] * split_percentage[0])
train_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[0])
print(f"Train size - Expected:{train_events_count+train_noise_count} - Actual:{len(train_dataset)}")


val_events_count = math.floor(event_metadata.shape[0] * split_percentage[1])
val_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[1])
print(f"Val size - Expected:{val_events_count+val_noise_count} - Actual:{len(val_dataset)}")


test_events_count = math.floor(event_metadata.shape[0] * split_percentage[2])
test_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[2])
print(f"Test size - Expected:{test_events_count+test_noise_count} - Actual:{len(test_dataset)}")


#Check train dataset indexes
absolute_index = 0
data, target, p, s, name = train_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Train first event data matches: {torch.equal(data, actual_data)}")
print(f"Train first target matches: {target == 1}")

data, target, p, s, name = train_dataset[train_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Train first noise data matches: {torch.equal(data, actual_data)}")
print(f"Train first noise target matches: {target == 0}")

absolute_index = train_events_count - 1
data, target, p, s, name = train_dataset[train_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Train last event data matches: {torch.equal(data, actual_data)}")
print(f"Train last event target matches: {target == 1}")

absolute_index = train_noise_count - 1
data, target, p, s, name = train_dataset[train_events_count + train_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Train last noise data matches: {torch.equal(data, actual_data)}")
print(f"Train last noise target matches: {target == 0}")

#Check validation dataset indexes
absolute_index = train_events_count
data, target, p, s, name = val_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Val first event data matches: {torch.equal(data, actual_data)}")
print(f"Val first target matches: {target == 1}")

absolute_index = train_noise_count
data, target, p, s, name = val_dataset[val_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Val first noise data matches: {torch.equal(data, actual_data)}")
print(f"Val first noise target matches: {target == 0}")

absolute_index = train_events_count + val_events_count - 1
data, target, p, s, name = val_dataset[val_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Val last event data matches: {torch.equal(data, actual_data)}")
print(f"Val last event target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count - 1
data, target, p, s, name = val_dataset[val_events_count + val_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Val last noise data matches: {torch.equal(data, actual_data)}")
print(f"Val last noise target matches: {target == 0}")

#Check test dataset indexes
absolute_index = train_events_count + val_events_count
data, target, p, s, name = test_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Test first event data matches: {torch.equal(data, actual_data)}")
print(f"Test first target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count
data, target, p, s, name = test_dataset[test_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Test first noise data matches: {torch.equal(data, actual_data)}")
print(f"Test first noise target matches: {target == 0}")

absolute_index = train_events_count + val_events_count + test_events_count - 1
data, target, p, s, name = test_dataset[test_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Test last event data matches: {torch.equal(data, actual_data)}")
print(f"Test last event target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count + test_noise_count - 1
data, target, p, s, name = test_dataset[test_events_count + test_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Test last noise data matches: {torch.equal(data, actual_data)}")
print(f"Test last noise target matches: {target == 0}")


Train size - Expected:23810 - Actual:23810
Val size - Expected:1400 - Actual:1400
Test size - Expected:2800 - Actual:2800
Train first event data matches: True
Train first target matches: True
Train first noise data matches: True
Train first noise target matches: True
Train last event data matches: True
Train last event target matches: True
Train last noise data matches: True
Train last noise target matches: True
Val first event data matches: True
Val first target matches: True
Val first noise data matches: True
Val first noise target matches: True
Val last event data matches: True
Val last event target matches: True
Val last noise data matches: True
Val last noise target matches: True
Test first event data matches: True
Test first target matches: True
Test first noise data matches: True
Test first noise target matches: True
Test last event data matches: True
Test last event target matches: True
Test last noise data matches: True
Test last noise target matches: True


In [11]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

index = 102

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "box_square", padding_type="percentage", start_padding_value=0, end_padding_value=1.4, target_phase=True, phase_padding=20, split_index=split_index, split_percentage=split_percentage)


trace_name = f'{event_metadata["trace_name"].iloc[index]}-square-sample'
p_sample = event_metadata["trace_P_arrival_sample"].iloc[index]
s_sample = event_metadata["trace_S_arrival_sample"].iloc[index]
input, target, p_target, s_target, name = dataset[index]

sample_size = input.shape[1]

if np.isnan(p_sample):
    p_sample = 0

if np.isnan(s_sample):
    s_sample = sample_size - 1

s_sample = round(s_sample)

plot_trace_prediction(trace_name, input, np.array([p_sample]), np.array([s_sample]), target, p_target, s_target, "output/figures")

  plt.tight_layout()


In [59]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

index = 5

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, split_index=split_index, split_percentage=split_percentage)


trace_name = f'{event_metadata["trace_name"].iloc[index]}-square-sample'
p_sample = event_metadata["trace_P_arrival_sample"].iloc[index]
s_sample = event_metadata["trace_S_arrival_sample"].iloc[index]
input, target, p_target, s_target, _ = dataset[index]

sample_size = input.shape[1]

if np.isnan(p_sample):
    p_sample = 0

if np.isnan(s_sample):
    s_sample = sample_size - 1

s_sample = round(s_sample)

plot_trace_prediction(trace_name, input, np.array([p_sample]), np.array([s_sample]), target, p_target, s_target, "output/figures")

  plt.tight_layout()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


In [60]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=False, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=2
)

for i, batch in enumerate(train_loader):
    print(f'index={i}')
    #batch[0][i] == tensor 3x12000 of the data of the i element
    #batch[1][i] == target of i element
    #batch[2][i] == p_target of i element
    #batch[3][i] == s_target of i element
    print(len(batch))
    print(batch[0][0].size())
    print(batch[1][0])
    print(batch[2][0])
    print(batch[3][0])

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]
    break

print("Print data are good")

for i in range(data.shape[0]):
    print("=============")
    trace_name = trace_names[i]
    y = targets[i]
    print(f"y: {y} - trace name: {trace_name}")
    if y == 0:
        actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
        print(f"Data is noise and matches: {torch.equal(data[i], actual_data)}")
    else:
        actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
        print(f"Data is signal and matches: {torch.equal(data[i], actual_data)}")
        

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


index=0
5
torch.Size([3, 12000])
tensor(1)
tensor(-1)
tensor(-1)
Print data are good
y: 1 - trace name: 10210101.IV.SSFR..HN
Data is signal and matches: True
y: 0 - trace name: 20100120T120736.GU.BHB..HH
Data is noise and matches: True
y: 0 - trace name: 20090827T225325.IV.RMP..HH
Data is noise and matches: True
y: 1 - trace name: 10281481.IV.T1256..HN
Data is signal and matches: True
y: 0 - trace name: 20080129T190140.IV.SCTE..HH
Data is noise and matches: True
y: 1 - trace name: 10126201.IV.T1216..EH
Data is signal and matches: True
y: 1 - trace name: 10308001.IV.T1219..EH
Data is signal and matches: True
y: 1 - trace name: 10284211.IV.GUMA..HN
Data is signal and matches: True
y: 1 - trace name: 10340251.IV.CESI..HH
Data is signal and matches: True
y: 1 - trace name: 10103471.IV.T1241..HN
Data is signal and matches: True


In [61]:
#Check data are loading properly

#split_percentage=[train, validation, test]

split_percentage=[0.91, 0.04, 0.05]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=True, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=128, shuffle=True, num_workers=2
)
should_continue = True

for i, batch in enumerate(train_loader):

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]

    hasEvent = False
    hasNoise = False

    for i in range(data.shape[0]):
        trace_name = trace_names[i]
        y = targets[i]
        if y == 0:
            actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
            hasNoise = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
        else:
            actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
            hasEvent = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
    
    if not should_continue:
        break
    
    print(f'index={i}, data size={data.shape[0]}, hasNoise={hasNoise}, hasEvent={hasEvent} everything is good!')
        

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128,

In [62]:
#Check data are loading properly

#split_percentage=[train, validation, test]

split_percentage=[1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=False, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=256, shuffle=True, num_workers=2
)
should_continue = True

for i, batch in enumerate(train_loader):

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]

    hasEvent = False
    hasNoise = False

    for i in range(data.shape[0]):
        trace_name = trace_names[i]
        y = targets[i]
        if y == 0:
            actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
            hasNoise = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
        else:
            actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
            hasEvent = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
    
    if not should_continue:
        break
    
    print(f'index={i}, data size={data.shape[0]}, hasNoise={hasNoise}, hasEvent={hasEvent} everything is good!')
        

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256, hasNoise=True, hasEvent=True everything is good!
index=255, data size=256,

In [63]:
split_percentage=[1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "box_trapezoidal", padding_type="percentage", start_padding_value=0, end_padding_value=0.4, target_phase=True, phase_padding=40, remove_empty_phase=True, split_index=split_index, split_percentage=split_percentage, norm_mode='max')

datagenerator = DataGenerator(dataset, 4, True)

batch = datagenerator[0]
