In [19]:
import numpy as np
import pandas as pd
from utils.dataloaders.InstanceDataset import InstanceDataset
from utils.plot import plot_trace_prediction
import torch
import math
import h5py
from torchinfo import summary 
from nn.eq_transformer import DataGenerator

In [20]:
event_hdf5_file = "data/instance_samples/Instance_events_counts_10k.hdf5"
event_metadata_file = "data/instance_samples/metadata_Instance_events_10k.csv"
noise_hdf5_file = "data/instance_samples/Instance_noise_1k.hdf5"
noise_metadata_file = "data/instance_samples/metadata_Instance_noise_1k.csv"

In [22]:

def _shuffle_events(event_metadata_file, noise_metadata_file, random_state, output_event_metadata_file, output_noise_metadata_file, balance=False, remove_empty_phase=False):
    
    """ 
    
    Split the list of input data into training, validation, and test set.

    Parameters
    ----------
    event_metadata_file: str
        path to event metadata file 
    
    noise_metadata_file: str
        path to event metadata file 

    random_state: int
        random state for reproducibility

    output_event_metadata_file: bool
       Path to the output event metadata file

    output_noise_metadata_file: str
       Path to the output noise metadata file

    balance: bool
       Should have equal noise and events 

    remove_empty_phase: bool
       Remove events that do not have both phases
              
    Returns
    -------   
    (output_event_metadata_file, output_noise_metadata_file)
    """       

    noise_metadata = pd.read_csv(noise_metadata_file)
    event_metadata = pd.read_csv(event_metadata_file, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

    if remove_empty_phase:
        event_metadata = event_metadata[event_metadata["trace_P_arrival_sample"].notna() & event_metadata["trace_S_arrival_sample"].notna()]
    
    event_metadata = event_metadata.sample(frac=1, random_state=random_state) 
    noise_metadata = noise_metadata.sample(frac=1, random_state=random_state) 

    if balance:
        event_metadata = event_metadata.head(noise_metadata.shape[0])

    event_metadata.to_csv(output_event_metadata_file, index=False)
    noise_metadata.to_csv(output_noise_metadata_file, index=False)

    return output_event_metadata_file, output_noise_metadata_file 


shuffled_event_metadata_file, shuffled_noise_metadata_file = _shuffle_events(
    event_metadata_file=event_metadata_file,
    noise_metadata_file=noise_metadata_file,
    random_state=9,
    output_event_metadata_file='temp/event_metadata.csv',
    output_noise_metadata_file='temp/noise_metadata.csv',
    balance=True,
    remove_empty_phase=True
)

In [18]:
event_metadata = pd.read_csv(event_metadata_file, usecols=["station_code", "trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

event_metadata = event_metadata[event_metadata["trace_P_arrival_sample"].notna() & event_metadata["trace_S_arrival_sample"].notna()]

event_metadata.head(10)
# event_metadata.info(verbose=True)
# event_metadata.describe()
# event_metadata.shape

# test = event_metadata['trace_S_arrival_sample'] - event_metadata['trace_P_arrival_sample']
# test.describe()

# print(event_metadata["trace_name"].iloc[4969])
# print(event_metadata["trace_P_arrival_sample"][3])


Unnamed: 0,station_code,trace_start_time,trace_P_arrival_time,trace_S_arrival_time,trace_P_arrival_sample,trace_S_arrival_sample,trace_name,trace_GPD_P_number,trace_GPD_S_number,trace_EQT_number_detections,trace_EQT_P_number,trace_EQT_S_number
3,RM33,2016-12-04T15:34:43.69Z,2016-12-04T15:35:00.48Z,2016-12-04T15:35:07.35Z,1678,2366.0,11030611.IV.RM33..EH,3,7,2.0,2.0,2.0
4,RM33,2016-12-04T15:34:44.38Z,2016-12-04T15:35:00.48Z,2016-12-04T15:35:07.35Z,1610,2297.0,11030611.IV.RM33..HN,1,3,,,
12,T1201,2016-12-04T15:34:35.50Z,2016-12-04T15:34:58.48Z,2016-12-04T15:35:03.73Z,2298,2823.0,11030611.IV.T1201..HN,5,5,,,
13,T1202,2016-12-04T15:34:36.75Z,2016-12-04T15:34:57.47Z,2016-12-04T15:35:01.65Z,2072,2490.0,11030611.IV.T1202..EH,2,3,1.0,1.0,1.0
14,T1204,2016-12-04T15:34:40.51Z,2016-12-04T15:34:58.23Z,2016-12-04T15:35:03.27Z,1771,2276.0,11030611.IV.T1204..EH,3,4,3.0,3.0,3.0
15,T1211,2016-12-04T15:34:23.35Z,2016-12-04T15:35:01.13Z,2016-12-04T15:35:08.18Z,3778,4483.0,11030611.IV.T1211..EH,0,4,1.0,0.0,1.0
16,T1211,2016-12-04T15:34:41.24Z,2016-12-04T15:35:01.13Z,2016-12-04T15:35:08.18Z,1989,2694.0,11030611.IV.T1211..HN,0,3,,,
17,T1212,2016-12-04T15:34:17.66Z,2016-12-04T15:34:56.60Z,2016-12-04T15:34:59.98Z,3893,4232.0,11030611.IV.T1212..EH,2,9,1.0,1.0,1.0
18,T1212,2016-12-04T15:34:30.55Z,2016-12-04T15:34:56.60Z,2016-12-04T15:34:59.98Z,2605,2943.0,11030611.IV.T1212..HN,2,12,,,
19,T1213,2016-12-04T15:34:22.49Z,2016-12-04T15:34:56.72Z,2016-12-04T15:35:00.25Z,3422,3775.0,11030611.IV.T1213..EH,2,4,1.0,1.0,1.0


In [5]:
noise_metadata = pd.read_csv(noise_metadata_file)

noise_metadata.head()

# print(noise_metadata["trace_name"].iloc[170])

Unnamed: 0,source_id,station_network_code,station_code,station_location_code,station_channels,station_latitude_deg,station_longitude_deg,station_elevation_m,station_vs_30_mps,station_vs_30_detail,...,trace_Z_upper_quartile_counts,trace_E_spikes,trace_N_spikes,trace_Z_spikes,trace_name,trace_GPD_P_number,trace_GPD_S_number,trace_EQT_number_detections,trace_EQT_P_number,trace_EQT_S_number
0,20131205T210857,IV,ORZI,,HN,45.4056,9.9307,83.0,297.0,Vs30 extracted from ShakeMap,...,10.0,0.0,0.0,0.0,20131205T210857.IV.ORZI..HN,1,0,,,
1,20070528T142806,IV,BOB,,HH,44.76792,9.44782,910.0,766.0,Vs30 extracted from ShakeMap,...,1542.0,0.0,0.0,0.0,20070528T142806.IV.BOB..HH,0,3,0.0,0.0,0.0
2,20100131T124455,IV,CRAC,,EH,40.3814,16.435,384.0,575.0,Vs30 extracted from ShakeMap,...,539.25,0.0,0.0,0.0,20100131T124455.IV.CRAC..EH,0,1,0.0,0.0,0.0
3,20181206T001526,IV,HPAC,,HH,36.7085,15.0372,70.0,584.0,Vs30 extracted from ShakeMap,...,1576.0,0.0,0.0,0.0,20181206T001526.IV.HPAC..HH,0,1,0.0,0.0,0.0
4,20190720T231539,IV,MMGO,,HH,37.66195,12.97673,397.0,801.0,Vs30 extracted from ShakeMap,...,36.0,0.0,0.0,0.0,20190720T231539.IV.MMGO..HH,3,0,0.0,0.0,0.0


In [6]:
#Pass split percentage = [1] to not split and access the whole data

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", split_index=0, split_percentage=[1], padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=True, phase_padding=20)

print(f"Dataset size = {len(dataset)}")

print("################## Accessing first event ##################")
index = 0
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")

print("################## Accessing last event ##################")
index = event_metadata.shape[0] - 1
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")


print("################## Accessing first noise ##################")
index = event_metadata.shape[0]
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")


print("################## Accessing last noise ##################")
index = noise_metadata.shape[0] + event_metadata.shape[0] - 1
f1, t1, p1, s1, trace_name = dataset[index]
print(f"index = {index}")
print(f"Detection target = {t1}")
print(f"input shape = {f1.shape}")
print(f"trace name = {trace_name}")

print("################## Out of Index ##################")
index = noise_metadata.shape[0] + event_metadata.shape[0]
print(f"index = {index}")
f1, t1, p1, s1, trace_name = dataset[index]

Dataset size = 7063
################## Accessing first event ##################
index = 0
Detection target = 1
input shape = torch.Size([3, 12000])
trace name = 11030611.IV.RM33..EH
################## Accessing last event ##################
index = 6062
Detection target = 1
input shape = torch.Size([3, 12000])
trace name = 11282341.IV.LTRZ..EH
################## Accessing first noise ##################
index = 6063
Detection target = 0
input shape = torch.Size([3, 12000])
trace name = 20131205T210857.IV.ORZI..HN
################## Accessing last noise ##################
index = 7062
Detection target = 0
input shape = torch.Size([3, 12000])
trace name = 20190813T130147.IV.CESX..HH
################## Out of Index ##################
index = 7063


IndexError: single positional indexer is out-of-bounds

In [8]:
def get_3c_wave_form(filename, trace_name):
    with h5py.File(filename, 'r') as f:
        return torch.tensor(f['data'][trace_name][:])

In [34]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]

train_dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", split_index=0, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)
val_dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", split_index=1, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)
test_dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", split_index=2, split_percentage=split_percentage, padding_type="sample", start_padding_value=100, end_padding_value=1000, target_phase=False, phase_padding=20)

#Check dataset sizes
train_events_count = math.floor(event_metadata.shape[0] * split_percentage[0])
train_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[0])
print(f"Train size - Expected:{train_events_count+train_noise_count} - Actual:{len(train_dataset)}")


val_events_count = math.floor(event_metadata.shape[0] * split_percentage[1])
val_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[1])
print(f"Val size - Expected:{val_events_count+val_noise_count} - Actual:{len(val_dataset)}")


test_events_count = math.floor(event_metadata.shape[0] * split_percentage[2])
test_noise_count = math.floor(noise_metadata.shape[0] * split_percentage[2])
print(f"Test size - Expected:{test_events_count+test_noise_count} - Actual:{len(test_dataset)}")


#Check train dataset indexes
absolute_index = 0
data, target, p, s, name = train_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Train first event data matches: {torch.equal(data, actual_data)}")
print(f"Train first target matches: {target == 1}")

data, target, p, s, name = train_dataset[train_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Train first noise data matches: {torch.equal(data, actual_data)}")
print(f"Train first noise target matches: {target == 0}")

absolute_index = train_events_count - 1
data, target, p, s, name = train_dataset[train_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Train last event data matches: {torch.equal(data, actual_data)}")
print(f"Train last event target matches: {target == 1}")

absolute_index = train_noise_count - 1
data, target, p, s, name = train_dataset[train_events_count + train_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Train last noise data matches: {torch.equal(data, actual_data)}")
print(f"Train last noise target matches: {target == 0}")

#Check validation dataset indexes
absolute_index = train_events_count
data, target, p, s, name = val_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Val first event data matches: {torch.equal(data, actual_data)}")
print(f"Val first target matches: {target == 1}")

absolute_index = train_noise_count
data, target, p, s, name = val_dataset[val_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Val first noise data matches: {torch.equal(data, actual_data)}")
print(f"Val first noise target matches: {target == 0}")

absolute_index = train_events_count + val_events_count - 1
data, target, p, s, name = val_dataset[val_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Val last event data matches: {torch.equal(data, actual_data)}")
print(f"Val last event target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count - 1
data, target, p, s, name = val_dataset[val_events_count + val_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Val last noise data matches: {torch.equal(data, actual_data)}")
print(f"Val last noise target matches: {target == 0}")

#Check test dataset indexes
absolute_index = train_events_count + val_events_count
data, target, p, s, name = test_dataset[0]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Test first event data matches: {torch.equal(data, actual_data)}")
print(f"Test first target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count
data, target, p, s, name = test_dataset[test_events_count]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Test first noise data matches: {torch.equal(data, actual_data)}")
print(f"Test first noise target matches: {target == 0}")

absolute_index = train_events_count + val_events_count + test_events_count - 1
data, target, p, s, name = test_dataset[test_events_count - 1]
actual_data = get_3c_wave_form(event_hdf5_file, event_metadata["trace_name"].iloc[absolute_index])
print(f"Test last event data matches: {torch.equal(data, actual_data)}")
print(f"Test last event target matches: {target == 1}")

absolute_index = train_noise_count + val_noise_count + test_noise_count - 1
data, target, p, s, name = test_dataset[test_events_count + test_noise_count - 1]
actual_data = get_3c_wave_form(noise_hdf5_file, noise_metadata["trace_name"].iloc[absolute_index])
print(f"Test last noise data matches: {torch.equal(data, actual_data)}")
print(f"Test last noise target matches: {target == 0}")


Train size - Expected:6003 - Actual:6003
Val size - Expected:353 - Actual:353
Test size - Expected:706 - Actual:706
Train first event data matches: True
Train first target matches: True
Train first noise data matches: True
Train first noise target matches: True
Train last event data matches: True
Train last event target matches: True
Train last noise data matches: True
Train last noise target matches: True
Val first event data matches: True
Val first target matches: True
Val first noise data matches: True
Val first noise target matches: True
Val last event data matches: True
Val last event target matches: True
Val last noise data matches: True
Val last noise target matches: True
Test first event data matches: True
Test first target matches: True
Test first noise data matches: True
Test first noise target matches: True
Test last event data matches: True
Test last event target matches: True
Test last noise data matches: True
Test last noise target matches: True


In [35]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

index = 102

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "box_trapezoidal", padding_type="percentage", start_padding_value=1, end_padding_value=4, target_phase=True, phase_padding=200, split_index=split_index, split_percentage=split_percentage)


trace_name = f'{event_metadata["trace_name"].iloc[index]}-trapeze-sample'
p_sample = event_metadata["trace_P_arrival_sample"].iloc[index]
s_sample = event_metadata["trace_S_arrival_sample"].iloc[index]
input, target, p_target, s_target, name = dataset[index]

sample_size = input.shape[1]

if np.isnan(p_sample):
    p_sample = 0

if np.isnan(s_sample):
    s_sample = sample_size - 1

s_sample = round(s_sample)

plot_trace_prediction(trace_name, input, np.array([p_sample]), np.array([s_sample]), target, p_target, s_target, "output/figures")

  plt.tight_layout()


In [9]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

index = 5

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, split_index=split_index, split_percentage=split_percentage)


trace_name = f'{event_metadata["trace_name"].iloc[index]}-square-sample'
p_sample = event_metadata["trace_P_arrival_sample"].iloc[index]
s_sample = event_metadata["trace_S_arrival_sample"].iloc[index]
input, target, p_target, s_target, _ = dataset[index]

sample_size = input.shape[1]

if np.isnan(p_sample):
    p_sample = 0

if np.isnan(s_sample):
    s_sample = sample_size - 1

s_sample = round(s_sample)

plot_trace_prediction(trace_name, input, np.array([p_sample]), np.array([s_sample]), target, p_target, s_target, "output/figures")

  plt.tight_layout()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


In [13]:
#split_percentage=[train, validation, test]

split_percentage=[0.85, 0.05, 0.1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, shuffled_event_metadata_file, noise_hdf5_file, shuffled_noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=False, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=2
)

for i, batch in enumerate(train_loader):
    print(f'index={i}')
    #batch[0][i] == tensor 3x12000 of the data of the i element
    #batch[1][i] == target of i element
    #batch[2][i] == p_target of i element
    #batch[3][i] == s_target of i element
    print(len(batch))
    print(batch[0][0].size())
    print(batch[1][0])
    print(batch[2][0])
    print(batch[3][0])

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]
    break

print("Print data are good")

for i in range(data.shape[0]):
    print("=============")
    trace_name = trace_names[i]
    y = targets[i]
    print(f"y: {y} - trace name: {trace_name}")
    if y == 0:
        actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
        print(f"Data is noise and matches: {torch.equal(data[i], actual_data)}")
    else:
        actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
        print(f"Data is signal and matches: {torch.equal(data[i], actual_data)}")
        

index=0
5
torch.Size([3, 12000])
tensor(1)
tensor(-1)
tensor(-1)
Print data are good
y: 1 - trace name: 11269271.IV.GUMA..HH
Data is signal and matches: True
y: 0 - trace name: 20191231T182220.IV.CESX..HH
Data is noise and matches: True
y: 1 - trace name: 11041921.IV.MURB..HH
Data is signal and matches: True
y: 1 - trace name: 11190131.IV.ATCC..HN
Data is signal and matches: True
y: 1 - trace name: 11105061.IV.NRCA..HN
Data is signal and matches: True
y: 1 - trace name: 11237341.IV.LMD..HH
Data is signal and matches: True
y: 1 - trace name: 11079441.IV.TERO..HH
Data is signal and matches: True
y: 1 - trace name: 11278861.IV.AIO..HH
Data is signal and matches: True
y: 1 - trace name: 1115921.IV.MSSA..HH
Data is signal and matches: True
y: 1 - trace name: 1110151.IV.SSFR..HN
Data is signal and matches: True


In [15]:
#Check data are loading properly

#split_percentage=[train, validation, test]

split_percentage=[0.91, 0.04, 0.05]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=True, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=64, shuffle=True, num_workers=2
)
should_continue = True

for i, batch in enumerate(train_loader):

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]

    hasEvent = False
    hasNoise = False

    for i in range(data.shape[0]):
        trace_name = trace_names[i]
        y = targets[i]
        if y == 0:
            actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
            hasNoise = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
        else:
            actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
            hasEvent = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
    
    if not should_continue:
        break
    
    print(f'index={i}, data size={data.shape[0]}, hasNoise={hasNoise}, hasEvent={hasEvent} everything is good!')
        

index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=True everything is good!
index=63, data size=64, hasNoise=True, hasEvent=Tru

In [16]:
#Check data are loading properly

#split_percentage=[train, validation, test]

split_percentage=[1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "binary", padding_type="sample", start_padding_value=40, end_padding_value=40, target_phase=False, phase_padding=100, remove_empty_phase=False, split_index=split_index, split_percentage=split_percentage)


train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=128, shuffle=True, num_workers=2
)
should_continue = True

for i, batch in enumerate(train_loader):

    data = batch[0]
    targets = batch[1]
    trace_names = batch[4]

    hasEvent = False
    hasNoise = False

    for i in range(data.shape[0]):
        trace_name = trace_names[i]
        y = targets[i]
        if y == 0:
            actual_data = get_3c_wave_form(noise_hdf5_file, trace_name)
            hasNoise = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
        else:
            actual_data = get_3c_wave_form(event_hdf5_file, trace_name)
            hasEvent = True
            if not torch.equal(data[i], actual_data):
                print(f'index={i}, data size={data.shape[0]}, batch_i={i}, y={y}, {trace_name} is not loaded properly')
                should_continue = False
                break
    
    if not should_continue:
        break
    
    print(f'index={i}, data size={data.shape[0]}, hasNoise={hasNoise}, hasEvent={hasEvent} everything is good!')
        

index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128, hasNoise=True, hasEvent=True everything is good!
index=127, data size=128,

In [6]:
split_percentage=[1]
split_index = 0

dataset = InstanceDataset(event_hdf5_file, event_metadata_file, noise_hdf5_file, noise_metadata_file, "box_trapezoidal", padding_type="percentage", start_padding_value=0, end_padding_value=0.4, target_phase=True, phase_padding=40, remove_empty_phase=True, split_index=split_index, split_percentage=split_percentage, norm_mode='max')

datagenerator = DataGenerator(dataset, 4, True)

batch = datagenerator[0]


Generator - data shape = torch.Size([12000, 3])
Generator - detection = torch.Size([12000])
Generator - p_phase = torch.Size([12000])
Generator - s_phase = torch.Size([12000])
Generator - data shape = torch.Size([12000, 3])
Generator - detection = torch.Size([12000])
Generator - p_phase = torch.Size([12000])
Generator - s_phase = torch.Size([12000])
Generator - data shape = torch.Size([12000, 3])
Generator - detection = torch.Size([12000])
Generator - p_phase = torch.Size([12000])
Generator - s_phase = torch.Size([12000])
Generator - data shape = torch.Size([12000, 3])
Generator - detection = torch.Size([12000])
Generator - p_phase = torch.Size([12000])
Generator - s_phase = torch.Size([12000])


In [15]:
def normalize(data, mode = 'max'):  
    'Normalize waveforms'
    
    data -= np.mean(data, axis=1, keepdims=True)
    if mode == 'max':
        max_data = np.max(data, axis=1, keepdims=True)
        assert(max_data.shape[-2] == data.shape[-2])
        max_data[max_data == 0] = 1
        data /= max_data              

    elif mode == 'std':               
        std_data = np.std(data, axis=1, keepdims=True)
        assert(std_data.shape[-2] == data.shape[-2])
        std_data[std_data == 0] = 1
        data /= std_data
    return data

test = np.array([[11.0, 2.0, -4, 7], [-3, 6, 8, -7]])
# test2 = np.ones(2)

# test[1] = test2[..., np.newaxis]

# test

# print(test2.shape)
# test2[..., np.newaxis]


print(np.mean(test, axis=1, keepdims=True))

# normalize(test, 'std')


[[4.]
 [1.]]


In [58]:
def normalize(data, mode = 'max'):  
    'Normalize waveforms'
    
    data -= torch.mean(data, axis=1, keepdims=True)
    if mode == 'max':
        max_data = torch.max(data, axis=1, keepdims=True).values
        assert(max_data.shape[-2] == data.shape[-2])
        max_data[max_data == 0] = 1
        data /= max_data              

    elif mode == 'std':               
        std_data = torch.std(data, axis=1, keepdims=True, unbiased=False)
        assert(std_data.shape[-2] == data.shape[-2])
        std_data[std_data == 0] = 1
        data /= std_data
    return data

test = np.array([[11.0, 2.0, -4, 7], [-3, 6, 8, -7]])
test = torch.from_numpy(test)


# print(torch.std(test, axis=1, keepdims=True, unbiased=False))

normalize(test, 'std')

tensor([[ 1.2472, -0.3563, -1.4254,  0.5345],
        [-0.6447,  0.8058,  1.1282, -1.2893]], dtype=torch.float64)

In [25]:
def _shuffle_events(event_metadata_file, noise_metadata_file, random_state, output_event_metadata_file, output_noise_metadata_file):
    
    """ 
    
    Split the list of input data into training, validation, and test set.

    Parameters
    ----------
    event_metadata_file: str
        path to event metadata file 
    
    noise_metadata_file: str
        path to event metadata file 

    random_state: int
        random state for reproducibility

    output_event_metadata_file: bool
       Path to the output event metadata file

    output_noise_metadata_file: str
       Path to the output noise metadata file
              
    Returns
    -------   
    (output_event_metadata_file, output_noise_metadata_file)
    """       

    noise_metadata = pd.read_csv(noise_metadata_file)
    event_metadata = pd.read_csv(event_metadata_file, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

    event_metadata = event_metadata.sample(frac=1, random_state=random_state) 
    noise_metadata =noise_metadata.sample(frac=1, random_state=random_state) 

    event_metadata.to_csv(output_event_metadata_file, index=False)
    noise_metadata.to_csv(output_noise_metadata_file, index=False)

    return output_event_metadata_file, output_noise_metadata_file 

new_event, new_noise = _shuffle_events(
    event_metadata_file=event_metadata_file,
    noise_metadata_file=noise_metadata_file,
    random_state=9,
    output_event_metadata_file='temp/event_metadata.csv',
    output_noise_metadata_file='temp/noise_metadata.csv'
)

noise_metadata = pd.read_csv(noise_metadata_file)
event_metadata = pd.read_csv(event_metadata_file, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])


new_noise_m = pd.read_csv(new_noise)
new_event_m = pd.read_csv(new_event, usecols=["trace_name", 'trace_start_time', 'trace_P_arrival_time', 'trace_S_arrival_time', 'trace_P_arrival_sample', 'trace_S_arrival_sample', 'trace_GPD_P_number', 'trace_GPD_S_number', 'trace_EQT_P_number', 'trace_EQT_S_number', 'trace_EQT_number_detections'])

print(f"old event shape={event_metadata.shape}")
print(f"new event shape={new_event_m.shape}")
print(f"old noise shape={noise_metadata.shape}")
print(f"old noise shape={new_noise_m.shape}")

old event shape=(10000, 11)
new event shape=(10000, 11)
old noise shape=(1000, 43)
old noise shape=(1000, 43)
