# Prepare TensorFlow records manifest files

Since we do not expect all events to be detectable by our fiber-optic DAS array, we only select events above a certain amplitude threshold. We then choose which event and noise samples to include in the TensorFlow record files used for training, evaluating, and testing the machine learning model. We save the file paths of these data samples in manifest files, one each for the train, eval, and test splits.

In [None]:
 # IPython magic: change to the parent directory. The catalog paths read
 # below are relative, so presumably they resolve from there -- confirm.
 cd ..

In [11]:
import numpy as np
import os
import pandas as pd
import random

# Seed the module-level generator that the random.shuffle calls below draw
# from, so that running the cells in order gives the same shuffles each time.
random.seed(42)

In [12]:
# Read the earthquake catalog first, then the noise catalog, from their CSV files.
df_events, df_noise = (
    pd.read_csv(path)
    for path in ('catalog/earthquake_catalog.csv', 'catalog/noise_catalog.csv')
)

# Show the event catalog as the cell output.
df_events

Unnamed: 0.1,Unnamed: 0,id,type,focal_time,arrival_time,latitude,longitude,depth,magnitude,distance,duration,evaluation_mode,evaluation_status,onset,velocity_estimate,datetime,azimuth,has_seismometer_data,has_das_data,local_amplitude
0,0,72688791,earthquake,2016-09-02 03:13:55.760000,2016-09-02 03:13:58.240000,37.388000,-122.272835,-122.272835,1.00,12229.080,7.1269,manual,reviewed,impulsive,4929.2812,2016-09-02 03:13:58.218031,241.905850,True,False,91.871735
1,1,72688876,earthquake,2016-09-02 05:10:33.570000,,36.978832,-121.639660,-121.639660,1.86,69224.750,,,,,,2016-09-02 05:10:45.886511,136.247300,True,False,7.867574
2,2,72689111,quarry blast,2016-09-02 17:26:32.670000,2016-09-02 17:26:36.040000,37.324000,-122.101830,-122.101830,1.30,13406.592,15.4430,manual,reviewed,emergent,3978.3070,2016-09-02 17:26:35.952630,150.703230,True,False,144.868800
3,3,72689151,quarry blast,2016-09-02 19:22:06.220000,,36.893833,-121.617500,-121.617500,1.71,77405.836,,,,,,2016-09-02 19:22:25.172631,139.987780,True,True,4.184556
4,4,72689361,earthquake,2016-09-03 06:01:49.650000,,37.909832,-121.853165,-121.853165,1.35,61244.918,,,,,,2016-09-03 06:02:00.721334,27.995693,True,True,3.326677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5764,5764,73312476,earthquake,2019-12-07 19:18:51.490000,,38.043835,-121.903000,-121.903000,2.06,74234.360,,,,,,2019-12-07 19:19:04.556444,19.340443,True,True,10.427045
5765,5765,73312481,earthquake,2019-12-07 19:34:00.340000,2019-12-07 19:34:11.600000,37.116833,-121.522500,-121.522500,2.70,67801.750,50.5100,manual,reviewed,impulsive,6018.6455,2019-12-07 19:34:12.437310,120.705025,True,True,57.402836
5766,5766,73312566,earthquake,2019-12-08 02:38:59.140000,2019-12-08 02:39:06.280000,37.730500,-122.136500,-122.136500,2.26,33886.727,29.7800,manual,reviewed,impulsive,4744.4526,2019-12-08 02:39:05.631518,5.923369,True,True,123.036560
5767,5767,73312731,earthquake,2019-12-08 20:06:06.480000,2019-12-08 20:06:14.140000,37.677834,-122.508500,-122.508500,1.48,42149.785,12.4950,manual,reviewed,emergent,5497.7285,2019-12-08 20:06:14.408352,313.275180,True,True,11.679759


In [13]:
# Path where the processed data windows are stored.
data_dir = '/scratch/earthquake-detection-ml/processed_data/das/'

# Amplitude threshold in nano-strain; events with a local amplitude at or
# above this value are kept.
threshold = 40
# Ratio of noise to event windows.
noise_ratio = 1
# Prefix for naming the manifest files. The threshold is interpolated rather
# than typed by hand so the prefix cannot drift from the value above (a stale
# prefix would silently overwrite manifests built with another threshold).
# NOTE(review): 'balanced' presumably refers to noise_ratio = 1; rename the
# suffix if that ratio is changed.
manifest_prefix = 'das_threshold{}_balanced'.format(threshold)

In [14]:
def remove_missing_data(df):
    """Return the rows of `df` flagged as having both DAS and seismometer data.

    Args:
        df: DataFrame with `has_das_data` and `has_seismometer_data` columns.

    Returns:
        The subset of `df` where both flags compare equal to True, in the
        original row order.
    """
    # Explicit comparison with True (rather than using the column as a mask)
    # so that anything other than True is treated as "missing".
    das_available = df.has_das_data == True
    seismometer_available = df.has_seismometer_data == True
    return df[das_available & seismometer_available]

# Drop events without both DAS and seismometer data, reporting the catalog
# size before and after the filter.
print('Before removing missing data', df_events.shape[0])
df_events = remove_missing_data(df_events)
print('After removing missing data', df_events.shape[0])

# Apply the same filter to the noise catalog.
df_noise = remove_missing_data(df_noise)

Before removing missing data 5769
After removing missing data 4519


In [15]:
def get_event_indices(df, threshold):
    """Return the shuffled index labels of rows at or above `threshold`.

    Args:
        df: DataFrame with a `local_amplitude` column.
        threshold: Minimum `local_amplitude` for a row to be selected.

    Returns:
        A list of the index labels of the selected rows, shuffled in place
        with the module-level `random` generator.
    """
    above_threshold = df.local_amplitude >= threshold
    selected = list(df.loc[above_threshold].index)
    random.shuffle(selected)
    return selected

# Shuffled index labels of the events at or above the amplitude threshold.
event_indices = get_event_indices(df=df_events, threshold=threshold)
print('Number of events:', len(event_indices))

Number of events: 555


In [None]:
n_event = len(event_indices)
# Index labels of the first n_event * noise_ratio rows of the noise catalog.
noise_indices = df_noise.index[:n_event * noise_ratio]

# 80% / 10% of the events go to train / eval; the test split takes whatever
# is left, so the three counts always add up to n_event.
n_event_train, n_event_eval = int(0.8 * n_event), int(0.1 * n_event)
n_event_test = n_event - (n_event_train + n_event_eval)

# Each noise split is noise_ratio times the size of the matching event split.
n_noise_train, n_noise_eval, n_noise_test = (
    noise_ratio * n_split
    for n_split in (n_event_train, n_event_eval, n_event_test)
)

In [None]:
 def _get_filenames(indices, prefix, data_dir, train=True, batch=1000):
     """Build the list of existing data-window files for the given indices.

     Each index maps to up to two files,
     `<data_dir>/<prefix>/<subdir>/<prefix>_<index>_{1,2}.h5`, where `<index>`
     is zero-padded to five digits and `<subdir>` is the index rounded down
     to a multiple of `batch` (also zero-padded to five digits).

     Args:
         indices: Iterable of integer sample indices.
         prefix: Sample kind; used as both the folder name and the file-name
             prefix.
         data_dir: Root directory of the processed data windows.
         train: If True, also include the `_2` file of every index (the
             second half of the channels) for data augmentation.
         batch: Number of files per subfolder.

     Returns:
         A list of file paths in index order. Paths that do not exist on disk
         are reported on stdout and left out, so the result only ever lists
         readable files.
     """
     # `_1` holds half of the channels and `_2` the second half; the second
     # half is only used for data augmentation during training.
     halves = (1, 2) if train else (1,)

     filenames = []
     for i in indices:
         # The data files are organized into subfolders of `batch` files,
         # e.g. indices 1000-1999 live in '01000' when batch is 1000.
         subdir = '{:05d}'.format((i // batch) * batch)
         for half in halves:
             filename = os.path.join(
                 data_dir, prefix, subdir,
                 '{}_{:05d}_{}.h5'.format(prefix, i, half))
             if not os.path.isfile(filename):
                 # Name the missing file and skip it instead of writing a
                 # dangling path into the result.
                 print('file does not exist: {}'.format(filename))
                 continue
             filenames.append(filename)
     return filenames

In [None]:
# End offsets of the eval splits. Every split below is sliced with explicit
# start/stop offsets instead of a negative index: `seq[-n:]` returns the whole
# sequence when n is 0, and it overlaps the eval slice whenever the sequence
# is shorter than train + eval + test (e.g. too few noise windows).
event_eval_end = n_event_train + n_event_eval
noise_eval_end = n_noise_train + n_noise_eval

# Only the training split includes the second half of the channels.
train_event_files = _get_filenames(event_indices[:n_event_train], 'event', data_dir, True)
eval_event_files = _get_filenames(event_indices[n_event_train:event_eval_end], 'event', data_dir, False)
test_event_files = _get_filenames(event_indices[event_eval_end:event_eval_end + n_event_test], 'event', data_dir, False)

train_noise_files = _get_filenames(noise_indices[:n_noise_train], 'noise', data_dir, True)
eval_noise_files = _get_filenames(noise_indices[n_noise_train:noise_eval_end], 'noise', data_dir, False)
test_noise_files = _get_filenames(noise_indices[noise_eval_end:noise_eval_end + n_noise_test], 'noise', data_dir, False)

# Mix event and noise files within each split.
train_files = train_event_files + train_noise_files
random.shuffle(train_files)
eval_files = eval_event_files + eval_noise_files
random.shuffle(eval_files)
test_files = test_event_files + test_noise_files
random.shuffle(test_files)

In [None]:
manifest_dir = '/scratch/earthquake-detection-ml/tfrecords/manifests'


def create_manifest(out_file, filenames):
    """Write `filenames` to a manifest file, one path per line.

    Args:
        out_file: Manifest file name, joined onto `manifest_dir` with
            os.path.join (so an absolute path is used as given).
        filenames: Iterable of strings to write, in order.

    The resulting path is printed, its directory is created if it does not
    exist yet, and an existing file at that path is overwritten.
    """
    out_file = os.path.join(manifest_dir, out_file)
    print(out_file)
    # Create the output directory on first use instead of failing with
    # FileNotFoundError after all the work of building the file lists.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_file, 'w') as f:
        f.writelines(filename + '\n' for filename in filenames)

In [None]:
# One manifest per split, named '<manifest_prefix>_<split>_manifest.txt'.
manifest_specs = [
    ('{}_train_manifest.txt', train_files),
    ('{}_eval_manifest.txt', eval_files),
    ('{}_test_manifest.txt', test_files),
]
for name_template, split_files in manifest_specs:
    create_manifest(name_template.format(manifest_prefix), split_files)