# Step 4 - Classify events with SSL algorithm

In this step the waveform data are used for training an algorithm to identify the type of earthquake (strikeslip, normal, reverse) with a Self-Supervised Learning methodology, where we train it with a small sample of each size and then we ask to classify the event dataset based on this training.

There are two options for the user:

1) Using the same threshold for the identification of the events by their moment tensor (see Step 2) and feed the algorithm with about the 30% of the available data of each type, while asking to classify the rest 70% and the unclassified ones.
2) Using different thresholds and add the lower threshold obtained data as feed (higher accuracy) and then ask to identify the data of the waveforms classified by their moment tensor with a higher threshold and also the unclassified ones for that threshold.

© Foivos Karakostas - UCL, 2024

--------------------------------

#### User inputs

The variable feedperc is the percentage of seismograms that are going to be used for each type of earthquake for training the algorithm against the total number of events existing in the given feed dataset. Use 0.3 or similar when the feed and classify thresholds are the same, use values up to 1 when they are different (the feed sample should be smaller than the classifiable, because otherwise the overlap is in all data).

In [34]:
component = 'Z'
distance = 15
backazimuth = 0
feed_threshold = 20
classify_threshold = 45
feedperc = 0.6

lf = 0.1
hf = 1.0

------------------------------------------------------

#### Script run

Import the necessary packages

In [35]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import os
import matplotlib
from matplotlib import pyplot
import math
from obspy import read
from collections import Counter

Use these functions for the pre-processing of the data input

In [36]:
def take_waveform(dir, eventname, distance, backazimuth, component, lf, hf):
    
    directory = dir + '/' + eventname
    filename = 'SY.SD' + str(distance).zfill(3) + 'B' + str(backazimuth).zfill(3) + '.mseed'
    filepath = directory + '/' + filename
    input = read(filepath)
    if component == 'Z':
        st = input[0]
    elif component == 'R':
        st = input[1]
    elif component == 'T':
        st = input[2]
    st.filter('bandpass', freqmin = lf, freqmax = hf)
    t = st.times();
    d = st.data;    
    
    return t, d

def remove_overlaps(feed_events, classify_events_init):

    classify_events = []
    for e in range(0, len(classify_events_init)):
        event = classify_events_init[e]
        check = 0
        for j in range(0, len(feed_events)):
            eventf = feed_events[j]
            if event == eventf:
                check = check + 1
        if check == 0:
            classify_events.append(event)
   
    return classify_events

def take_feed_events(feed_events_init, feedperc):
    feed_events = []
    feedlength = math.ceil(len(feed_events_init)*feedperc)
    for e in range(0, feedlength):
        event = feed_events_init[e]
        feed_events.append(event)

    return feed_events
    

In [37]:
syntdirectory_feed = 'Synthetics' + str(feed_threshold)
syntdirectory1_feed = syntdirectory_feed + '/Strikeslip'
syntdirectory2_feed = syntdirectory_feed + '/Normal'
syntdirectory3_feed = syntdirectory_feed + '/Reverse'

syntdirectory_classify = 'Synthetics' + str(classify_threshold)
syntdirectory1_classify = syntdirectory_classify + '/Strikeslip'
syntdirectory2_classify = syntdirectory_classify + '/Normal'
syntdirectory3_classify = syntdirectory_classify + '/Reverse'
syntdirectory4_classify = syntdirectory_classify + '/Unclassified'

feed_events1_init = os.listdir(syntdirectory1_feed)
feed_events2_init = os.listdir(syntdirectory2_feed)
feed_events3_init = os.listdir(syntdirectory3_feed)

classify_events1_init = os.listdir(syntdirectory1_classify)
classify_events2_init = os.listdir(syntdirectory2_classify)
classify_events3_init = os.listdir(syntdirectory3_classify)
classify_events4_init = os.listdir(syntdirectory4_classify)

feed_events1 = take_feed_events(feed_events1_init, feedperc)   
feed_events2 = take_feed_events(feed_events2_init, feedperc)
feed_events3 = take_feed_events(feed_events3_init, feedperc) 
    
classify_events1 = remove_overlaps(feed_events1, classify_events1_init)
classify_events2 = remove_overlaps(feed_events2, classify_events2_init)
classify_events3 = remove_overlaps(feed_events3, classify_events3_init)
classify_events4 = classify_events4_init

message_1 = 'Feed Events 1 = ' + str(len(feed_events1)) + ' | Classify Events 1 = ' + str(len(classify_events1)) + ' | Total Events 1  = ' + str(len(classify_events1_init))
print(message_1)
message_2 = 'Feed Events 2 = ' + str(len(feed_events2)) + ' | Classify Events 2 = ' + str(len(classify_events2)) + ' | Total Events 2  = ' + str(len(classify_events2_init))
print(message_2)
message_3 = 'Feed Events 3 = ' + str(len(feed_events3)) + ' | Classify Events 3 = ' + str(len(classify_events3)) + ' | Total Events 3  = ' + str(len(classify_events3_init))
print(message_3)

Feed Events 1 = 11 | Classify Events 1 = 15 | Total Events 1  = 26
Feed Events 2 = 12 | Classify Events 2 = 36 | Total Events 2  = 48
Feed Events 3 = 4 | Classify Events 3 = 22 | Total Events 3  = 26


In [38]:
t_data = []
d_data = []
labels = []

for e in range(0, len(feed_events1)):
    label = 1
    event = feed_events1[e]
    t,d = take_waveform(syntdirectory1_feed, event, distance, backazimuth, component, lf, hf)
    t_data.append(t)
    d_data.append(d)
    labels.append(1)
    
for e in range(0, len(feed_events2)):
    label = 2
    event = feed_events2[e]
    t,d = take_waveform(syntdirectory2_feed, event, distance, backazimuth, component, lf, hf)
    t_data.append(t)
    d_data.append(d)
    labels.append(2)

for e in range(0, len(feed_events3)):
    label = 3
    event = feed_events3[e]
    t,d = take_waveform(syntdirectory3_feed, event, distance, backazimuth, component, lf, hf)
    t_data.append(t)
    d_data.append(d)
    labels.append(3)

In [39]:
X = np.hstack((t_data, d_data))

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1:.2f}")

Model Accuracy: 77.78%
F1 Score: 0.78


In [40]:
def classify_waveform(t, d):
    input_data = np.hstack((t, d)).reshape(1, -1)
    input_data = scaler.transform(input_data)
    
    predicted_label = model.predict(input_data)[0]
    
    return predicted_label

In [41]:
labels_for_1 = []

for e in range(0, len(classify_events1)):
    event = classify_events1[e]
    new_t_data, new_d_data = take_waveform(syntdirectory1_classify,event,distance,backazimuth,component, lf, hf)
    predicted_label = classify_waveform(new_t_data, new_d_data)
    labels_for_1.append(predicted_label)
    
class_counts = Counter(labels_for_1)
for event_type in range(1, 4):
    count = class_counts.get(event_type, 0)
    print(f"Type {event_type}: {count} occurrences")

Type 1: 7 occurrences
Type 2: 2 occurrences
Type 3: 6 occurrences


In [42]:
labels_for_2 = []

for e in range(0, len(classify_events2)):
    event = classify_events2[e]
    new_t_data, new_d_data = take_waveform(syntdirectory2_classify,event,distance,backazimuth,component, lf, hf)
    predicted_label = classify_waveform(new_t_data, new_d_data)
    labels_for_2.append(predicted_label)
    
class_counts = Counter(labels_for_2)
for event_type in range(1, 4):
    count = class_counts.get(event_type, 0)
    print(f"Type {event_type}: {count} occurrences")

Type 1: 11 occurrences
Type 2: 12 occurrences
Type 3: 13 occurrences


In [43]:
labels_for_3 = []

for e in range(0, len(classify_events3)):
    event = classify_events3[e]
    new_t_data, new_d_data = take_waveform(syntdirectory3_classify,event,distance,backazimuth,component, lf, hf)
    predicted_label = classify_waveform(new_t_data, new_d_data)
    labels_for_3.append(predicted_label)
    
class_counts = Counter(labels_for_3)
for event_type in range(1, 4):
    count = class_counts.get(event_type, 0)
    print(f"Type {event_type}: {count} occurrences")

Type 1: 4 occurrences
Type 2: 1 occurrences
Type 3: 17 occurrences


In [44]:
labels_for_4 = []

for e in range(0, len(classify_events4)):
    event = classify_events4[e]
    new_t_data, new_d_data = take_waveform(syntdirectory4_classify,event,distance,backazimuth,component, lf, hf)
    predicted_label = classify_waveform(new_t_data, new_d_data)
    labels_for_4.append(predicted_label)
    
class_counts = Counter(labels_for_4)
for event_type in range(1, 4):
    count = class_counts.get(event_type, 0)
    print(f"Type {event_type}: {count} occurrences")

Type 1: 1 occurrences
Type 2: 4 occurrences
Type 3: 4 occurrences
