### Demo of training meta learner with example predictions
Written by Congcong Yuan, 2023-02

Outline of this tutorial:

0. Install packages
1. Import packages and functions
2. Load data and prepare training data
3. Train meta learner
4. Save your model

#### 0. Install packages

In [None]:
# Install the required packages with pip3 (intended for a fresh virtual machine).
# The leading "!" runs each line as a shell command from the notebook.
# seisbench is not imported directly by the cells in this tutorial;
# presumably it is needed by ELEP -- TODO confirm.
!pip3 install seisbench
# ELEP is pinned to version 0.0.2.
!pip3 install ELEP==0.0.2 

#### 1. Import packages and functions

In [6]:
import torch
import torch.nn as nn

from torch.autograd import Variable
print(torch.cuda.is_available())

import numpy as np
import pandas as pd
import random

from ELEP.elep.ensemble_learners import ensemble_regressor_cnn

False


#### 2. Load data and prepare training data

In [None]:
# load metadata which includes labels
!wget https://github.com/congcy/ELEP/raw/main/docs/tutorials/data/events_metadata.csv
!wget https://drive.google.com/file/d/15U0KWTYa3l7lNrFVAqKAf7XrLHcAqa_q/view -O pretrain_prediction.npy
# dirpath_data = '/mnt/Data02/DataDL/Ensemble/INSTANCE_prediction_02/original/training/'
# dirpath_data2 = '/mnt/Data02/DataDL/Ensemble/INSTANCE_prediction_02/original/training/CNN_regress/'
# npy_fnm = 'Instance_training_set_original.npy'
# csv_fnm = 'Instance_training_set_original.csv'
# model_fnm = 'Instance_training_set_original_tp_L2k.pt'
# valid_fnm = 'Instance_validation_set_original_tp_L2k'
# pdata = np.load(dirpath_data + npy_fnm)
# # load csv
# csv_reader = pd.read_csv(dirpath_data+csv_fnm)
# trace_name_list = csv_reader['trace_name'].values
# trace_stt_list = csv_reader['trace_start_time'].values
# tp_sample_list = csv_reader['p_arrival_sample'].values
# ts_sample_list = csv_reader['s_arrival_sample'].values

In [11]:
import numpy as np

# Load the pretrained-model predictions from the local data directory.
# The data-preparation cell reads this array under the name `pdata`, so bind
# that name as well; `preds` is kept so existing references keep working.
# NOTE(review): the preparation cell unpacks pdata.shape into
# [nsamples, nphases, nmdls, npts] -- confirm the file has that 4-D layout.
# NOTE(review): the P-pick list `tp_sample_list` used by that cell is not
# created here; it has to be read from the metadata CSV.
preds = np.load('./data/events_pretrain_predictions.npy')
pdata = preds

In [None]:
# Prepare network inputs and labels.
# For every sample, cut an `ntwin`-point window out of phase index 0 of the
# predictions so that the labelled P pick sits at a random offset inside the
# window. The regression target is that offset divided by `ntwin`.
[nsamples, nphases, nmdls, npts] = pdata.shape
ntwin = 2000
if npts < ntwin:
    # Otherwise the slice below is shorter than ntwin and the assignment
    # fails with an opaque broadcasting error.
    raise ValueError(
        f"traces have {npts} points, but a window of {ntwin} points is required"
    )
cdata = np.zeros([nsamples, 1, nmdls, ntwin])  # [N,C,H,W]; only phase index 0 is copied
clabels = np.zeros([nsamples, 1])  # labels: pick offset / ntwin
it_labels = np.zeros([nsamples, ], dtype=int)  # labels: pick offset in samples
for isamp in range(nsamples):
    # manual/labelled P pick; cast because picks read from a CSV may be floats,
    # which are not valid slice indices
    itp = int(tp_sample_list[isamp])
    # random position of the pick inside the window
    itind = random.randint(100, 1900)
    # clamp the window start so that [istart, istart + ntwin) stays inside the
    # trace, then recompute the pick offset relative to the clamped start
    istart = max(min(itp - itind, npts - ntwin), 0)
    itind = itp - istart
    # cut and write data
    cdata[isamp] = pdata[isamp, 0:1, :, istart:istart + ntwin]

    it_labels[isamp] = itind
    # scale the label by the window length
    clabels[isamp] = itind / ntwin

# drop this name for the full prediction array; it is not used again below
del pdata

In [None]:
# Partition the samples: the first `split_rate` fraction is used for training,
# the remainder is held out. With split_rate = 1.0 the held-out part is empty.
split_rate = 1.0
nsplit = int(split_rate*nsamples)

# network inputs as float32 tensors
cdata_train, cdata_test = (
    torch.tensor(part, dtype=torch.float32)
    for part in (cdata[:nsplit], cdata[nsplit:])
)

# regression targets as float32 tensors
clabels_train, clabels_test = (
    torch.tensor(part, dtype=torch.float32)
    for part in (clabels[:nsplit], clabels[nsplit:])
)

# integer pick offsets stay as NumPy arrays
it_labels_train, it_labels_test = it_labels[:nsplit], it_labels[nsplit:]

print(cdata.shape, clabels.shape)

#### 3. Train meta learner: set up training parameters

In [None]:
# Training configuration, data loader, model, loss function and optimizer.
optimizer_name = 'Adam'  # supported: 'Adam', 'SGD'
lr = 1e-2
epochs = 20
# Use the GPU only when one is actually present; a hard-coded True makes
# model.cuda() raise on CPU-only machines.
CUDA = torch.cuda.is_available()
batch_size = 500
batch_size2 = 500
# Must stay False: the loader yields inputs only, and the labels are looked up
# by batch index (i*batch_size), so the sample order has to be preserved.
shuffle = False
# prepare dataloader
train_load = torch.utils.data.DataLoader(dataset=cdata_train, batch_size=batch_size, shuffle=shuffle)
#test_load=torch.utils.data.DataLoader(dataset=cdata_test, batch_size=batch_size2, shuffle=shuffle)
# load model
model = ensemble_regressor_cnn()
if CUDA:
    model = model.cuda()
# define a loss function
loss_function = torch.nn.MSELoss()
# select optimizer; fail immediately on an unknown name instead of leaving a
# placeholder that only breaks later inside the training loop
if optimizer_name == 'Adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
elif optimizer_name == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
else:
    raise ValueError(
        f"unsupported optimizer {optimizer_name!r}; expected 'Adam' or 'SGD'"
    )

In [None]:
# define test function
def test_fn(dataloader):
    """Return the mean per-batch loss of the global ``model`` over ``dataloader``.

    The loader yields input batches only. Batch ``i`` is paired with
    ``clabels_test[i*batch_size2:(i+1)*batch_size2]``, so the loader has to
    iterate the test inputs in order with batch size ``batch_size2``.

    Args:
        dataloader: sized iterable of input batches (tensors).

    Returns:
        The mean of the per-batch loss values as a float, or ``nan`` when the
        loader yields no batches (for example when the test split is empty).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Avoid a ZeroDivisionError on an empty test split.
        print("Test set is empty; skipping evaluation. \n")
        return float('nan')

    test_loss = 0.0
    # Evaluate in eval mode, then restore whatever mode the model was in so a
    # call in the middle of training does not leave it in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                # get labels
                labels = torch.as_tensor(
                    clabels_test[i*batch_size2:(i+1)*batch_size2],
                    dtype=torch.float32,
                )
                if CUDA:
                    batch, labels = batch.cuda(), labels.cuda()

                pred = model(batch)
                test_loss += loss_function(pred, labels).item()
    finally:
        model.train(was_training)

    test_loss /= num_batches
    print(f"Test avg loss: {test_loss:>8f} \n")
    return test_loss

#### 3. Train meta learner: start training

In [None]:
# Train the model for `epochs` passes over `train_load`.
# loss_arr[:, 0] records the loss of the last training batch of each epoch;
# loss_arr[:, 1] is reserved for the test loss, which is currently disabled.
loss_arr = np.zeros([epochs, 2])
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    model.train()
    # Stays nan if the loader yields no batch, instead of raising a NameError.
    last_loss = float('nan')
    for i, batch in enumerate(train_load):
        # The loader yields inputs only; fetch the matching labels by position.
        labels = torch.as_tensor(
            clabels_train[i*batch_size:(i+1)*batch_size],
            dtype=torch.float32,
        )
        if CUDA:
            batch, labels = batch.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = model(batch)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        # Always keep a plain float, so the recorded value never depends on
        # whether this batch happened to be a logging step.
        last_loss = loss.item()
        if i % 5 == 0:
            # Samples seen before this batch; batch.shape[0] would be wrong
            # for a final, smaller batch.
            current = i * batch_size
            print(f"loss: {last_loss:>7f}  [{current:>5d}/{nsamples:>5d}]")

            # loss on test dataset
            # test_loss = test_fn(test_load)

    # record the epoch's last training loss
    loss_arr[epoch, 0] = last_loss
    #loss_arr[epoch,1] = test_loss
print("Finished!")

#### 4. Save your model

In [None]:
# Save the trained model.
# `dirpath_data2` and `model_fnm` are only assigned in the commented-out local
# setup, so fall back to a file in the working directory when they are not
# defined instead of failing with a NameError.
try:
    model_path = dirpath_data2 + model_fnm
except NameError:
    model_path = 'Instance_training_set_original_tp_L2k.pt'
# NOTE: this pickles the whole module object, so loading it later requires the
# model class to be importable under the same path. Saving model.state_dict()
# is the more portable alternative.
torch.save(model, model_path)