In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader, random_split

In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read and
# write files there. force_remount=True is passed so the mount is redone on re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Core Functions

In [None]:
# Define MLP model
class MLPRegressor(nn.Module):
    """Fully connected regressor: Linear + ReLU hidden layers followed by a linear output layer.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of regression targets per sample.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(128, 64)`` reproduces the original fixed architecture; an empty
            sequence gives a single linear layer.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(128, 64)):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, output_dim))
        # Keep the attribute name ``net`` so state_dict keys stay ``net.<index>.*``.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.net(x)

# training loop
def train(model, train_loader, val_loader, epochs=1000, schedule=True):
    """Train ``model`` and record the training and validation loss of every epoch.

    Relies on notebook-level globals that must exist before calling:
    ``optimizer``, ``loss_fn`` and, when ``schedule`` is true, ``scheduler``.

    Args:
        model: Module to optimise; its parameters are updated in place.
        train_loader: DataLoader yielding ``(X_batch, y_batch)`` training batches.
        val_loader: DataLoader yielding ``(X_batch, y_batch)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        schedule: If true, ``scheduler.step`` is called after every epoch with
            that epoch's average training loss.

    Returns:
        ``(train_loss_history, val_loss_history)``: two lists with one entry per
        epoch. Each entry is the sum of the batch losses weighted by batch size,
        divided by the number of samples in the loader's dataset.

    Note:
        Each epoch ends with ``model.eval()``, so after at least one epoch the
        model is returned in evaluation mode.
    """
    # Take the sample counts from the loaders themselves rather than from the
    # notebook globals ``train_size`` / ``val_size``, so the averages stay
    # correct for whichever loaders are passed in.
    n_train = len(train_loader.dataset)
    n_val = len(val_loader.dataset)

    train_loss_history = []
    val_loss_history = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch is not over-counted.
            train_loss += loss.item() * X_batch.size(0)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                preds = model(xb)
                loss = loss_fn(preds, yb)
                val_loss += loss.item() * xb.size(0)

        avg_train_loss = train_loss / n_train
        avg_val_loss = val_loss / n_val

        train_loss_history.append(avg_train_loss)
        val_loss_history.append(avg_val_loss)

        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Optionally let the scheduler adjust the learning rate; it is driven by
        # the training loss (not the validation loss).
        if schedule:
            scheduler.step(avg_train_loss)

    return train_loss_history, val_loss_history

# get dataset ready to be used in pytorch training loop
def prepare_data(db, inds, deps, input_scaler=None, output_scaler=None):
    """Scale the selected columns of ``db`` and wrap them in a ``TensorDataset``.

    Args:
        db: DataFrame holding both the input and the target columns.
        inds: Column labels used as model inputs.
        deps: Column labels used as regression targets.
        input_scaler: Scaler applied to the inputs via ``transform``; when
            ``None`` a new ``StandardScaler`` is fitted on the input columns.
        output_scaler: Scaler applied to the targets via ``transform``; when
            ``None`` a new ``StandardScaler`` is fitted on the target columns.

    Returns:
        ``(dataset, input_scaler, output_scaler)``: the scaled inputs and
        targets as float32 tensors in a ``TensorDataset``, plus the scalers
        that were applied (newly fitted or the ones passed in).
    """

    def _scale(values, scaler):
        # Fit a fresh scaler only when the caller did not supply one.
        if scaler is not None:
            return scaler.transform(values), scaler
        scaler = StandardScaler()
        return scaler.fit_transform(values), scaler

    raw_inputs = db[inds].values
    raw_targets = db[deps].values

    scaled_inputs, input_scaler = _scale(raw_inputs, input_scaler)
    scaled_targets, output_scaler = _scale(raw_targets, output_scaler)

    dataset = TensorDataset(
        torch.tensor(scaled_inputs, dtype=torch.float32),
        torch.tensor(scaled_targets, dtype=torch.float32),
    )
    return dataset, input_scaler, output_scaler

# make predictions on dataset based on trained model
def make_predictions(model, dataset, input_scaler, output_scaler, batch_size=1024):
    """Run ``model`` over ``dataset`` and return targets and predictions, both un-scaled.

    The model is called as-is under ``torch.no_grad()``; this function does not
    switch it to evaluation mode.

    Args:
        model: Callable mapping a batch of inputs to a tensor of predictions.
        dataset: Dataset yielding ``(inputs, targets)`` pairs, as produced by
            ``TensorDataset``.
        input_scaler: Unused; kept so existing callers do not need to change.
        output_scaler: Object whose ``inverse_transform`` is applied to both the
            predictions and the targets of every batch.
        batch_size: Number of samples per forward pass.

    Returns:
        ``(all_targets, all_preds)`` -- note the order -- as NumPy arrays
        stacked row-wise over all batches.
    """
    pred_chunks = []
    target_chunks = []
    loader = DataLoader(dataset, batch_size=batch_size)
    with torch.no_grad():
        for xb, yb in loader:
            pred_chunks.append(output_scaler.inverse_transform(model(xb).numpy()))
            target_chunks.append(output_scaler.inverse_transform(yb.numpy()))
    return np.vstack(target_chunks), np.vstack(pred_chunks)

# quick function to select datasets based on saved file name
def choose_datasets(folder, snr, bw_start, use_base=False):
    """Read the ``db_delta_...`` and ``db_gmm_...`` CSV files for one SNR and band.

    Args:
        folder: Directory containing the CSV files. A trailing separator is
            optional; an empty string reads from the current working directory.
        snr: Value inserted after ``snr`` in the file names.
        bw_start: Band start; the file names contain ``{bw_start}to{bw_start+1}``.
        use_base: Currently unused; kept for interface compatibility.

    Returns:
        ``(db, db2)``: DataFrames read from the ``db_delta_...`` and
        ``db_gmm_...`` files respectively.
    """
    band = f'{bw_start}to{bw_start+1}'
    # Previously `folder` was ignored and the files were looked up in the
    # working directory; os.path.join copes with or without a trailing slash.
    db = pd.read_csv(os.path.join(folder, f'db_delta_snr{snr}_complex_{band}.csv'))
    db2 = pd.read_csv(os.path.join(folder, f'db_gmm_snr{snr}_complex_{band}.csv'))
    return db, db2

# create arrays of names of inputs and outputs for training
def create_training_columns(segments, *, n_coefs=5):
    """Build the lists of input and output column names used for training.

    Args:
        segments: Number of segments; indices ``0 .. segments-1`` are used.
        n_coefs: Number of coefficient indices per segment. Keyword-only, so a
            stray second positional argument still raises ``TypeError`` instead
            of silently changing the column list.

    Returns:
        ``(independents, dependents)``: ``independents`` holds
        ``segment{seg}_{coef}_real`` followed by ``segment{seg}_{coef}_imag``
        for every segment and coefficient, in that nesting order;
        ``dependents`` is ``['mean', 'std', 'skew', 'inv_kurt']``.
    """
    independents = [
        f'segment{seg}_{coef}_{part}'
        for seg in range(segments)
        for coef in range(n_coefs)
        for part in ('real', 'imag')
    ]
    dependents = ['mean', 'std', 'skew', 'inv_kurt']
    return independents, dependents

# Train Network on Data

## Load Data

In [None]:
folder      = '/your/folder/here'   # base folder; also used below for plots and Predictions/
snr         = 0         # 0, 10, or 100. SNR measured in average amplitude
run_num     = 0         # if you want to collect results from multiple training runs
bw_start    = 0         # 0 or 1, depending on interrogating wave's frequency band (default 0)
# choose_datasets takes (folder, snr, bw_start); run_num is not one of its arguments.
db, db2     = choose_datasets(folder, snr, bw_start)

687800


## Select the independent and dependent variables

In [None]:
segments                    = 5
# create_training_columns accepts only the segment count as a positional argument.
independents, dependents    = create_training_columns(segments)

print(independents)
print(dependents)

['segment0_0_real', 'segment0_0_imag', 'segment0_1_real', 'segment0_1_imag', 'segment0_2_real', 'segment0_2_imag', 'segment0_3_real', 'segment0_3_imag', 'segment0_4_real', 'segment0_4_imag', 'segment1_0_real', 'segment1_0_imag', 'segment1_1_real', 'segment1_1_imag', 'segment1_2_real', 'segment1_2_imag', 'segment1_3_real', 'segment1_3_imag', 'segment1_4_real', 'segment1_4_imag', 'segment2_0_real', 'segment2_0_imag', 'segment2_1_real', 'segment2_1_imag', 'segment2_2_real', 'segment2_2_imag', 'segment2_3_real', 'segment2_3_imag', 'segment2_4_real', 'segment2_4_imag', 'segment3_0_real', 'segment3_0_imag', 'segment3_1_real', 'segment3_1_imag', 'segment3_2_real', 'segment3_2_imag', 'segment3_3_real', 'segment3_3_imag', 'segment3_4_real', 'segment3_4_imag', 'segment4_0_real', 'segment4_0_imag', 'segment4_1_real', 'segment4_1_imag', 'segment4_2_real', 'segment4_2_imag', 'segment4_3_real', 'segment4_3_imag', 'segment4_4_real', 'segment4_4_imag']
['mean', 'std', 'skew', 'inv_kurt']


## run training

In [None]:
# Seed torch's global RNG so weight initialisation, the train/validation split
# and batch shuffling are repeatable for a given run number.
torch.manual_seed(run_num)

# NOTE(review): the scalers are fitted on the full dataset before the split
# below, so validation rows contribute to the scaling statistics.
dataset, _in, _out  = prepare_data(db, independents, dependents)
model               = MLPRegressor(input_dim=len(independents), output_dim=len(dependents))
# loss_fn, optimizer and scheduler are read as globals inside train().
loss_fn             = nn.HuberLoss()
optimizer           = optim.Adam(model.parameters(), lr=8e-4)
scheduler           = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=10)
# 80/20 train/validation split.
val_size            = int(0.2 * len(dataset))
train_size          = len(dataset) - val_size
_train, _val        = random_split(dataset, [train_size, val_size])
train_loader        = DataLoader(_train, batch_size=512, shuffle=True)
val_loader          = DataLoader(_val, batch_size=512)
tl_hist, vl_hist    = train(model, train_loader, val_loader, epochs=500, schedule=True)

In [None]:
# Per-epoch loss curves on a log-scaled y axis.
plt.plot(vl_hist, label='Validation Loss')
plt.plot(tl_hist, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.yscale('log')
plt.legend()

# save plot
# os.path.join inserts the separator whether or not `folder` ends with one
# (plain concatenation produced '<folder>plots/...' for a folder without it).
plt.savefig(os.path.join(folder, 'plots', f'loss_snr{snr}_complex_{bw_start}to{bw_start+1}_run{run_num}.png'))
plt.show()

## save trained model and scalers

In [None]:
# The trailing '' makes os.path.join end the path with a separator, so the
# string concatenations below work whether or not `folder` ends with one.
sub_folder = os.path.join(folder, 'Predictions', '')
# The whole model object is pickled (not just a state_dict); the reload cell
# therefore uses torch.load(..., weights_only=False).
torch.save(model, sub_folder + f'models/polys_snr{snr}_complex_{bw_start}to{bw_start+1}_run{run_num}.pt')
joblib.dump(_in, sub_folder + f'pickles/input_scaler_snr{snr}_{bw_start}to{bw_start+1}_run{run_num}.pkl')
joblib.dump(_out, sub_folder + f'pickles/output_scaler_snr{snr}_{bw_start}to{bw_start+1}_run{run_num}.pkl')

# Evaluate Results

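### Reload the saved model and settings

Run the cells in this section only if you did not just train the network above, i.e. if `snr`, `run_num`, `bw_start`, `model`, `db`, `db2`, `_in`, `_out`, `independents` and `dependents` are not already set. `folder` must already be defined.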

In [None]:
# The trailing '' keeps a final separator; os.path.join avoids a doubled or
# missing slash regardless of how `folder` is written.
sub_folder = os.path.join(folder, 'Predictions', '')

In [None]:
snr         = 0
run_num     = 0
bw_start    = 0
# weights_only=False unpickles the full model object, which can execute
# arbitrary code: only load checkpoint files you created yourself.
model       = torch.load(sub_folder + f'models/polys_snr{snr}_complex_{bw_start}to{bw_start+1}_run{run_num}.pt',
                         weights_only=False)
# choose_datasets takes (folder, snr, bw_start); run_num is not one of its arguments.
db, db2     = choose_datasets(folder, snr, bw_start)

model.eval()

In [None]:
# Reload the input and output scalers written with joblib.dump for this
# snr / band / run combination. joblib.load unpickles the files, so only load
# scaler files from a trusted source.
_in     = joblib.load(sub_folder + f'pickles/input_scaler_snr{snr}_{bw_start}to{bw_start+1}_run{run_num}.pkl')
_out    = joblib.load(sub_folder + f'pickles/output_scaler_snr{snr}_{bw_start}to{bw_start+1}_run{run_num}.pkl')

In [None]:
segments                    = 5
# create_training_columns accepts only the segment count as a positional argument.
independents, dependents    = create_training_columns(segments)

### make predictions

In [None]:
scattertype     = 'delta'   # 'delta' evaluates db, 'gmm' evaluates db2

print("scatter type:", scattertype)
print('')

# Reuse the already-fitted scalers so the data is scaled the same way as in training.
if scattertype == 'delta':
    dataset, _, _   = prepare_data(db, independents, dependents, _in, _out)
elif scattertype == 'gmm':
    dataset, _, _   = prepare_data(db2, independents, dependents, _in, _out)
else:
    # Fail loudly instead of silently evaluating a `dataset` left over from an earlier cell.
    raise ValueError(f"scattertype must be 'delta' or 'gmm', got {scattertype!r}")

targets, preds  = make_predictions(model, dataset, _in, _out)

# Per-target errors: axis=0 averages over samples, leaving one value per output column.
rmse            = np.sqrt(np.mean((preds - targets)**2, axis=0))
mae             = np.mean(np.abs(preds - targets), axis=0)

for name, error in zip(dependents, rmse):
    print(f"RMSE for {name}: {error:.4f}")
print('')
for name, error in zip(dependents, mae):
    print(f"MAE for {name}: {error:.4f}")

### save results

In [None]:
# One '<target>_pred' column per dependent variable.
db_preds    = pd.DataFrame(preds, columns=[d + '_pred' for d in dependents])

# Append the prediction columns to the table the predictions were made from.
if scattertype == 'delta':
    db_w_preds  = pd.concat([db, db_preds], axis=1)
    filename = f'db_delta_snr{snr}_complex_w_preds_{bw_start}to{bw_start+1}_{run_num}.csv'
elif scattertype == 'gmm':
    db_w_preds  = pd.concat([db2, db_preds], axis=1)
    filename = f'db_gmm_snr{snr}_complex_w_preds_{bw_start}to{bw_start+1}_{run_num}.csv'
else:
    # Fail loudly instead of writing a stale `db_w_preds` under a stale `filename`.
    raise ValueError(f"scattertype must be 'delta' or 'gmm', got {scattertype!r}")

db_w_preds.to_csv(sub_folder + filename, index=False)