# **Network Training**

This tutorial demonstrates how to train the double LSTM-based model for sequence-to-scalar prediction of the future hysteresis step. The model may serve as a good starting point for neural-network-based transient magnetic modeling. The network model will be trained on the 3C90_Training_Tutorial.h5 file and saved as a state dictionary (.sd) file. The training data is an all-frequency-inclusive dataset with 50 sequences per frequency, each sequence containing only 1000 randomly selected time steps for training, covering all available temperatures and flux excitations.


# **Step 0: Import Packages**

In this step we import the important packages that are necessary for the training.

In [8]:
# Mount Google Drive at /content/drive; the dataset, normalization-parameter,
# CSV and model paths used later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
import json
import math
import csv
import time
import h5py

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Step 1: Define Network Structure**
The structure of the dual LSTM-based encoder-projector-decoder neural network is defined here. Refer to the PyTorch documentation for more details.

In [9]:
# Define model structures and functions

class Net(nn.Module):
    """Dual-LSTM sequence-to-scalar network.

    Two independent single-layer, unidirectional LSTMs encode the ``seq_B``
    and ``seq_H`` input sequences. The output of each LSTM at the last time
    step is concatenated with the two scalar inputs (``scal`` and ``T``) and
    passed through a fully connected projector that emits one value per
    sample.

    Args:
        hidden_size: Number of hidden units in each LSTM. The default (8)
            reproduces the original layer names and shapes, so state
            dictionaries saved with the original definition still load.
    """

    def __init__(self, hidden_size: int = 8):
        super().__init__()

        self.hidden_size = hidden_size

        # Each sequence has a single feature per time step (input_size=1).
        self.lstm_B = nn.LSTM(1, hidden_size, num_layers=1, batch_first=True, bidirectional=False)

        self.lstm_H = nn.LSTM(1, hidden_size, num_layers=1, batch_first=True, bidirectional=False)

        # Projector input = both LSTM encodings plus the two scalar features.
        n_features = hidden_size * 2 + 2
        self.projector = nn.Sequential(
            nn.Linear(n_features, n_features),
            nn.ReLU(),
            nn.Linear(n_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 4),
            nn.ReLU(),
            nn.Linear(4, 1)
        )

    def forward(self, seq_B: Tensor, seq_H: Tensor, scal: Tensor, T: Tensor, device=None) -> Tensor:
        """Predict one scalar per sample.

        All inputs are cast to float32 before use and must already be on the
        same device as the model parameters.

        Args:
            seq_B: Sequence input of shape (batch, seq_len, 1).
            seq_H: Sequence input of shape (batch, seq_len, 1).
            scal: Scalar feature, 2-D (batch, features); together with ``T``
                it must contribute exactly 2 feature columns.
            T: Scalar feature, 2-D (batch, features); see ``scal``.
            device: Optional device to move the result to. When ``None``
                (default) the result stays on the device it was computed on.

        Returns:
            Tensor of shape (batch, 1).
        """
        encoded_B, _ = self.lstm_B(seq_B.float())
        encoded_H, _ = self.lstm_H(seq_H.float())

        # Only the last time step of each encoded sequence is used.
        features = torch.cat(
            (scal.float(), T.float(), encoded_B[:, -1, :], encoded_H[:, -1, :]),
            dim=1,
        )
        output = self.projector(features)

        if device is not None:
            output = output.to(device)

        return output

# **Step 2: Load the Training Dataset**

The dataset needs to be processed before training; this includes normalizing the data. In this part, we load and pre-process the dataset for network training and testing. In this demo, a small dataset measured on the 3C90 ferrite material, covering all frequencies, temperatures, and flux excitations, is used. The full dataset can be downloaded from the MagNet Challenge GitHub repository.

In [12]:
# Load the dataset
def _standardize(values):
    """Return ``(normalized, mean, std)`` using the tensor's global mean and std."""
    mean = torch.mean(values)
    std = torch.std(values)
    return (values - mean) / std, mean, std


def get_dataset(data_length=80,
                data_path='/content/drive/MyDrive/MLTran/3C90_Training_Tutorial.h5',
                norm_params_path='/content/drive/MyDrive/MLTran/Normalization_Params.json',
                num_freq=7):
    """Load the HDF5 training data, standardize it and wrap it in a TensorDataset.

    For every frequency index ``i`` in ``1..num_freq`` the datasets
    ``B_seq_f_i``, ``H_seq_f_i``, ``B_scal_i``, ``H_scal_i`` and ``T_i`` are
    read and concatenated along axis 0. Each of the five quantities is then
    standardized with its own global mean and standard deviation, computed
    over everything that was loaded (i.e. before any train/validation split).
    The statistics are written to ``norm_params_path`` as JSON so the same
    scaling can be re-applied elsewhere.

    Args:
        data_length: Number of time steps per sequence; the B and H sequences
            are viewed as (-1, data_length, 1).
        data_path: Path of the HDF5 file to read. For more information on
            h5py, see https://docs.h5py.org/en/latest/build.html
        norm_params_path: Path of the JSON file that receives the
            normalization statistics (overwritten on each call).
        num_freq: Number of frequency groups stored in the file.

    Returns:
        ``torch.utils.data.TensorDataset`` with tensors in the order
        ``(in_B, in_H, B_scal, T_scal, out)``.
    """
    name_templates = ('B_seq_f_{}', 'H_seq_f_{}', 'B_scal_{}', 'H_scal_{}', 'T_{}')

    with h5py.File(data_path, 'r') as file:
        print("keys:", list(file.keys()))

        # Read every frequency group of each quantity and stack along axis 0.
        # The unpacking consumes the generator while the file is still open.
        B, H, B_scal, H_out, T_scal = (
            np.concatenate(
                [file[template.format(i)][:] for i in range(1, num_freq + 1)],
                axis=0,
            )
            for template in name_templates
        )

    print("Data Loading Initiated")

    # Scalars become column tensors; sequences get a trailing feature axis.
    B_scal = torch.from_numpy(B_scal.reshape(-1, 1))
    T_scal = torch.from_numpy(T_scal.reshape(-1, 1))

    in_B = torch.from_numpy(B).float().view(-1, data_length, 1)
    in_H = torch.from_numpy(H).float().view(-1, data_length, 1)
    out = torch.from_numpy(H_out).float().view(-1, 1)

    # Standardize each quantity once and keep the statistics for the JSON file.
    in_B, mean_B, std_B = _standardize(in_B)
    in_H, mean_H, std_H = _standardize(in_H)
    out, mean_out, std_out = _standardize(out)
    B_scal, mean_scal, std_scal = _standardize(B_scal)
    T_scal, mean_T, std_T = _standardize(T_scal)

    # Save the normalization parameters to a JSON file
    with open(norm_params_path, 'w') as f:
        json.dump({'mean_B': mean_B.tolist(),
                   'std_B': std_B.tolist(),
                   'mean_H': mean_H.tolist(),
                   'std_H': std_H.tolist(),
                   'mean_out': mean_out.tolist(),
                   'std_out': std_out.tolist(),
                   'mean_Scal': mean_scal.tolist(),
                   'std_Scal': std_scal.tolist(),
                   'mean_T': mean_T.tolist(),
                   'std_T': std_T.tolist()}, f)

    print(in_B.size())
    print(in_H.size())
    print(B_scal.size())
    print(out.size())

    return torch.utils.data.TensorDataset(in_B, in_H, B_scal, T_scal, out)

# **Step 3: Training the Model**

The loaded dataset is randomly split into a training set, a validation set, and a test set. The output of the training is saved into a state dictionary file (.sd) containing all the trained parameter values. In this example, however, the test set data comes from a separate file.

In [13]:
# Define parameters
# Define the memory length (time steps per sequence) your data is saved with
data_length = 80

# Count the number of parameters
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is false are not counted.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total


# Config the model training

def main():
    """Train ``Net`` on the tutorial dataset and checkpoint on validation improvement.

    Steps: seed the RNGs, load and split the dataset (80 % train, 10 %
    validation, remainder test), train with Adam and MSE loss using a
    step-wise learning-rate decay, append the scaled train/validation losses
    to a CSV every 50 epochs, and save a new state-dictionary file each time
    the validation loss reaches a new minimum.

    Runs on CUDA when available and otherwise falls back to the CPU.
    Output locations are fixed under ``/content/drive/MyDrive/MLTran``.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Reproducibility
    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Hyperparameters
    NUM_EPOCH = 200
    BATCH_SIZE = 2048
    DECAY_EPOCH = 50
    DECAY_RATIO = 0.6
    LR_INI = 0.001
    LOG_EVERY = 50

    # Output locations
    CSV_PATH = "/content/drive/MyDrive/MLTran/Training_.csv"
    MODEL_DIR = "/content/drive/MyDrive/MLTran/Model"

    times = []
    results = []

    # Prefer the GPU, but do not crash on machines without one
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Load dataset, using the sequence length configured at module level
    dataset = get_dataset(data_length=data_length)

    # Split the dataset
    train_size = int(0.8 * len(dataset))
    valid_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - valid_size
    train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, valid_size, test_size])

    # Pinned host memory is only meaningful (and only valid) with CUDA
    kwargs = {'num_workers': 0}
    if use_cuda:
        kwargs.update(pin_memory=True, pin_memory_device="cuda")

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, **kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, **kwargs)

    # NOTE: the batches are materialised once here, so every epoch reuses the
    # same batch composition and order (the shuffle happens a single time).
    trainData = list(train_loader)
    validData = list(valid_loader)
    # Not used in training; a separate dataset file is used for testing.
    # NOTE(review): kept on purpose - creating the loader iterator may draw
    # from the global RNG, so removing it could change the seeded weight
    # initialisation below. Confirm before deleting.
    testData = list(test_loader)

    # Setup network
    net = Net().to(device)

    # Log the number of parameters
    print("Number of parameters: ", count_parameters(net))
    print("Number of parameters: ", count_parameters(net.lstm_B))
    print("Number of parameters: ", count_parameters(net.projector))

    # Setup optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LR_INI)

    # torch.save does not create missing directories
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Record initial time
    start_time = time.time()

    # Set up the variables for the model saving
    min_loss = float("inf")
    num_saved = 0

    # Train the network
    for epoch_i in range(NUM_EPOCH):

        start_epoch = time.time()

        # Train for one epoch
        epoch_train_loss = 0
        net.train()

        # Step decay: multiply the rate by DECAY_RATIO every DECAY_EPOCH epochs
        current_lr = LR_INI * (DECAY_RATIO ** (epoch_i // DECAY_EPOCH))
        for group in optimizer.param_groups:
            group['lr'] = current_lr

        for in_B, in_H, B_scal, T_scal, out in trainData:
            optimizer.zero_grad()
            output = net(seq_B=in_B.to(device), seq_H=in_H.to(device),
                         scal=B_scal.to(device), T=T_scal.to(device), device=device)
            loss = criterion(output, out.to(device))
            loss.backward()

            optimizer.step()
            epoch_train_loss += loss.item()

        # Compute Validation Loss
        net.eval()
        with torch.no_grad():
            epoch_valid_loss = 0
            for in_B, in_H, B_scal, T_scal, out in validData:
                output_valid = net(seq_B=in_B.to(device), seq_H=in_H.to(device),
                                   scal=B_scal.to(device), T=T_scal.to(device), device=device)
                loss = criterion(output_valid, out.to(device))
                epoch_valid_loss += loss.item()

        end_epoch = time.time()
        times.append(end_epoch - start_epoch)

        # NOTE: the criterion already averages within a batch, so these sums of
        # batch means divided by the sample count are relative metrics rather
        # than the true per-sample MSE. The scaling is kept unchanged so that
        # logged values stay comparable with earlier runs.
        train_metric = epoch_train_loss / len(train_dataset)
        valid_metric = epoch_valid_loss / len(valid_dataset)

        # Print the training and validation loss and save them in a csv file
        if (epoch_i + 1) % LOG_EVERY == 0:
            print(f"Epoch {epoch_i+1:2d} "
                  f"Train {train_metric * 1e5:.5f} "
                  f"Valid {valid_metric * 1e5:.5f}")
            results.append([epoch_i + 1, train_metric * 1e5, valid_metric * 1e5])
            # newline="" is what the csv module expects; without it some
            # platforms emit blank lines between rows.
            with open(CSV_PATH, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerows(results)

        # Save the model parameters each time the validation loss hits a new minimum
        if valid_metric < min_loss:
            min_loss = valid_metric

            torch.save(net.state_dict(), f"{MODEL_DIR}/Model_LSTM_{num_saved}.sd")
            num_saved = num_saved + 1

    elapsed = time.time() - start_time
    print(f"Total Time Elapsed: {elapsed}")
    print(f"Average time per Epoch: {sum(times)/NUM_EPOCH}")


# Entry point: start the training run when this file/cell is executed directly.
if __name__ == "__main__":
    main()

keys: ['B_scal_1', 'B_scal_2', 'B_scal_3', 'B_scal_4', 'B_scal_5', 'B_scal_6', 'B_scal_7', 'B_seq_f_1', 'B_seq_f_2', 'B_seq_f_3', 'B_seq_f_4', 'B_seq_f_5', 'B_seq_f_6', 'B_seq_f_7', 'H_scal_1', 'H_scal_2', 'H_scal_3', 'H_scal_4', 'H_scal_5', 'H_scal_6', 'H_scal_7', 'H_seq_f_1', 'H_seq_f_2', 'H_seq_f_3', 'H_seq_f_4', 'H_seq_f_5', 'H_seq_f_6', 'H_seq_f_7', 'T_1', 'T_2', 'T_3', 'T_4', 'T_5', 'T_6', 'T_7']
Data Loading Initiated
torch.Size([350000, 80, 1])
torch.Size([350000, 80, 1])
torch.Size([350000, 1])
torch.Size([350000, 1])
Number of parameters:  1239
Number of parameters:  352
Number of parameters:  535
Epoch 50 Train 0.00837 Valid 0.01105
Epoch 100 Train 0.00686 Valid 0.00780
Epoch 150 Train 0.00651 Valid 0.00685
Epoch 200 Train 0.00641 Valid 0.00654
Total Time Elapsed: 218.83896374702454
Average time per Epoch: 1.0897246634960174
