# Training

For some reason, the script train.py crashes WSL after a couple of epochs. Below is its full code; run it from this notebook if you get any errors when running it directly from the CLI.


In [1]:
import torch
import cv2
import numpy as np
import os
import pickle
from core.dummy_videos import create_dummy_video
from torch.utils.data import Dataset


# Number of input frames (pre) and target frames (aft) per sample; both are
# passed to create_dummy_video and process_folder further down.
pre_seq_length = 20
aft_seq_length = 10
# Batch size used by the train/val DataLoaders, and the 'epoch' value stored
# in the training config.
batch_size = 4
epoch = 30

# Folders scanned for videos. test_dir is the folder handed to
# create_dummy_video as its second argument.
train_dir = 'train'
val_dir = 'val'
test_dir = 'auxiliary_dirs/test_dummy'

def select_random_file(folder_path):
    """Return the full path of a randomly chosen regular file in a folder.

    Sub-directories are ignored.

    Args:
        folder_path: Directory to pick a file from.

    Returns:
        The path (``folder_path`` joined with the file name) of one regular
        file chosen uniformly at random, or ``None`` when the folder holds no
        regular files or does not exist (a message is printed in that case).
    """
    import random  # local import: not part of the module-level imports

    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print("The specified folder does not exist.")
        return None

    candidates = [
        os.path.join(folder_path, name)
        for name in entries
        if os.path.isfile(os.path.join(folder_path, name))
    ]
    if not candidates:
        return None

    # Previously the first directory entry was always returned, which is
    # neither random nor stable across filesystems.
    return random.choice(candidates)

# Create test dummy video
# NOTE(review): select_random_file returns None when train_dir is missing or
# contains no files, so create_dummy_video may receive None as its input path
# -- confirm how the helper handles that.
create_dummy_video(select_random_file(train_dir), test_dir, pre_seq_length, aft_seq_length)

# Set the precision for matrix multiplications to utilize Tensor Cores
torch.set_float32_matmul_precision('high')

def sample_frames(video_path, num_frames=20):
    """Read ``num_frames`` uniformly spaced frames from a video file.

    Args:
        video_path: Path of the video opened with ``cv2.VideoCapture``.
        num_frames: How many frame indices to sample between the first and
            the last frame (both included). Indices repeat when the video has
            fewer frames than requested.

    Returns:
        ``np.ndarray`` with the sampled frames stacked along a new first axis.

    Raises:
        OSError: If the video cannot be opened.
        ValueError: If the video reports no frames.
        RuntimeError: If one of the sampled frames cannot be read.
    """
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise ValueError(f"Video reports no frames: {video_path}")

        # uniformly sample frame indices over the whole video
        frame_idxs = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        frames = []
        for idx in frame_idxs:
            video.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ok, frame = video.read()
            # A failed read yields no frame; stacking it would silently
            # produce an object array that only fails much later.
            if not ok or frame is None:
                raise RuntimeError(
                    f"Could not read frame {int(idx)} of video: {video_path}")
            frames.append(frame)
    finally:
        # Always release the capture, including on the error paths above.
        video.release()
    return np.stack(frames)

def process_folder(folder_path, pre_slen=10, aft_slen=10, suffix='.avi'):
    """Load every video in a folder and split it into input/target frames.

    Each regular file whose name ends with ``suffix`` is sampled to
    ``pre_slen + aft_slen`` frames with ``sample_frames``. Files are visited
    in sorted name order so the sample order is reproducible.

    Args:
        folder_path: Directory containing the videos.
        pre_slen: Number of leading frames returned as inputs.
        aft_slen: Number of following frames returned as targets.
        suffix: File-name suffix a file must have to be loaded.

    Returns:
        Tuple ``(inputs, targets)``: the first ``pre_slen`` frames and the
        remaining frames of every video, with the last frame axis moved in
        front of the two preceding ones (assumes frames are H x W x C, giving
        B x T x C x H x W -- confirm against the video source).

    Raises:
        ValueError: If the folder contains no file matching ``suffix``.
    """
    video_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(folder_path, name))
    ]
    if not video_paths:
        # np.stack on an empty list fails with an unhelpful message.
        raise ValueError(
            f"No '{suffix}' videos found in folder: {folder_path}")

    videos = [sample_frames(path, pre_slen + aft_slen) for path in video_paths]
    # stack the videos and move the channel axis in front of height/width
    data = np.stack(videos).transpose(0, 1, 4, 2, 3)

    # if the data is in [0, 255], rescale it into [0, 1]
    if data.max() > 1.0:
        data = data.astype(np.float32) / 255.0

    return data[:, :pre_slen], data[:, pre_slen:]

class CustomDataset(Dataset):
    """In-memory dataset of (input frames, target frames) pairs.

    Args:
        X: Input sequences, indexed by sample along the first axis.
        Y: Target sequences aligned with ``X`` along the first axis.
        normalize: When True, standardize ``X`` and ``Y`` per channel with the
            mean/std computed from ``X``. Arrays are then expected to be laid
            out as B x T x C x H x W.
        data_name: Name stored on the instance as ``data_name``.

    Attributes set by ``__init__``: ``mean`` and ``std`` hold the per-channel
    statistics with shape (1, 1, C, 1, 1) when ``normalize`` is True, and are
    ``None`` otherwise.
    """

    def __init__(self, X, Y, normalize=False, data_name='custom'):
        super().__init__()
        self.X = X
        self.Y = Y
        self.mean = None
        self.std = None
        self.data_name = data_name

        if normalize:
            # Reduce over every axis except the channel axis (index 2).
            # The previous code read an undefined `data` variable and reduced
            # over axes (0, 1, 2, 3), which keeps the width axis instead.
            reduce_axes = (0, 1, 3, 4)
            mean = self.X.mean(axis=reduce_axes).reshape(1, 1, -1, 1, 1)
            std = self.X.std(axis=reduce_axes).reshape(1, 1, -1, 1, 1)
            # A constant channel has std 0; divide by 1 there instead.
            std = np.where(std == 0, 1.0, std)
            # Inputs and targets share the same statistics so predictions can
            # be mapped back with a single (mean, std) pair.
            self.X = (self.X - mean) / std
            self.Y = (self.Y - mean) / std
            self.mean = mean
            self.std = std

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``index``-th (input, target) pair as float tensors."""
        data = torch.tensor(self.X[index]).float()
        labels = torch.tensor(self.Y[index]).float()
        return data, labels

# Build the dataset dictionary: every folder contributes two entries, keyed
# 'X_<folder>' (first pre_seq_length frames) and 'Y_<folder>' (remaining frames).
dataset = {}
folders = [train_dir, val_dir, test_dir]
for folder in folders:
    data_x, data_y = process_folder(folder, pre_slen=pre_seq_length, aft_slen=aft_seq_length, suffix='.avi')
    dataset['X_' + folder], dataset['Y_' + folder] = data_x, data_y

# save as a pkl file
with open('dataset_train.pkl', 'wb') as f:#CHANGE HERE
    pickle.dump(dataset, f)

# load the dataset
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this script (as written just above).
with open('dataset_train.pkl', 'rb') as f:#CHANGE HERE
    dataset = pickle.load(f)

# Look every split up by its folder name: the dictionary keys are built as
# 'X_<folder>' / 'Y_<folder>', so a hard-coded 'X_train' key would break as
# soon as train_dir is changed.
X_train, Y_train = dataset[f'X_{train_dir}'], dataset[f'Y_{train_dir}']
X_val, Y_val = dataset[f'X_{val_dir}'], dataset[f'Y_{val_dir}']
X_test, Y_test = dataset[f'X_{test_dir}'], dataset[f'Y_{test_dir}']

# Lower-case aliases of the training split, kept because later code reads
# train_x.shape.
train_x, train_y = X_train, Y_train
print(train_x.shape)
# the shape is B x T x C x H x W
# B: the number of samples
# T: the number of frames in each sample
# C, H, W: the channel, height, width of each frame

# Wrap each split in a dataset (no normalization is requested here).
train_set = CustomDataset(X=X_train, Y=Y_train)
val_set = CustomDataset(X=X_val, Y=Y_val)
test_set = CustomDataset(X=X_test, Y=Y_test)

# Only the training loader is shuffled. Validation and test keep the dataset
# order so evaluation outputs are reproducible and line up with the inputs.
dataloader_train = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, pin_memory=True)
dataloader_val = torch.utils.data.DataLoader(
    val_set, batch_size=batch_size, shuffle=False, pin_memory=True)
dataloader_test = torch.utils.data.DataLoader(
    test_set, batch_size=1, shuffle=False, pin_memory=True)


# Training settings merged into the experiment config further down.
custom_training_config = {
    'pre_seq_length': pre_seq_length,
    'aft_seq_length': aft_seq_length,
    'total_length': pre_seq_length + aft_seq_length,
    'batch_size': batch_size,
    'val_batch_size': batch_size,
    'epoch': epoch,
    'lr': 0.001,   
    'metrics': ['mse', 'mae'],

    'ex_name': 'custom_exp',
    'dataname': 'custom',
    'in_shape': [pre_seq_length, train_x.shape[2], train_x.shape[3], train_x.shape[4]], # pre_seq_length, channels, height, width
}

# Model settings merged into the experiment config further down.
custom_model_config = {
    # For MetaVP models, the most important hyperparameters are:
    # N_S, N_T, hid_S, hid_T, model_type
    'method': 'SimVP',
    # Users can either use a config file or set these hyperparameters directly
    # 'config_file': 'configs/custom/example_model.py',
    
    # Here, we directly set these parameters
    'model_type': 'gSTA',
    'N_S': 4,
    'N_T': 8,
    'hid_S': 64,
    'hid_T': 256
}

from openstl.api import BaseExperiment
from openstl.utils import create_parser, default_parser

# Parse an empty argument list to get a namespace without command-line input.
# `config` is the namespace's own __dict__, so every update below is also
# visible through `args`.
args = create_parser().parse_args([])
config = args.__dict__

# update the training config
config.update(custom_training_config)
# update the model config
config.update(custom_model_config)
# fill in default values for every option that is still None
default_values = default_parser()
for attribute in default_values.keys():
    if config[attribute] is None:
        config[attribute] = default_values[attribute]

# The loaders are passed as a (train, val, test) tuple.
exp = BaseExperiment(args, dataloaders=(dataloader_train, dataloader_val, dataloader_test), strategy='auto')

print('>'*35 + ' training ' + '<'*35)
exp.train()

Error: Could not open input video.
Dummy video saved in auxiliary_dirs/test_dummy folder.
(200, 20, 3, 32, 32)


Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> training <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<



  | Name      | Type        | Params | Mode 
--------------------------------------------------
0 | model     | SimVP_Model | 43.6 M | train
1 | criterion | MSELoss     | 0      | train
--------------------------------------------------
43.6 M    Trainable params
0         Non-trainable params
43.6 M    Total params
174.482   Total estimated model params size (MB)
215       Modules in train mode
0         Modules in eval mode


Environment info:
------------------------------------------------------------
sys.platform: linux
Python: 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]
CUDA available: True
CUDA_HOME: /usr/local/cuda
NVCC: Build cuda_12.5.r12.5/compiler.34177558_0
GPU 0: NVIDIA GeForce RTX 3060 Ti
GCC: gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
PyTorch: 2.4.1+cu121
PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;

Sanity Checking: |                                                                                | 0/? [00:00…

Training: |                                                                                       | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

Epoch 0, global step 50: 'val_loss' reached 0.00584 (best 0.00584), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=00-val_loss=0.006.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 1: Lr: 0.0001528 | Train Loss: 0.0191539 | Vali Loss: 0.0032419


Epoch 1, global step 100: 'val_loss' reached 0.00324 (best 0.00324), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=01-val_loss=0.003.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 2: Lr: 0.0002810 | Train Loss: 0.0057198 | Vali Loss: 0.0027739


Epoch 2, global step 150: 'val_loss' reached 0.00277 (best 0.00277), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=02-val_loss=0.003.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 3: Lr: 0.0004381 | Train Loss: 0.0047069 | Vali Loss: 0.0028669


Epoch 3, global step 200: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 4: Lr: 0.0006052 | Train Loss: 0.0046112 | Vali Loss: 0.0026521


Epoch 4, global step 250: 'val_loss' reached 0.00265 (best 0.00265), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=04-val_loss=0.003.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 5: Lr: 0.0007619 | Train Loss: 0.0046699 | Vali Loss: 0.0029399


Epoch 5, global step 300: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 6: Lr: 0.0008894 | Train Loss: 0.0046736 | Vali Loss: 0.0025427


Epoch 6, global step 350: 'val_loss' reached 0.00254 (best 0.00254), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=06-val_loss=0.003.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 7: Lr: 0.0009721 | Train Loss: 0.0049591 | Vali Loss: 0.0026712


Epoch 7, global step 400: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 8: Lr: 0.0010000 | Train Loss: 0.0042635 | Vali Loss: 0.0025278


Epoch 8, global step 450: 'val_loss' reached 0.00253 (best 0.00253), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=08-val_loss=0.003.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 9: Lr: 0.0009942 | Train Loss: 0.0041291 | Vali Loss: 0.0021646


Epoch 9, global step 500: 'val_loss' reached 0.00216 (best 0.00216), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=09-val_loss=0.002.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 10: Lr: 0.0009773 | Train Loss: 0.0041392 | Vali Loss: 0.0021906


Epoch 10, global step 550: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 11: Lr: 0.0009498 | Train Loss: 0.0041213 | Vali Loss: 0.0023027


Epoch 11, global step 600: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 12: Lr: 0.0009123 | Train Loss: 0.0037975 | Vali Loss: 0.0023569


Epoch 12, global step 650: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 13: Lr: 0.0008655 | Train Loss: 0.0036736 | Vali Loss: 0.0020753


Epoch 13, global step 700: 'val_loss' reached 0.00208 (best 0.00208), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=13-val_loss=0.002.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 14: Lr: 0.0008106 | Train Loss: 0.0034452 | Vali Loss: 0.0019663


Epoch 14, global step 750: 'val_loss' reached 0.00197 (best 0.00197), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=14-val_loss=0.002.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 15: Lr: 0.0007487 | Train Loss: 0.0032917 | Vali Loss: 0.0021028


Epoch 15, global step 800: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 16: Lr: 0.0006813 | Train Loss: 0.0034260 | Vali Loss: 0.0018856


Epoch 16, global step 850: 'val_loss' reached 0.00189 (best 0.00189), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=16-val_loss=0.002.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 17: Lr: 0.0006098 | Train Loss: 0.0031765 | Vali Loss: 0.0021764


Epoch 17, global step 900: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 18: Lr: 0.0005359 | Train Loss: 0.0032157 | Vali Loss: 0.0018874


Epoch 18, global step 950: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 19: Lr: 0.0004611 | Train Loss: 0.0031837 | Vali Loss: 0.0018486


Epoch 19, global step 1000: 'val_loss' reached 0.00185 (best 0.00185), saving model to '/home/hokutani/tg/OpenSTL/STL_FrameForge/work_dirs/custom_exp/checkpoints/best-epoch=19-val_loss=0.002.ckpt' as top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 20: Lr: 0.0003873 | Train Loss: 0.0030342 | Vali Loss: 0.0019848


Epoch 20, global step 1050: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 21: Lr: 0.0003159 | Train Loss: 0.0027868 | Vali Loss: 0.0020049


Epoch 21, global step 1100: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 22: Lr: 0.0002487 | Train Loss: 0.0027107 | Vali Loss: 0.0019139


Epoch 22, global step 1150: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 23: Lr: 0.0001871 | Train Loss: 0.0026735 | Vali Loss: 0.0020633


Epoch 23, global step 1200: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 24: Lr: 0.0001325 | Train Loss: 0.0025658 | Vali Loss: 0.0020510


Epoch 24, global step 1250: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 25: Lr: 0.0000860 | Train Loss: 0.0023114 | Vali Loss: 0.0019637


Epoch 25, global step 1300: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 26: Lr: 0.0000489 | Train Loss: 0.0022424 | Vali Loss: 0.0019979


Epoch 26, global step 1350: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 27: Lr: 0.0000218 | Train Loss: 0.0021924 | Vali Loss: 0.0019379


Epoch 27, global step 1400: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 28: Lr: 0.0000054 | Train Loss: 0.0020450 | Vali Loss: 0.0020473


Epoch 28, global step 1450: 'val_loss' was not in top 1


Validation: |                                                                                     | 0/? [00:00…

Epoch 29: Lr: 0.0000000 | Train Loss: 0.0019744 | Vali Loss: 0.0020424


Epoch 29, global step 1500: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=30` reached.
