# Required imports

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import KFold
from cnn_2d_model.CNN_LSTM_model import HelplessnessClassifier as CNN_LSTM_Classifier
from cnn_3d_model.model import HelplessnessClassifier as CNN_3D_Classifier
from pre_trained_transformer_model.model import create_swin3d_t_model_training

device = 'cpu'
if torch.backends.mps.is_available():
    device = 'mps' # Apple Silicon 
if torch.cuda.is_available():
    device = 'cuda' # Nvidia GPU
print(f'Using device: {device}')

Using device: cuda


# Custom dataset for cross-validation

In [2]:
class HelplessnessVideoDataset(Dataset):
    def __init__(self, root_dir, train_transform=None, test_transform=None):
        self.root_dir = root_dir
        self.train_transform = train_transform
        self.test_transform = test_transform
        self.train = True
        self.video_folders = []

        # Read all video sequences in the extracted frames folders
        categories = ['extreme-helpless', 'little_helplessness', 'no-helpless']
        for category in categories:
            category_dir = os.path.join(root_dir, category)
            if not os.path.exists(category_dir):
                print(f"Warning: Category folder {category_dir} does not exist.")
                continue
            for video_folder in sorted(os.listdir(category_dir)):
                video_path = os.path.join(category_dir, video_folder)
                if os.path.isdir(video_path):
                    self.video_folders.append(video_path)

    def __len__(self):
        return len(self.video_folders)

    def __getitem__(self, index):
        video_path = self.video_folders[index]

        # Each video is a sequence of frames, so need to get each frame
        frame_files = sorted(os.listdir(video_path))
        sequence = []
        random_state = torch.get_rng_state()
        for frame_name in frame_files:
            frame_path = os.path.join(video_path, frame_name)
            frame = Image.open(frame_path)
            if self.train_transform and self.test_transform:
                # To allow augmentation, we need to apply the same "random" transformation to each frame
                torch.set_rng_state(random_state)
                frame = self.train_transform(frame) if self.train else self.test_transform(frame)
            sequence.append(frame)

        # Convert the sequence from list of tensors to (sequence_length, channels, height, width) tensor
        sequence = torch.stack(sequence)
        sequence = torch.transpose(sequence, 0, 1)  # REMOVE THIS IF YOU NEED THE SEQUENCE_LENGTH AND CHANNELS DIMENSIONS SWITCHED!

        # Retrieve the level of helplessness label from path of video
        split_path = video_path.split(os.sep)  # Changed '/' to os.sep for cross-platform compatibility

        # Ensure that split_path has at least 2 components
        if len(split_path) >= 2:
            category = split_path[-2]  # category folder is second last in path
        else:
            raise ValueError(f"Path structure issue: {video_path}")

        label = -1
        if category == 'no-helpless':
            label = 0
        elif category == 'little_helplessness':
            label = 1
        elif category == 'extreme-helpless':
            label = 2

        return sequence, label

# 2D CNN-LSTM 5-fold Cross Validation

In [3]:
# Create the transforms and dataset for 2D CNN-LSTM

train_transform = transforms.Compose([
    transforms.RandomResizedCrop((112, 112), scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),  # converts PIL Image in L mode to tensor shape (1, H, W) with pixel values [0,1]
    transforms.Normalize(mean=[0.5], std=[0.5])
])

test_transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

train_dataset = HelplessnessVideoDataset('../data/train', train_transform=train_transform, test_transform=test_transform)

In [6]:
# to do 5-fold cross validation, we are using the approach shown here: https://saturncloud.io/blog/how-to-use-kfold-cross-validation-with-dataloaders-in-pytorch/

# Initialize how many folds we want for cross-validation
# Using KFold from scikit-learn for this: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
kf = KFold(n_splits=5, shuffle=True)
average_f1score = 0
average_accuracy = 0

# Loop through each fold, training the model and reporting results
for fold, (train_index, test_index) in enumerate(kf.split(train_dataset)):
    print(f'Fold {fold + 1}')
    print('------')

    # Create data loaders for the current fold
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=4,
        sampler=torch.utils.data.SubsetRandomSampler(train_index),
        num_workers=4
    )

    test_loader = DataLoader(
        dataset=train_dataset,
        batch_size=4,
        sampler=torch.utils.data.SubsetRandomSampler(test_index),
        num_workers=4
    )

    # Initialize the model, loss criterion, optimizer and number of epoches
    model = CNN_LSTM_Classifier(num_classes=3).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    num_epochs = 20

    # Train the model for the given epoches on the train_loader for this fold
    for epochs in tqdm(range(num_epochs)):
        model.train()
        train_dataset.train = True # ensure train transforms on used on the data

        for i, (sequences, labels) in enumerate(train_loader):
            sequences = torch.transpose(sequences, 1, 2) # need to transpose the sequence length and channel dimensions
            sequences, labels = sequences.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(sequences)

            # Compute loss
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()

            # Optimize the model
            optimizer.step()

    # Evaluate the model on the test_loader for this fold
    model.eval()
    train_dataset.train = False # ensure test transforms on used on the data
    predictions = []
    ground_truth = []
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (sequences, labels) in enumerate(test_loader):
            sequences = torch.transpose(sequences, 1, 2) # need to transpose the sequence length and channel dimensions
            sequences, labels = sequences.to(device), labels.to(device)

            # Forward pass
            outputs = model(sequences)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Store predictions and ground truth labels in lists
            predictions.extend(predicted.detach().cpu().tolist())
            ground_truth.extend(labels.detach().cpu().tolist())
    
    # Using scikit-learn, print the F1 score of this fold: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    from sklearn.metrics import f1_score
    fold_f1score = f1_score(ground_truth, predictions, average='weighted') * 100
    fold_accuracy = 100 * correct / total
    print(f'Fold F1 Score: {fold_f1score:.2f}%, Fold Accuracy: {fold_accuracy:.2f}%\n')
    average_f1score += fold_f1score
    average_accuracy += fold_accuracy

average_f1score /= 5
average_accuracy /= 5
print(f'Average F1 Score across all folds: {average_f1score:.2f}%')
print(f'Average Accuracy across all folds: {average_accuracy:.2f}%')

Fold 1
------
[DEBUG] CNN feature_dim = 25088


100%|██████████| 20/20 [01:01<00:00,  3.09s/it]


Fold F1 Score: 45.10%, Fold Accuracy: 45.45%

Fold 2
------
[DEBUG] CNN feature_dim = 25088


100%|██████████| 20/20 [01:05<00:00,  3.28s/it]


Fold F1 Score: 31.81%, Fold Accuracy: 34.38%

Fold 3
------
[DEBUG] CNN feature_dim = 25088


100%|██████████| 20/20 [01:04<00:00,  3.25s/it]


Fold F1 Score: 31.34%, Fold Accuracy: 34.38%

Fold 4
------
[DEBUG] CNN feature_dim = 25088


100%|██████████| 20/20 [01:04<00:00,  3.24s/it]


Fold F1 Score: 44.82%, Fold Accuracy: 46.88%

Fold 5
------
[DEBUG] CNN feature_dim = 25088


100%|██████████| 20/20 [01:04<00:00,  3.23s/it]


Fold F1 Score: 45.45%, Fold Accuracy: 46.88%

Average F1 Score across all folds: 39.71%
Average Accuracy across all folds: 41.59%


# 3D CNN 5-fold Cross Validation

In [4]:
# Create the transforms and dataset for 3D CNN

train_transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.RandomRotation((-45.0, 45.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.2)),
    transforms.ToTensor(),
    transforms.Normalize([0.41500069, 0.36530493, 0.33830512], [0.29042152, 0.27499218, 0.27738131])
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.41500069, 0.36530493, 0.33830512], [0.29042152, 0.27499218, 0.27738131])
])

train_dataset = HelplessnessVideoDataset('../data/train', train_transform=train_transform, test_transform=test_transform)

In [7]:
# to do 5-fold cross validation, we are using the approach shown here: https://saturncloud.io/blog/how-to-use-kfold-cross-validation-with-dataloaders-in-pytorch/

# Initialize how many folds we want for cross-validation
# Using KFold from scikit-learn for this: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
kf = KFold(n_splits=5, shuffle=True)
average_f1score = 0
average_accuracy = 0

# Loop through each fold, training the model and reporting results
for fold, (train_index, test_index) in enumerate(kf.split(train_dataset)):
    print(f'Fold {fold + 1}')
    print('------')

    # Create data loaders for the current fold
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=2,
        sampler=torch.utils.data.SubsetRandomSampler(train_index),
        num_workers=4
    )

    test_loader = DataLoader(
        dataset=train_dataset,
        batch_size=2,
        sampler=torch.utils.data.SubsetRandomSampler(test_index),
        num_workers=4
    )

    # Initialize the model, loss criterion, optimizer and number of epoches
    model = CNN_3D_Classifier(input_channels=3).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
    num_epochs = 135

    # Train the model for the given epoches on the train_loader for this fold
    for epochs in tqdm(range(num_epochs)):
        model.train()
        train_dataset.train = True # ensure train transforms on used on the data

        for i, (sequences, labels) in enumerate(train_loader):
            sequences, labels = sequences.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(sequences)

            # Compute loss
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()

            # Optimize the model
            optimizer.step()

    # Evaluate the model on the test_loader for this fold
    model.eval()
    train_dataset.train = False # ensure test transforms on used on the data
    predictions = []
    ground_truth = []
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (sequences, labels) in enumerate(test_loader):
            sequences, labels = sequences.to(device), labels.to(device)

            # Forward pass
            outputs = model(sequences)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Store predictions and ground truth labels in lists
            predictions.extend(predicted.detach().cpu().tolist())
            ground_truth.extend(labels.detach().cpu().tolist())
    
    # Using scikit-learn, print the F1 score of this fold: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    from sklearn.metrics import f1_score
    fold_f1score = f1_score(ground_truth, predictions, average='weighted') * 100
    fold_accuracy = 100 * correct / total
    print(f'Fold F1 Score: {fold_f1score:.2f}%, Fold Accuracy: {fold_accuracy:.2f}%\n')
    average_f1score += fold_f1score
    average_accuracy += fold_accuracy

average_f1score /= 5
average_accuracy /= 5
print(f'Average F1 Score across all folds: {average_f1score:.2f}%')
print(f'Average Accuracy across all folds: {average_accuracy:.2f}%')

Fold 1
------


100%|██████████| 135/135 [35:12<00:00, 15.65s/it]


Fold F1 Score: 54.57%, Fold Accuracy: 54.55%

Fold 2
------


100%|██████████| 135/135 [35:58<00:00, 15.99s/it]


Fold F1 Score: 49.17%, Fold Accuracy: 50.00%

Fold 3
------


100%|██████████| 135/135 [35:57<00:00, 15.98s/it]


Fold F1 Score: 66.12%, Fold Accuracy: 65.62%

Fold 4
------


100%|██████████| 135/135 [35:25<00:00, 15.75s/it]


Fold F1 Score: 40.20%, Fold Accuracy: 40.62%

Fold 5
------


100%|██████████| 135/135 [33:55<00:00, 15.08s/it]


Fold F1 Score: 61.60%, Fold Accuracy: 62.50%

Average F1 Score across all folds: 54.33%
Average Accuracy across all folds: 54.66%


# Pre-trained Video Swin Transformer Cross Validation

In [3]:
# Create the transforms and dataset for 3D CNN

train_transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.RandomRotation((-45.0, 45.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.2)),
    transforms.ToTensor(),
    transforms.Normalize([0.41500069, 0.36530493, 0.33830512], [0.29042152, 0.27499218, 0.27738131])
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.41500069, 0.36530493, 0.33830512], [0.29042152, 0.27499218, 0.27738131])
])

train_dataset = HelplessnessVideoDataset('../data/train', train_transform=train_transform, test_transform=test_transform)

In [7]:
# to do 5-fold cross validation, we are using the approach shown here: https://saturncloud.io/blog/how-to-use-kfold-cross-validation-with-dataloaders-in-pytorch/

# Initialize how many folds we want for cross-validation
# Using KFold from scikit-learn for this: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
kf = KFold(n_splits=5, shuffle=True)
average_f1score = 0
average_accuracy = 0

# Loop through each fold, training the model and reporting results
for fold, (train_index, test_index) in enumerate(kf.split(train_dataset)):
    print(f'Fold {fold + 1}')
    print('------')

    # Create data loaders for the current fold
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=1,
        sampler=torch.utils.data.SubsetRandomSampler(train_index),
        num_workers=4
    )

    test_loader = DataLoader(
        dataset=train_dataset,
        batch_size=1,
        sampler=torch.utils.data.SubsetRandomSampler(test_index),
        num_workers=4
    )

    # Initialize the model, loss criterion, optimizer and number of epoches
    model = create_swin3d_t_model_training().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    num_epochs = 10

    # Train the model for the given epoches on the train_loader for this fold
    for epochs in tqdm(range(num_epochs)):
        model.train()
        train_dataset.train = True # ensure train transforms on used on the data

        for i, (sequences, labels) in enumerate(train_loader):
            sequences, labels = sequences.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(sequences)

            # Compute loss
            loss = criterion(outputs, labels)

            # Backward pass
            loss.backward()

            # Optimize the model
            optimizer.step()

    # Evaluate the model on the test_loader for this fold
    model.eval()
    train_dataset.train = False # ensure test transforms on used on the data
    predictions = []
    ground_truth = []
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (sequences, labels) in enumerate(test_loader):
            sequences, labels = sequences.to(device), labels.to(device)

            # Forward pass
            outputs = model(sequences)

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Store predictions and ground truth labels in lists
            predictions.extend(predicted.detach().cpu().tolist())
            ground_truth.extend(labels.detach().cpu().tolist())
    
    # Using scikit-learn, print the F1 score of this fold: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
    from sklearn.metrics import f1_score
    fold_f1score = f1_score(ground_truth, predictions, average='weighted') * 100
    fold_accuracy = 100 * correct / total
    print(f'Fold F1 Score: {fold_f1score:.2f}%, Fold Accuracy: {fold_accuracy:.2f}%\n')
    average_f1score += fold_f1score
    average_accuracy += fold_accuracy

average_f1score /= 5
average_accuracy /= 5
print(f'Average F1 Score across all folds: {average_f1score:.2f}%')
print(f'Average Accuracy across all folds: {average_accuracy:.2f}%')

Fold 1
------


100%|██████████| 10/10 [06:14<00:00, 37.50s/it]


Fold F1 Score: 68.79%, Fold Accuracy: 69.70%

Fold 2
------


100%|██████████| 10/10 [06:17<00:00, 37.79s/it]


Fold F1 Score: 63.12%, Fold Accuracy: 65.62%

Fold 3
------


100%|██████████| 10/10 [06:07<00:00, 36.73s/it]


Fold F1 Score: 83.93%, Fold Accuracy: 84.38%

Fold 4
------


100%|██████████| 10/10 [06:08<00:00, 36.83s/it]


Fold F1 Score: 63.59%, Fold Accuracy: 68.75%

Fold 5
------


100%|██████████| 10/10 [06:05<00:00, 36.59s/it]


Fold F1 Score: 74.84%, Fold Accuracy: 75.00%

Average F1 Score across all folds: 70.85%
Average Accuracy across all folds: 72.69%
