# Model

## Config

In [1]:
from recognizer.utils.constants import DATASET_DIR, TARGET_TO_ENCODING
from recognizer.utils.utils import get_metadata_from_filename

## Load data

In [2]:
import os

import pandas as pd 


targets = []
subjects = []
repetitions = []
files = []

for file in os.listdir(DATASET_DIR):
    if "left" in file:
        continue

    target, subject, repetition = get_metadata_from_filename(file)

    targets.append(target)
    subjects.append(subject)
    repetitions.append(repetition)
    files.append(str((DATASET_DIR / file).resolve()))


metadata = pd.DataFrame(
    data={
        "target": targets,
        "subject": subjects,
        "repetition": repetitions,
        "file": files,
    }
)

metadata["target_encoding"] = metadata["target"].map(TARGET_TO_ENCODING)

metadata.head()

Unnamed: 0,target,subject,repetition,file,target_encoding
0,64,2,1,/Users/facundopalavecino/Documents/DEV/ecd-tra...,63
1,35,10,3,/Users/facundopalavecino/Documents/DEV/ecd-tra...,34
2,33,9,3,/Users/facundopalavecino/Documents/DEV/ecd-tra...,32
3,26,2,4,/Users/facundopalavecino/Documents/DEV/ecd-tra...,25
4,23,5,2,/Users/facundopalavecino/Documents/DEV/ecd-tra...,22


### Train/Test split

In [3]:
import numpy as np

size = 1
replace = False
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

testing_set = metadata.groupby(["target", "subject"], as_index=False).apply(fn)

testing_set.index = testing_set.index.droplevel(0)

training_set = metadata.loc[~metadata.index.isin(testing_set.index), :]

In [4]:
training_set = training_set.sample(frac=0.3)
testing_set = testing_set.sample(frac=0.3)

In [5]:
def transform(video):
    # Transpose video from (T<frames>, Height, Width, Channels) to (Channels, T<frames>, Height, Width)
    return video.permute(3, 0, 1, 2).float()

In [28]:
from recognizer.dataset.sampled_video_dataset import SampledVideoDataset

training_dataset = SampledVideoDataset(
    video_filenames=training_set["file"].values,
    labels=training_set["target_encoding"].values,
    num_frames=4,
    transform=transform,
)

len(training_dataset)

TypeError: __init__() got an unexpected keyword argument 'num_frames'

In [7]:
testing_dataset = SampledVideoDataset(
    video_filenames=testing_set["file"].values,
    labels=testing_set["target_encoding"].values,
    num_frames=4,
    transform=transform,
)

len(testing_dataset)

192

### Data loaders

In [8]:
BATCH_SIZE = 64
NUM_CLASSES = 64
EPOCHS = 2

In [9]:
import torch 

train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle = False)
test_loader = torch.utils.data.DataLoader(testing_dataset, batch_size=BATCH_SIZE, shuffle = False)


## Model

### Loss function

In [10]:
from torch import nn

from recognizer.models.simple import Simple3DCNN


loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001

# device = (
#     "cuda"
#     if torch.cuda.is_available()
#     else "mps"
#     if torch.backends.mps.is_available()
#     else "cpu"
# )

device = "cpu"

print(f"Using {device} device")

model = Simple3DCNN(num_classes=NUM_CLASSES, num_frames=6).to(device=device)

Using cpu device


In [11]:
print(model)

Simple3DCNN(
  (conv_layer): Sequential(
    (0): Conv3d(3, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (5): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layer): Sequential(
    (0): Linear(in_features=65536, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=64, bias=True)
  )
)


### Optimizer

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
model.conv_layer

Sequential(
  (0): Conv3d(3, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (5): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): ReLU(inplace=True)
  (7): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

In [16]:
model.fc_layer

Sequential(
  (0): Linear(in_features=65536, out_features=512, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=512, out_features=64, bias=True)
)

## Training

In [14]:
from tqdm import tqdm

for epoch in tqdm(range(EPOCHS)):
    print("Setting model in training mode...")
    model.train()
    total_loss = 0.0
    for i, data in tqdm(enumerate(train_loader)):

        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()

        print(f"Epoch {epoch+1} | batch {i+1}: training...")

        outputs = model(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)}")

    # Evaluation
    print("Setting model in evaluation mode...")
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data in tqdm(test_loader):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}, Test Accuracy: {100 * correct / total}")

  0%|          | 0/2 [00:00<?, ?it/s]

Setting model in training mode...




Epoch 1 | batch 1: training...
torch.Size([64, 65536])




Epoch 1 | batch 2: training...
torch.Size([64, 65536])




Epoch 1 | batch 3: training...
torch.Size([64, 65536])




Epoch 1 | batch 4: training...
torch.Size([64, 65536])




Epoch 1 | batch 5: training...
torch.Size([64, 65536])




Epoch 1 | batch 6: training...
torch.Size([64, 65536])




Epoch 1 | batch 7: training...
torch.Size([64, 65536])




Epoch 1 | batch 8: training...
torch.Size([64, 65536])




Epoch 1 | batch 9: training...
torch.Size([64, 65536])




Epoch 1 | batch 10: training...
torch.Size([64, 65536])




Epoch 1 | batch 11: training...
torch.Size([64, 65536])




Epoch 1 | batch 12: training...
torch.Size([64, 65536])


12it [01:14,  6.20s/it]


Epoch 1, Training Loss: 22.311721483866375
Setting model in evaluation mode...




torch.Size([64, 65536])




torch.Size([64, 65536])


100%|██████████| 3/3 [00:07<00:00,  2.55s/it]
 50%|█████     | 1/2 [01:22<01:22, 82.07s/it]

torch.Size([64, 65536])
Epoch 1, Test Accuracy: 3.6458333333333335
Setting model in training mode...




Epoch 2 | batch 1: training...
torch.Size([64, 65536])




Epoch 2 | batch 2: training...
torch.Size([64, 65536])




Epoch 2 | batch 3: training...
torch.Size([64, 65536])




Epoch 2 | batch 4: training...
torch.Size([64, 65536])




Epoch 2 | batch 5: training...
torch.Size([64, 65536])




Epoch 2 | batch 6: training...
torch.Size([64, 65536])




Epoch 2 | batch 7: training...
torch.Size([64, 65536])




Epoch 2 | batch 8: training...
torch.Size([64, 65536])




Epoch 2 | batch 9: training...
torch.Size([64, 65536])




Epoch 2 | batch 10: training...
torch.Size([64, 65536])




Epoch 2 | batch 11: training...
torch.Size([64, 65536])




Epoch 2 | batch 12: training...
torch.Size([64, 65536])


12it [01:15,  6.32s/it]


Epoch 2, Training Loss: 4.141027530034383
Setting model in evaluation mode...




torch.Size([64, 65536])




torch.Size([64, 65536])


100%|██████████| 3/3 [00:08<00:00,  2.90s/it]
100%|██████████| 2/2 [02:46<00:00, 83.30s/it]

torch.Size([64, 65536])
Epoch 2, Test Accuracy: 2.0833333333333335



