# Model

## Config

In [1]:
from recognizer.utils.constants import DATASET_DIR, TARGET_TO_ENCODING
from recognizer.utils.utils import get_metadata_from_filename

## Load data

In [2]:
import os

import pandas as pd 


targets = []
subjects = []
repetitions = []
files = []

for file in os.listdir(DATASET_DIR):
    if "left" in file:
        continue

    target, subject, repetition = get_metadata_from_filename(file)

    targets.append(target)
    subjects.append(subject)
    repetitions.append(repetition)
    files.append(str((DATASET_DIR / file).resolve()))


metadata = pd.DataFrame(
    data={
        "target": targets,
        "subject": subjects,
        "repetition": repetitions,
        "file": files,
    }
)

metadata["target_encoding"] = metadata["target"].map(TARGET_TO_ENCODING)

metadata.head()

Unnamed: 0,target,subject,repetition,file,target_encoding
0,64,2,1,/Users/facundopalavecino/Documents/DEV/ecd-tra...,63
1,35,10,3,/Users/facundopalavecino/Documents/DEV/ecd-tra...,34
2,33,9,3,/Users/facundopalavecino/Documents/DEV/ecd-tra...,32
3,26,2,4,/Users/facundopalavecino/Documents/DEV/ecd-tra...,25
4,23,5,2,/Users/facundopalavecino/Documents/DEV/ecd-tra...,22


### Train/Test split

In [3]:
import numpy as np

size = 1
replace = False
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

testing_set = metadata.groupby(["target", "subject"], as_index=False).apply(fn)

testing_set.index = testing_set.index.droplevel(0)

training_set = metadata.loc[~metadata.index.isin(testing_set.index), :]

In [7]:
training_set.loc[:5, "target_encoding"].values, training_set.loc[:5, "target"].values

(array([63, 34, 32, 25, 22, 17]),
 array(['064', '035', '033', '026', '023', '018'], dtype=object))

In [54]:
def transform(video):
    # Transpose video from (T, H, W, C) to (C, T, H, W)
    return video.permute(3, 0, 1, 2).float()

In [55]:
from recognizer.dataset.sampled_video_dataset import SampledVideoDataset

training_dataset = SampledVideoDataset(
    video_filenames=training_set["file"].values,
    labels=training_set["target_encoding"].values,
    num_frames=25,
    transform=transform,
)

len(training_dataset)

2560

In [56]:
testing_dataset = SampledVideoDataset(
    video_filenames=testing_set["file"].values,
    labels=testing_set["target_encoding"].values,
    num_frames=25,
    transform=transform,
)

len(testing_dataset)

640

### Data loaders

In [57]:
BATCH_SIZE = 64
NUM_CLASSES = 64
EPOCHS = 10

In [58]:
import torch 

train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle = False)
test_loader = torch.utils.data.DataLoader(testing_dataset, batch_size=BATCH_SIZE, shuffle = False)


## Model

### Loss function

In [59]:
from torch import nn

loss_function = nn.CrossEntropyLoss()

### Learning rate

In [60]:
learning_rate = 0.001

### Training device

In [61]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


### NN

In [66]:
from recognizer.models.simple import Simple3DCNN

model = Simple3DCNN(num_classes=NUM_CLASSES).to(device="cpu")

In [67]:
print(model)

Simple3DCNN(
  (conv_layer): Sequential(
    (0): Conv3d(3, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (1): BatchNorm3d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv3d(32, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    (5): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layer): Sequential(
    (0): Linear(in_features=98304, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=64, bias=True)
  )
)


### Optimizer

In [68]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [69]:
for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)}")

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}, Test Accuracy: {100 * correct / total}")

RuntimeError: Input type (MPSFloatType) and weight type (torch.FloatTensor) should be the same