<a href="https://colab.research.google.com/github/fortune-max/M4-final-hand-gesture-recognition/blob/main/recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the pretrained RetinaNet_ResNet50 checkpoint into ./pretrained_models/.
# `mkdir -p` is a no-op when the directory already exists; `wget -O` writes the
# download to the given path.
!mkdir -p pretrained_models
!wget https://n-ws-620xz-pd11.s3pd11.sbercloud.ru/b-ws-620xz-pd11-jux/hagrid/hagrid_models_new/RetinaNet_ResNet50.pth -O pretrained_models/RetinaNet_ResNet50.pth

In [None]:
import torch
from models.detectors.retinanet import RetinaNet_ResNet50

# Instantiate the RetinaNet detector (20 classes, 224px input) without
# downloading any pretrained backbone weights.
detector_config = dict(
    num_classes=20,
    pretrained=False,
    img_mean=[0.54, 0.499, 0.473],
    img_std=[0.231, 0.232, 0.229],
    img_size=224,
)
model = RetinaNet_ResNet50(**detector_config)
model.type = "detector"

# Read the downloaded checkpoint onto the CPU and restore the weights stored
# under its "MODEL_STATE" key.
checkpoint = torch.load(
    "pretrained_models/RetinaNet_ResNet50.pth",
    map_location=torch.device('cpu'),
)
model_state = checkpoint["MODEL_STATE"]
model.load_state_dict(model_state)

# Leave the model in training mode for the fine-tuning cells below.
model.train()

In [None]:
# Modifying the classification head
import math

import torch
import torch.nn as nn

num_classes = 39

cls_head = model.hagrid_model.head.classification_head
old_cls_logits = cls_head.cls_logits
num_anchors = cls_head.num_anchors
old_num_classes = old_cls_logits.out_channels // num_anchors
# Number of per-anchor class slots that exist in both the old and the new layer.
shared_classes = min(old_num_classes, num_classes)

new_cls_logits = nn.Conv2d(
    old_cls_logits.in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
)
new_cls_logits.to(device=old_cls_logits.weight.device, dtype=old_cls_logits.weight.dtype)

with torch.no_grad():
    # Initialise the new layer the way torchvision's RetinaNetClassificationHead
    # does (small normal weights, focal-loss prior of 0.01 on the bias) so the
    # classes without pretrained weights start from a stable point.
    # NOTE(review): assumes the wrapped head is torchvision's implementation — confirm.
    nn.init.normal_(new_cls_logits.weight, std=0.01)
    nn.init.constant_(new_cls_logits.bias, -math.log((1 - 0.01) / 0.01))

    # Copying over the weights we can from the old layer.
    # The output channels are grouped as (anchor, class), so a flat slice copy
    # would put old weights on the wrong anchor/class slots once the class count
    # changes. Copy per anchor instead, and carry the bias over as well.
    kernel_shape = old_cls_logits.weight.shape[1:]
    new_weight = new_cls_logits.weight.view(num_anchors, num_classes, *kernel_shape)
    old_weight = old_cls_logits.weight.view(num_anchors, old_num_classes, *kernel_shape)
    new_weight[:, :shared_classes] = old_weight[:, :shared_classes]

    new_bias = new_cls_logits.bias.view(num_anchors, num_classes)
    old_bias = old_cls_logits.bias.view(num_anchors, old_num_classes)
    new_bias[:, :shared_classes] = old_bias[:, :shared_classes]

cls_head.cls_logits = new_cls_logits
cls_head.num_classes = num_classes

In [None]:
import cv2
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Preprocessing pipeline: scale the longest side to 224, pad the shorter side
# up to 224 with a constant grey fill, then convert to a torch tensor.
transform = A.Compose(
    [
        A.LongestMaxSize(max_size=224, p=1),
        A.PadIfNeeded(
            min_height=224,
            min_width=224,
            value=[144, 144, 144],
            border_mode=0,  # 0 is cv2.BORDER_CONSTANT
            p=1,
        ),
        ToTensorV2(),
    ]
)

def preprocess_image(img):
    """Convert a BGR image into the tensor format fed to the model.

    The image is converted from BGR to RGB, passed through the module-level
    ``transform`` pipeline, and the resulting tensor is divided by 255.

    Args:
        img: Image array in BGR channel order (as returned by ``cv2.imread``).

    Returns:
        The transformed image tensor scaled by 1/255.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return transform(image=rgb)["image"] / 255.0

def permute_processed_image(img):
    """Move the first axis of a 3-D tensor to the end.

    Args:
        img: Tensor with three dimensions; axes are reordered as (1, 2, 0).

    Returns:
        A permuted view of ``img``.
    """
    axis_order = (1, 2, 0)
    return img.permute(*axis_order)

def get_image(processed_img):
    """Turn a preprocessed tensor back into a PIL image for display.

    Args:
        processed_img: Tensor as produced by ``preprocess_image`` (values
            scaled by 1/255; expected to be channel-first).

    Returns:
        A ``PIL.Image.Image`` built from the 8-bit pixel values.
    """
    pixels = permute_processed_image(processed_img) * 255
    # Round to the nearest level instead of truncating: after the /255 and *255
    # float round trip a value such as 254.99998 would otherwise become 254.
    # Clamping keeps the uint8 cast from wrapping around.
    pixels = (pixels + 0.5).clamp(0, 255)
    return Image.fromarray(pixels.numpy().astype("uint8"))

In [None]:
from string import ascii_uppercase, digits

# Class names in index order: letters A-Z, digits "0"-"9", then the two
# control gestures. Digits are kept as strings so they can match labels read
# from the CSV, whose label column also holds letters and is therefore text;
# integer keys would never match and those rows would be skipped silently.
# NOTE(review): this defines 38 classes while the head is built with
# num_classes = 39 — confirm whether a class is missing here.
targets = list(ascii_uppercase) + list(digits) + [
    "space",
    "del",
]
# Map each class name to its integer class index.
targets = {target: i for i, target in enumerate(targets)}

In [None]:
import cv2
import pandas as pd
from tqdm import tqdm
from sklearn.utils import shuffle

training_images = []
training_labels = []

cut_off = 500 # None for all
# Shuffle first, then truncate: taking the head before shuffling would always
# keep the same leading rows of the CSV, which is a biased sample if the file
# is ordered by label.
train_csv = shuffle(pd.read_csv("ASL/Training_set.csv"), random_state=42).head(cut_off)

for _, row in tqdm(train_csv.iterrows(), total=len(train_csv)):
    # Rows whose label is not one of the known classes are skipped.
    if row["label"] not in targets: continue
    image_path = "ASL/train/" + row["filename"]
    img = cv2.imread(image_path)
    # cv2.imread signals a missing or unreadable file by returning None rather
    # than raising; fail here with the offending path instead of inside cvtColor.
    if img is None:
        raise FileNotFoundError("Could not read image: " + image_path)
    training_images.append(preprocess_image(img))
    training_labels.append(targets[row["label"]])

In [None]:
# Sanity check: show one preprocessed training image and print its class name.
sample_index = 7
sample_image = get_image(training_images[sample_index])
display(sample_image)
class_names = list(targets)
print(class_names[training_labels[sample_index]])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the loaded samples for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(
    training_images,
    training_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Freeze all layers except the classification head
def freeze(model):
    """Turn off gradient tracking for every parameter of ``model``.

    Args:
        model: Module whose parameters (including those of nested submodules,
            as yielded by ``model.parameters()``) should be frozen.
    """
    for param in model.parameters():
        param.requires_grad = False

# A single direct call is enough: parameters() already walks nested submodules,
# whereas Module.apply(freeze) would repeat that full walk once per submodule.
freeze(model.hagrid_model)

# Re-enable gradients only for the classification head so it is the sole part
# that gets fine-tuned.
for param in model.hagrid_model.head.classification_head.parameters():
    param.requires_grad = True

In [None]:
from torch import optim, nn

# Train the model
num_epochs = 1
# Ground-truth box is the entire image. Stored as float.
# NOTE(review): assumes the wrapped detector follows torchvision's target
# format (FloatTensor[N, 4] boxes, Int64Tensor[N] labels) — confirm.
boxes = torch.tensor([[0, 0, 224, 224]], dtype=torch.float32)

# NOTE: `criterion` is not used by the loop below; the loss is taken from the
# model's own output. Kept so the name stays defined for any other cell.
criterion = nn.CrossEntropyLoss()
# Only parameters left trainable (requires_grad=True) are handed to the optimizer.
optimizer = optim.SGD(
    [parameter for parameter in model.parameters() if parameter.requires_grad],
    lr=0.01, momentum=0.9, weight_decay=0.0001)

# The evaluation at the end of this cell switches to eval mode; switch back so
# re-running the cell trains again instead of calling the model in eval mode.
model.train()
for epoch in range(num_epochs):
    # One sample per step; `image` avoids shadowing the builtin `input`.
    for image, label in zip(x_train, y_train):
        model_targets = {"boxes": boxes, "labels": torch.tensor([label])}
        optimizer.zero_grad()
        loss = model([image], [model_targets])[0]
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for image, label in zip(x_val, y_val):
        output = model([image])[0]
        total += 1
        # The detector may return no detections for an image; count that as a
        # miss instead of failing with an IndexError on the empty result.
        if len(output["labels"]) == 0:
            continue
        # The first detection is used as the prediction.
        # NOTE(review): presumably detections are sorted by score — confirm.
        predicted_target = output["labels"][0].item()
        if predicted_target == label: correct += 1

if total:
    print("Accuracy: {}%".format(100 * correct / total))
else:
    # Avoid a ZeroDivisionError when the validation split is empty.
    print("No validation samples to evaluate.")

In [None]:
# Leftover post-mortem debugger call, disabled so the notebook can be run top
# to bottom without dropping into an interactive prompt. Uncomment to inspect
# the most recent exception.
# %debug

In [None]:
# Development helper (safe to ignore): re-import the detector module after
# editing its source and rebind the class name to the refreshed definition.
import importlib
import models.detectors.retinanet

reloaded_retinanet = importlib.reload(models.detectors.retinanet)
RetinaNet_ResNet50 = reloaded_retinanet.RetinaNet_ResNet50