# ASL Alphabet

План работы:
- Сделать dataset
- Сделать dataloader
- Сделать разные архитектуры
- Сделать оптимизатор и функцию ошибки
- Сделать функцию обучения
- Обучить нейросеть
- Перейти в многофайловый формат
- Сделать эксперименты для изучения
- Использовать Docker

In [101]:
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision as tv
import cv2
from torch.cuda.amp import autocast, GradScaler

In [None]:
# Colab-only setup cell: install the Kaggle CLI and fetch the dataset archive.
!pip install kaggle
from google.colab import files
# Ask for the token file only when it is not already in the working directory.
if 'kaggle.json' not in os.listdir():
  files.upload() # upload kaggle.json (the Kaggle API token)
# Copy the token into ~/.kaggle and restrict it to owner read/write.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the 'grassknoted/asl-alphabet' dataset and unpack the archive.
!kaggle datasets download 'grassknoted/asl-alphabet'
!unzip 'asl-alphabet.zip'

## Dataset

In [3]:
class ASLDataset(torch.utils.data.Dataset):
    """Image dataset yielding ``(image, label)`` tensor pairs.

    Class names are the sub-directories of ``train_path`` in sorted order;
    a class's label is its position in that order (stored in ``classes``).

    * Training mode (``test_path`` empty): images are read from
      ``train_path/<class>/<file>`` and labelled by the directory that
      contains them, so file names may be arbitrary.
    * Test mode (``test_path`` given): images are read from the flat
      directory ``test_path`` and labelled from the file name, which must
      start with a one-character class name or contain a longer one.

    Args:
        train_path: directory with one sub-directory per class.
        test_path: optional flat directory of test images.
        transforms: optional callable applied to the float CHW image tensor.
        imsize: side length, in pixels, of the square image returned.
    """

    def __init__(self, train_path,test_path=None, transforms=None, imsize=64):
        self.root = train_path
        self.size = imsize
        self.train = not test_path
        self.transforms = transforms
        self.image = []
        # Class index of every entry of ``self.image`` (training mode only).
        self._labels = []

        # Only directories are classes; stray files (e.g. a README) are ignored.
        self._class_names = sorted(
            entry for entry in os.listdir(train_path)
            if os.path.isdir(os.path.join(train_path, entry))
        )
        self.classes = {name: i for i, name in enumerate(self._class_names)}

        if not self.train:
            self.image = sorted(os.listdir(test_path))
            self.root = test_path
            return

        for name, index in self.classes.items():
            # Sorted so that a given index always maps to the same file.
            list_class_image = sorted(os.listdir(os.path.join(train_path, name)))
            self.image += list_class_image
            self._labels += [index] * len(list_class_image)
            print(f'{name}:{len(list_class_image)}', end=' | ')

    def __len__(self):
        return len(self.image)

    def _class_from_filename(self, name):
        """Return the class name encoded in a test-set file name.

        A one-character class name is matched against the first character;
        otherwise the first multi-character class name (in class order)
        contained in ``name`` wins.

        Raises:
            ValueError: if no class name can be recognised in ``name``.
        """
        first_char = name[:1]
        if first_char in self.classes:
            return first_char
        for class_name in self.classes:
            if len(class_name) > 1 and class_name in name:
                return class_name
        raise ValueError(f'cannot infer class from file name: {name!r}')

    def __getitem__(self, idx):
        """Load image ``idx`` as a float32 CHW tensor in [0, 1] with its label.

        Raises:
            ValueError: in test mode, if the file name encodes no known class.
            OSError: if the image file is missing or cannot be decoded.
        """
        name = self.image[idx]
        if self.train:
            class_num = self._labels[idx]
            path = os.path.join(self.root, self._class_names[class_num], name)
        else:
            class_num = self.classes[self._class_from_filename(name)]
            path = os.path.join(self.root, name)

        img = cv2.imread(path, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image: {path!r}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32)
        img = img/255.0
        img = cv2.resize(img, (self.size, self.size), interpolation=cv2.INTER_AREA)
        img = img.transpose([2, 0, 1]) # HWC -> CHW
        t_img = torch.from_numpy(img)
        if self.transforms:
            t_img = self.transforms(t_img)
        t_class_num = torch.tensor(class_num)

        return t_img, t_class_num


In [27]:
def show_image(img):
    """Display a channels-first (C, H, W) image tensor with matplotlib."""
    pixels = img.numpy()
    # imshow expects channels last, so move the channel axis to the end.
    plt.imshow(pixels.transpose(1, 2, 0))

In [17]:
# Augmentation passed to the training dataset only.
# Left disabled: RandomHorizontalFlip, RandomVerticalFlip, ToTensor and
# Normalize(mean=[0.43,0.44,0.47], std=[0.20,0.20,0.20]).
tfs = tv.transforms.Compose(
    [
        tv.transforms.ColorJitter(saturation=0.5, hue=0.5),
        tv.transforms.RandomRotation(degrees=60),
    ]
)

train_path = '/content/asl_alphabet_train/asl_alphabet_train'
test_path = '/content/asl_alphabet_test/asl_alphabet_test'

# The test dataset still receives train_path: the class list is read from it.
train_dataset = ASLDataset(train_path=train_path, transforms=tfs, imsize=128)
test_dataset = ASLDataset(train_path=train_path, test_path=test_path, imsize=128)

A:3000 | B:3000 | C:3000 | D:3000 | E:3000 | F:3000 | G:3000 | H:3000 | I:3000 | J:3000 | K:3000 | L:3000 | M:3000 | N:3000 | O:3000 | P:3000 | Q:3000 | R:3000 | S:3000 | T:3000 | U:3000 | V:3000 | W:3000 | X:3000 | Y:3000 | Z:3000 | del:3000 | nothing:3000 | space:3000 | 

## Dataloader

In [28]:
batch_size = 16

# Training batches are shuffled and a trailing partial batch is dropped.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

# Test batches keep dataset order and include the final partial batch.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
)

In [58]:
# Sanity check: print the image tensor shape and the labels of the first batch.
for images, labels in train_loader:
    print(images.shape)
    print(labels)
    break

torch.Size([16, 3, 128, 128])
tensor([ 9, 25, 28,  1, 26,  8, 12,  8, 24, 14, 28,  3,  0,  7, 12, 18])


## Architecture

In [107]:
class ResBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages around an identity skip.

    Both convolutions map ``nc`` channels to ``nc`` channels with padding 1,
    so the output has the same shape as the input.
    """

    def __init__(self, nc):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1)
        self.conv0 = nn.Conv2d(nc, nc, **conv_kwargs)
        self.norm0 = nn.BatchNorm2d(nc)
        self.act = nn.GELU()
        self.conv1 = nn.Conv2d(nc, nc, **conv_kwargs)
        self.norm1 = nn.BatchNorm2d(nc)

    def forward(self, x):
        # First stage: conv -> norm -> activation.
        hidden = self.act(self.norm0(self.conv0(x)))
        # Second stage is normalised but only activated after the skip is added.
        residual = self.norm1(self.conv1(hidden))
        return self.act(x + residual)


class BottleneckBlock(nn.Module):
    """Residual bottleneck: 1x1 reduce -> 3x3 -> 1x1 expand, plus identity skip.

    The inner width is ``nc // 4``. The expanding 1x1 convolution has no
    batch norm; its output is added to the input before the last activation.
    """

    def __init__(self, nc):
        super().__init__()
        mid = nc // 4
        self.act = nn.GELU()
        self.conv0 = nn.Conv2d(nc, mid, kernel_size=1, padding=0)
        self.norm0 = nn.BatchNorm2d(mid)
        self.conv1 = nn.Conv2d(mid, mid, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm2d(mid)
        self.conv2 = nn.Conv2d(mid, nc, kernel_size=1, padding=0)

    def forward(self, x):
        # Reduce the channel count.
        reduced = self.act(self.norm0(self.conv0(x)))
        # Spatial 3x3 convolution at the reduced width.
        mixed = self.act(self.norm1(self.conv1(reduced)))
        # Expand back to nc channels and merge with the skip connection.
        return self.act(x + self.conv2(mixed))


class ResStack(nn.Module):
    """Chain of ``num_blocks`` residual blocks of one type at a fixed width.

    Args:
        nc: channel count passed to every block.
        num_blocks: number of blocks applied one after another.
        block_type: ``'bottleneck'`` for BottleneckBlock or ``'resblock'``
            for ResBlock.

    Raises:
        ValueError: if ``block_type`` is not one of the two supported names.
            (Previously an unknown name silently produced an empty stack
            that passed its input through unchanged.)
    """

    def __init__(self, nc, num_blocks, block_type):
        super().__init__()
        if block_type == 'bottleneck':
            block_cls = BottleneckBlock
        elif block_type == 'resblock':
            block_cls = ResBlock
        else:
            raise ValueError(
                f"unknown block_type {block_type!r}; "
                "expected 'bottleneck' or 'resblock'"
            )

        self.blocks = nn.Sequential(*(block_cls(nc) for _ in range(num_blocks)))

    def forward(self, x):
        return self.blocks(x)


class Net(nn.Module):
    """Small residual CNN classifier.

    Layout: 3x3 conv stem + GELU + 2x2 max-pool, then three stages, each a
    ResStack followed by a stride-2 3x3 convolution (widths ``nc`` -> ``2*nc``
    -> ``4*nc`` -> ``4*nc``), then global average pooling and a linear layer
    producing ``out_nc`` raw scores per sample.

    Args:
        in_nc: number of input image channels.
        nc: base channel width.
        out_nc: number of output scores (classes).
        num_blocks: residual blocks per stage.
        block_type: block kind forwarded to ResStack.
    """

    def __init__(self, in_nc, nc, out_nc, num_blocks, block_type):
        super().__init__()
        double, quad = 2 * nc, 4 * nc

        # Stem.
        self.conv0 = nn.Conv2d(in_nc, nc, 3, padding=1)
        self.act = nn.GELU()
        self.maxpool = nn.MaxPool2d(2, 2)

        # Three stages: residual stack, then a stride-2 convolution.
        self.layer1 = ResStack(nc, num_blocks, block_type)
        self.conv1 = nn.Conv2d(nc, double, 3, padding=1, stride=2)
        self.layer2 = ResStack(double, num_blocks, block_type)
        self.conv2 = nn.Conv2d(double, quad, 3, padding=1, stride=2)
        self.layer3 = ResStack(quad, num_blocks, block_type)
        self.conv3 = nn.Conv2d(quad, quad, 3, padding=1, stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(quad, out_nc)

    def forward(self, x):
        features = self.maxpool(self.act(self.conv0(x)))
        stages = (
            (self.layer1, self.conv1),
            (self.layer2, self.conv2),
            (self.layer3, self.conv3),
        )
        for stack, downsample in stages:
            features = downsample(stack(features))
        pooled = self.flatten(self.avgpool(features))
        return self.linear(pooled)

## Train

In [108]:
# params = {'epochs': 3,
#           'device': 'cuda',
#           'use_amp': True}

def train_model(model, train_loader, params):
    """Train ``model`` and evaluate it on the test set after every epoch.

    Besides its arguments, the function relies on these module-level names,
    which must be defined before it is called: ``optimizer``,
    ``loss_function``, ``scaler``, ``test_loader`` and ``accuracy``.

    Args:
        model: network returning one score per class for each input image.
        train_loader: iterable of batches whose first two items are the
            images and the integer class labels.
        params: dict with the keys ``'epochs'`` (number of passes over
            ``train_loader``), ``'device'`` (where each batch is moved) and
            ``'use_amp'`` (passed to ``autocast`` for the forward pass).

    Returns:
        ``(loss_history, acc_history, test_history)``: per-epoch lists of the
        mean training loss, mean training batch accuracy and mean test batch
        accuracy.
    """
    loss_history, acc_history, test_history = [], [], []
    for epoch in range(params['epochs']):
        model.train()
        loss_val, acc_train, test_acc = 0, 0, 0
        for sample in (pbar := tqdm(train_loader)):
            img, label = sample[0], sample[1]
            img = img.to(params['device'])
            label = label.to(params['device'])
            optimizer.zero_grad()
            with autocast(params['use_amp']):
                pred = model(img)
                # One-hot width follows the model output rather than a
                # hard-coded class count.
                label = F.one_hot(label, pred.shape[1]).float()
                loss = loss_function(pred, label)

            scaler.scale(loss).backward()
            loss_item = loss.item()
            loss_val += loss_item

            scaler.step(optimizer)
            scaler.update()

            acc_current = accuracy(pred.cpu().float(), label.cpu().float())
            acc_train += acc_current

            pbar.set_description(f'epoch: {epoch}\tloss: {loss_item:.5f}\taccuracy: {acc_current:.3f}')

        model.eval()
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for sample in test_loader:
                img, label = sample[0], sample[1]
                img = img.to(params['device'])
                label = label.to(params['device'])
                pred = model(img)
                # Same width as the training targets (was wrongly fixed to 2).
                label = F.one_hot(label, pred.shape[1]).float()
                acc_current = accuracy(pred.cpu().float(), label.cpu().float())
                test_acc += acc_current

        test_history.append(test_acc/len(test_loader))
        loss_history.append(loss_val/len(train_loader))
        acc_history.append(acc_train/len(train_loader))
        print(f'loss: {loss_val/len(train_loader)}')
        print(f'train: {acc_train/len(train_loader)}')
        print(f'test: {test_acc/len(test_loader)}')
    return loss_history, acc_history, test_history

def accuracy(pred, label):
    """Return the fraction of samples whose top-scoring class matches the label.

    Args:
        pred: ``(batch, classes)`` tensor of class scores (logits or
            probabilities); it may require grad or live on any device.
        label: either a ``(batch, classes)`` one-hot / probability tensor or
            a ``(batch,)`` tensor of class indices.

    Returns:
        Mean of the per-sample hits as a NumPy float.
    """
    # Softmax never changes which score is largest, so the raw scores are
    # compared directly (the old dim-less F.softmax call also emitted a
    # deprecation warning).
    predicted = pred.detach().cpu().numpy().argmax(1)
    target = label.detach().cpu().numpy()
    if target.ndim > 1:
        target = target.argmax(1)
    return (predicted == target).mean()

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

In [109]:
# The device is pinned to the CPU. To select a GPU automatically, use:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# (is_available must be called; the bare function object is always truthy.)
# NOTE: train_model reads params['device'], so keep that entry in sync.
device = 'cpu'
print(device)

cpu


In [113]:
# Allow cuDNN to benchmark and pick convolution algorithms.
torch.backends.cudnn.benchmark = True
# torch.backends.cudnn.deterministic = True

# 3 input channels, base width 6, 29 output classes, 3 bottleneck blocks per stage.
model = Net(in_nc=3, nc=6, out_nc=29, num_blocks=3, block_type='bottleneck')
print(count_parameters(model))
model = model.to(device)

# Gradient scaler used by the training loop together with autocast.
scaler = GradScaler()

loss_function = nn.CrossEntropyLoss().to(device)

optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
)

12041


In [None]:
# Run settings read by train_model: epoch count, batch device and the autocast flag.
params = dict(epochs=3, device='cpu', use_amp=True)

loss_history, acc_history, test_history = train_model(
    model=model,
    train_loader=train_loader,
    params=params,
)