In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
torch.set_printoptions(edgeitems=2)
torch.manual_seed(123)
from PIL import Image
from torchvision import transforms
to_tensor = transforms.ToTensor()

  from .autonotebook import tqdm as notebook_tqdm


The first part of this notebook is about capturing data.
Capture hundreds of images with the hand up and with the hand down, and put them in a folder.

In [7]:
import os
from pathlib import Path
# Folder that receives the raw webcam captures. It is created up front
# (including any missing parents) and left untouched when it already exists.
landing_folder = "./camera_landing_zone/"
os.makedirs(landing_folder, exist_ok=True)

In [4]:
import cv2
import numpy as np
import os
from PIL import Image
import uuid

# Grab frames from the default webcam, save every frame as a JPEG with a
# random (uuid4) file name in ./camera_landing_zone/, and show a live preview
# until ESC is pressed or the camera stops delivering frames.
cv2.namedWindow("preview")
vc = cv2.VideoCapture(0)

try:
    if vc.isOpened(): # try to get the first frame
        rval, frame = vc.read()
    else:
        rval = False

    while rval:
        # NOTE: the OpenCV frame is handed to PIL unchanged, i.e. without a
        # BGR -> RGB conversion. The live-inference cell further down builds
        # its input the same way, so do not change one side without the other
        # (and without recomputing the normalisation mean/std).
        PIL_image = Image.fromarray(np.uint8(frame)).convert('RGB')
        image_name = "./camera_landing_zone/%s.jpg" % uuid.uuid4().hex
        PIL_image.save(image_name)
        cv2.imshow("preview", frame)
        rval, frame = vc.read()

        key = cv2.waitKey(20)
        if key == 27: # exit on ESC
            break
finally:
    # Always free the camera and close the window, even when the loop raises
    # or the kernel is interrupted; otherwise the device stays locked.
    vc.release()
    cv2.destroyWindow("preview")

In [3]:
def find_mean_and_std(folders=None):
    """Compute the per-channel mean and standard deviation of a set of images.

    Every regular file directly inside each folder is opened with PIL,
    converted to RGB, turned into a ``C x H x W`` float tensor with
    ``transforms.ToTensor()`` and stacked along a new trailing axis, giving
    ``C x H x W x N``. The statistics are then taken over everything except
    the channel axis.

    All images are held in memory at the same time, and they must all share
    the same height and width for the stacking to succeed.

    Args:
        folders: Iterable of directory paths to scan. Defaults to the
            ``hand_down`` and ``hand_up`` folders under
            ``./camera_landing_zone/``.

    Returns:
        Tuple ``(mean, std)`` of two 1-D tensors with one entry per channel
        (3 entries each), computed with ``Tensor.mean`` / ``Tensor.std``
        defaults.

    Raises:
        ValueError: If no image file is found in any of the folders.
    """
    if folders is None:
        folders = [
            "./camera_landing_zone/hand_down/",
            "./camera_landing_zone/hand_up/",
        ]
    to_tensor = transforms.ToTensor()

    lst_images = []

    for folder_name in folders:
        for file_name in os.listdir(folder_name):
            file_path = os.path.join(folder_name, file_name)
            if not os.path.isfile(file_path):
                # Sub-directories cannot be opened as images; skip them.
                continue
            # The context manager closes the file handle right after decoding.
            # convert("RGB") guarantees exactly 3 channels, which view(3, -1)
            # below relies on.
            with Image.open(file_path) as img_t:
                lst_images.append(to_tensor(img_t.convert("RGB")))

    if not lst_images:
        raise ValueError("no image files found in: %s" % ", ".join(map(str, folders)))

    tensor_all_images = torch.stack(lst_images, dim=3)
    # I have C X H X W X N.
    print(tensor_all_images.shape)

    # Flatten everything except the channel axis once and reuse it.
    per_channel = tensor_all_images.view(3, -1)
    return per_channel.mean(dim=1), per_channel.std(dim=1)

find_mean_and_std()

In [2]:
# trying to use my custom loader.
from hand_up_down_loader import HandUpDownDataSet
import random
from typing import Tuple

def create_loaders(test_ratio = 0.2, seed = None) -> Tuple[HandUpDownDataSet, HandUpDownDataSet]:
    """Split the captured images into a training and a test dataset.

    Files from ``./camera_landing_zone/hand_down/`` are labelled ``0`` and
    files from ``./camera_landing_zone/hand_up/`` are labelled ``1``. The
    combined list is shuffled and cut so that roughly ``test_ratio`` of the
    samples end up in the test dataset.

    Args:
        test_ratio: Fraction of all samples reserved for the test dataset;
            must lie in ``[0, 1]``.
        seed: Optional seed for a private ``random.Random`` instance, making
            the split reproducible. With ``None`` (default) the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of ``HandUpDownDataSet``
        objects. Despite the function name these are datasets; they still
        have to be wrapped in a ``DataLoader`` by the caller.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    folder_hand_down = "./camera_landing_zone/hand_down/"
    folder_hand_up = "./camera_landing_zone/hand_up/"

    file_paths_down = [os.path.join(folder_hand_down, file_name) for file_name in os.listdir(folder_hand_down)]
    file_paths_up = [os.path.join(folder_hand_up, file_name) for file_name in os.listdir(folder_hand_up)]

    # Label 0 = hand down, label 1 = hand up; same order as the path list.
    results = np.array(([0] * len(file_paths_down)) + ([1] * len(file_paths_up)))
    file_paths = np.array(file_paths_down + file_paths_up)

    # Shuffle indices so both classes are mixed before the train/test cut.
    permutation = list(range(len(file_paths)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(permutation)

    train_size = int((1-test_ratio) * len(permutation))

    train_indices = permutation[:train_size]
    test_indices = permutation[train_size:]

    train_file_paths = file_paths[train_indices].tolist()
    train_results = results[train_indices].tolist()

    test_file_paths = file_paths[test_indices].tolist()
    test_results = results[test_indices].tolist()

    # Both datasets use the same preprocessing: scale to [0, 1] tensors, then
    # normalise each channel with fixed mean/std constants (presumably the
    # values reported by find_mean_and_std - confirm when recapturing data).
    transform = transforms.Compose(transforms=[
        transforms.ToTensor(),
        transforms.Normalize((0.4918, 0.4837, 0.5035),
                             (0.2523, 0.2437, 0.2239))
    ])

    loader_train = HandUpDownDataSet(
        file_list=train_file_paths,
        root="./camera_landing_zone/",
        targets=train_results,
        transform=transform
    )

    loader_test = HandUpDownDataSet(
        file_list=test_file_paths,
        root="./camera_landing_zone/",
        targets=test_results,
        transform=transform
    )
    return loader_train, loader_test

loader_train, loader_test = create_loaders()

In [5]:
import torch.nn as nn

# Fully connected classifier over flattened 480 x 640 RGB frames with two
# output classes. Training uses plain SGD on the negative log-likelihood of
# the LogSoftmax outputs.

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of values in one flattened input image (height * width * channels).
n_input_features = 480 * 640 * 3

model = nn.Sequential(
    nn.Linear(n_input_features, 512, device=device),
    nn.Tanh(),
    nn.Linear(512, 256, device=device),
    nn.Tanh(),
    nn.Linear(256, 300, device=device),
    nn.Tanh(),
    nn.Linear(300, 2, device=device),
    nn.LogSoftmax(dim=1)
)

learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
loss_fn = nn.NLLLoss()
n_epoch = 25

# shuffle=False: the sample order is whatever create_loaders produced and is
# not reshuffled between epochs.
train_loader =torch.utils.data.DataLoader(loader_train, batch_size=512, shuffle=False)
test_loader = torch.utils.data.DataLoader(loader_test, batch_size=256, shuffle=False)

for epoch in range(n_epoch):
    for imgs, labels in train_loader:
        batch_inputs = imgs.to(device)
        batch_targets = labels.to(device)
        train_outs = model(batch_inputs.view(-1, n_input_features))
        loss = loss_fn(train_outs, batch_targets.long())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Every 10th epoch: report the loss of the last training batch and the
    # accuracy over the whole test set.
    if epoch % 10 == 0:
        print("Epoch %d, Loss %f" % (epoch, float(loss)))
        with torch.no_grad():
            total = 0
            correct = 0
            for imgs, labels in test_loader:
                test_outs = model(imgs.to(device).view(-1, n_input_features))
                _, predicted = torch.max(test_outs, dim=1)
                total += imgs.shape[0]
                correct += int((predicted == labels.to(device).long()).sum())

            print("Accuracy_test: %f" % (correct / total))

Epoch 0, Loss 0.482407
Accuracy_test: 0.762825
Epoch 10, Loss 0.101468
Accuracy_test: 0.985874
Epoch 20, Loss 0.062577
Accuracy_test: 0.994052


In [6]:
# Persist the whole trained module (architecture + weights) via pickle.
# The target directory is created first because torch.save does not create
# missing parent directories.
os.makedirs("models", exist_ok=True)
torch.save(model, "models/model_hand_pavle.pickle")
# The saved model is about 1.7 GB, which is huge: the first Linear layer
# alone holds 480 * 640 * 3 * 512 weights.

In [2]:
# Load the complete pickled module written by torch.save(model, ...).
# SECURITY: unpickling can execute arbitrary code - only load model files you
# created yourself or otherwise trust.
model_path = "models/model_hand_pavle.pickle"
try:
    # Recent PyTorch releases default to weights_only=True, which refuses to
    # unpickle a full nn.Module; ask for the full unpickler explicitly.
    model_new = torch.load(model_path, weights_only=False)
except TypeError:
    # Older PyTorch releases do not accept the weights_only keyword.
    model_new = torch.load(model_path)

In [3]:
# so accuracy is 100% let's see if this is for real.
import cv2
import numpy as np
import os
from PIL import Image

# Live demo: classify every webcam frame with the reloaded model and draw the
# predicted state and its probability onto the preview window until ESC is
# pressed or the camera stops delivering frames.
cv2.namedWindow("preview")
vc = cv2.VideoCapture(0)

normalizer = transforms.Normalize((0.4918, 0.4837, 0.5035),
                                 (0.2523, 0.2437, 0.2239))

# Run inference on whatever device the loaded model's parameters live on.
model_device = next(model_new.parameters()).device

# Overlay text settings; constant for every frame, so defined once.
font = cv2.FONT_HERSHEY_SIMPLEX
org = (50, 50)
fontScale = 1
color = (255, 0, 0)
thickness = 2

try:
    if vc.isOpened(): # try to get the first frame
        rval, frame = vc.read()
    else:
        rval = False

    while rval:
        # The frame is passed to PIL exactly as in the capture cell (no
        # BGR -> RGB conversion), so the model sees inputs built the same way
        # as the images it was trained on.
        PIL_image = Image.fromarray(np.uint8(frame)).convert('RGB')
        image_tensor = normalizer(to_tensor(PIL_image)).to(model_device)

        # No gradients are needed for inference.
        with torch.no_grad():
            model_out = model_new(image_tensor.view(-1).unsqueeze(0))

        # The saved model ends in LogSoftmax, so the maximum is a
        # log-probability; exponentiate it to display a real probability.
        log_prob, predicted = torch.max(model_out, dim=1)
        prob = log_prob.exp()

        txt = "state: %d, prob: %f" % (int(predicted), float(prob))
        frame = cv2.putText(frame, txt, org, font, fontScale, color, thickness, cv2.LINE_AA)
        cv2.imshow("preview", frame)

        rval, frame = vc.read()

        key = cv2.waitKey(20)
        if key == 27: # exit on ESC
            break
finally:
    # Always free the camera and close the window, even when the loop raises
    # or the kernel is interrupted.
    vc.release()
    cv2.destroyWindow("preview")

In [8]:
# Stand-alone cleanup cell: releases the capture device and closes the
# "preview" window, repeating the last two statements of the camera cell above.
# Presumably kept for runs where that cell was interrupted before reaching
# its own cleanup - TODO confirm whether it is still needed.
vc.release()
cv2.destroyWindow("preview")