# SANDBOX (ROBOTIC GRIPPER)

## Here is the place to test stuff, etc.
## It is always under construction, and the code here should not be considered reliable.

In [None]:
%load_ext autoreload
%autoreload 2

## 2 - Capturing labeled gestures images

Images will be captured from the webcam.
A folder named **capture** will have several subfolders.
The subfolders will have meaningful names, such as **left**, **right**, and so on.
The subfolder named **left** will hold images of the gesture that yields the command **turn to the left**.
This way, the subfolder names will later become the ground truth values of the datasets for the machine learning process.

For controlling the robotic gripper, we are going to use nine commands:
    1. nothing
    2. left
    3. right
    4. up
    5. down
    6. foward
    7. back
    8. grip
    9. loose

In [None]:
%pylab inline 
import cv2
from IPython.display import clear_output
import time
from datetime import datetime
import os
import numpy as np

In [None]:
def start_webcam_capture(path, number_of_captures=10):
    """
    Capture gesture images from the webcam and save them as .jpg files.

    The live image is shown in the notebook output. Roughly every four
    seconds a warning sound is played and the current frame is written to
    `path`, named after the current UTC timestamp (millisecond precision).
    The loop ends when `number_of_captures` frames were saved, when a frame
    cannot be read, when ESC is reported by cv2.waitKey, or when the kernel
    is interrupted.

    parameters:
    path - the path to save captured gesture images files
    number_of_captures - how many frames to save before stopping (default 10)
    """
    # variables to define play warning sound
    frequency = 100  # Hertz
    duration = 50    # milliseconds
    # seconds to wait between two saved frames
    capture_interval = 4
    # let's make sure the path exists (no error if it is already there)
    os.makedirs(path, exist_ok=True)
    count_captures = 0
    # using webcam 0.
    # in some systems webcam may be under different numbers, i.e, 1 or 2 or 3 ...
    vid = cv2.VideoCapture(0)
    start_time = time.time()
    try:
        while count_captures < number_of_captures:
            # Capture frame-by-frame
            ret, frame = vid.read()
            if not ret:
                # Release the Video Device if ret is false
                vid.release()
                print("Released Video Resource due to capture fail!")
                break
            # Convert the image from OpenCV BGR format to matplotlib RGB format
            # to display the image
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # check if it is time to save frame to a file
            elapsed_time = time.time() - start_time
            if elapsed_time > capture_interval:
                # make sound to indicate action (external `play` command)
                os.system('play -n synth %s sin %s' % (duration/1000, frequency))
                # [:-3] trims the microseconds down to milliseconds
                timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
                image_filename = os.path.join(path, timestamp + '.jpg')
                # NOTE(review): the RGB-converted frame is what gets written;
                # kept as-is so new captures match the already captured ones.
                cv2.imwrite(image_filename, frame)
                count_captures += 1
                # restart the timer
                start_time = time.time()
            # check for ESC
            key = np.int16(cv2.waitKey(1))
            if key == 27:
                print("Esc key interrupted!")
                break  # esc to quit
            # axis/title/imshow/show come from the `%pylab inline` namespace
            axis('off')
            title("Robotic Gripper Gestures Capture")
            imshow(frame)
            show()
            # Display the frame until new frame is available
            clear_output(wait=True)
    except KeyboardInterrupt:
        print("keyboard interrupted!")
    finally:
        # Always give the camera back, even if an unexpected error happened
        vid.release()
        print("Released Video Resource")
    # count only the files directly inside `path`
    path, _dirs, files = next(os.walk(path))
    file_count = len(files)
    print('There are now ', file_count, ' images in ', path)


Let's start by capturing the gesture for **nothing**.
When you are done, select **Kernel** on the Jupyter notebook menu and then select **Interrupt**.
As the file names are based on a complete and unique timestamp, you can run the same code again to add more gesture images if you wish. You can even visually select and remove some files (in case of a mistake) using an external file manager from your operating system.

In [None]:
# destination folder; its name ('nothing') is the label of the images saved in it
path = 'sandboxCapture/nothing'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **left**.

In [None]:
# destination folder; its name ('left') is the label of the images saved in it
path = 'sandboxCapture/left'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **right**.

In [None]:
# destination folder; its name ('right') is the label of the images saved in it
path = 'sandboxCapture/right'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **up**.

In [None]:
# destination folder; its name ('up') is the label of the images saved in it
path = 'sandboxCapture/up'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **down**.

In [None]:
# destination folder; its name ('down') is the label of the images saved in it
path = 'sandboxCapture/down'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **foward**.

In [None]:
# destination folder; its name ('foward', spelled as in load_data) is the label of the images saved in it
path = 'sandboxCapture/foward'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **back**.

In [None]:
# destination folder; its name ('back') is the label of the images saved in it
path = 'sandboxCapture/back'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **grip**.

In [None]:
# destination folder; its name ('grip') is the label of the images saved in it
path = 'sandboxCapture/grip'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

Let's capture the gesture for **loose**.

In [None]:
# destination folder; its name ('loose') is the label of the images saved in it
path = 'sandboxCapture/loose'
# start capturing gesture images (number_of_captures defaults to 10)
start_webcam_capture(path)

## 3 - Build the Model and train it using the captured gestures from the first phase

We are going to build our [deep learning](https://en.wikipedia.org/wiki/Deep_learning) robotic gripper gesture commands model using [Keras](https://keras.io/) and [TensorFlow](https://www.tensorflow.org/).

In [73]:
#imports

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.layers import Lambda, Conv2D, MaxPooling2D, Dropout, Dense, Flatten
from sklearn.model_selection import train_test_split
from sandboxUtils import INPUT_SHAPE, batch_generator
import os
import cv2
import sys

np.random.seed(0)

The function **load_images_from_path** is auxiliary to the function **load_data**.

In [74]:
def load_images_from_path(path, result, images, results):
    '''
    load_images_from_path

    Append the path of every regular file found directly inside `path` to
    `images`, and append `result` to `results` once per file, so both lists
    stay aligned. Only file paths are collected; no image is read here.

    parameters:
    path    - folder holding the images of one gesture
    result  - value stored in `results` for each file of this folder
    images  - list extended in place with the file paths
    results - list extended in place with `result`

    returns the same (images, results) lists that were passed in.
    '''
    # sorted(): os.listdir gives no ordering guarantee, and a stable order is
    # needed for a seeded train/validation split to be reproducible
    for filename in sorted(os.listdir(path)):
        img = os.path.join(path, filename)
        # skip sub-directories: only regular files can be images
        if os.path.isfile(img):
            images.append(img)
            results.append(result)
    return images, results

In [75]:
def load_data():
    """
    Gather the captured image paths with their class ids and split them.

    Every gesture has its own folder under sandboxCapture/. The position of
    a gesture name in `labels` is the class id stored for the files of that
    folder. 20% of the samples go to the validation split; the split is
    shuffled with a fixed random_state.

    returns X_train, X_valid, y_train, y_valid
    """
    labels = ('nothing', 'left', 'right', 'grip', 'loose', 'foward', 'back', 'up', 'down')

    # load a list of image paths and a corresponding list of results (images=640x480)
    images, results = [], []
    for class_id, label in enumerate(labels):
        folder = 'sandboxCapture/' + label + '/'
        images, results = load_images_from_path(folder, class_id, images, results)

    X_train, X_valid, y_train, y_valid = train_test_split(
        images, results, test_size=0.2, shuffle=True, random_state=0)

    return X_train, X_valid, y_train, y_valid

In [76]:
X_train, X_valid, y_train, y_valid = load_data()

# report the size of every split, one line per split
print('\n'.join(
    '%s %d' % (caption, len(split))
    for caption, split in (
        ("Train Images: ", X_train),
        ("Valid Images: ", X_valid),
        ("Train Results: ", y_train),
        ("Valid Results: ", y_valid),
    )
))

# To check some of the images, uncomment the lines below and change the index
# value (the index can't be bigger than the number of images - 1).
# NOTE: `labels` and `results` are local to load_data, so they would have to
# be defined in the notebook before the third line can run.
#cv2.imshow('Capture', cv2.imread(X_train[80]))
#print(X_train[80])
#print(labels[results[80]])
#cv2.waitKey(0)
#cv2.destroyAllWindows()
#sys.exit(0)


Train Images:  79
Valid Images:  20
Train Results:  79
Valid Results:  20


In [77]:
def build_model(keep_prob):
    """
    Modified NVIDIA model

    Stacks an input scaling layer, five convolutions, dropout, and four
    dense layers, prints the model summary and returns the model.

    keep_prob - forwarded unchanged as the argument of the Dropout layer.
                NOTE(review): Keras documents that argument as the fraction
                of inputs to drop, so despite its name this value may act as
                a drop rate - confirm the intent.
    """
    layers = (
        # scale inputs with x/127.5 - 1.0 (presumably 0..255 pixels -> -1..1)
        Lambda(lambda x: x/127.5-1.0, input_shape=INPUT_SHAPE),
        Conv2D(24, 5, 5, activation='elu', subsample=(2, 2)),
        Conv2D(36, 5, 5, activation='elu', subsample=(2, 2)),
        Conv2D(48, 5, 5, activation='elu', subsample=(2, 2)),
        Conv2D(64, 3, 3, activation='elu'),
        Conv2D(64, 3, 3, activation='elu'),
        Dropout(keep_prob),
        Flatten(),
        Dense(100, activation='elu'),
        Dense(50, activation='elu'),
        Dense(10, activation='elu'),
        # The original single-output Dense(1) was replaced by a softmax over
        # 9 outputs, where 9 is the number of classes
        # (https://en.wikipedia.org/wiki/Softmax_function)
        Dense(9, activation='softmax'),
    )

    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.summary()

    return model

Let's build the model.

In [78]:
# value handed to build_model, which passes it on to its Dropout layer
keep_prob = 0.5
# build the network (build_model also prints the model summary)
model = build_model(keep_prob)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_3 (Lambda)                (None, 120, 160, 3)   0           lambda_input_5[0][0]             
____________________________________________________________________________________________________
convolution2d_11 (Convolution2D) (None, 58, 78, 24)    1824        lambda_3[0][0]                   
____________________________________________________________________________________________________
convolution2d_12 (Convolution2D) (None, 27, 37, 36)    21636       convolution2d_11[0][0]           
____________________________________________________________________________________________________
convolution2d_13 (Convolution2D) (None, 12, 17, 48)    43248       convolution2d_12[0][0]           
___________________________________________________________________________________________

In [80]:
def train_model(model, psave_best_only, learning_rate, samples_per_epoch, nb_epoch, batch_size, X_train, X_valid, y_train, y_valid):
    """
    Train the model

    Compiles `model` with categorical cross-entropy and the Adam optimizer,
    then fits it on batches produced by batch_generator. After each epoch a
    ModelCheckpoint callback monitoring 'val_loss' writes
    'modelSandBox-<epoch>.h5'.

    psave_best_only   - passed to ModelCheckpoint as save_best_only
    learning_rate     - learning rate given to Adam
    samples_per_epoch - forwarded positionally to fit_generator
    nb_epoch          - forwarded positionally to fit_generator
    batch_size        - batch size given to batch_generator
    X_train, y_train  - training samples and their results
    X_valid, y_valid  - validation samples and their results
    """
    checkpoint = ModelCheckpoint(
        'modelSandBox-{epoch:03d}.h5',
        monitor='val_loss',
        verbose=0,
        save_best_only=psave_best_only,
        mode='auto',
    )

    # Other losses the author experimented with (previously left commented
    # out): mean_squared_error, binary_crossentropy and
    # sparse_categorical_crossentropy.
    optimizer = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # last argument of batch_generator: True for training, False for
    # validation - presumably an "is training" switch; confirm in sandboxUtils
    training_batches = batch_generator(X_train, y_train, batch_size, True)
    validation_batches = batch_generator(X_valid, y_valid, batch_size, False)

    model.fit_generator(training_batches,
                        samples_per_epoch,
                        nb_epoch,
                        max_q_size=1,
                        validation_data=validation_batches,
                        nb_val_samples=len(X_valid),
                        callbacks=[checkpoint],
                        verbose=1)

Let's train the model.
As a result, files named like **modelSandBox-000.h5**, **modelSandBox-003.h5**, and so on will be saved in the project folder.
Those files are trained models that can be used later to classify the gestures.
The numbers in their names are meaningful:
At the end of each epoch, a loss value is obtained. If it is the first epoch, the file numbered 000 is recorded. Otherwise, if the loss beats the lowest (less is better here!) value obtained in the former epochs, a file with the model will be saved. The number is just the epoch number minus one.
Thus, by the end of the run, the best model obtained will be the one with the highest number in its name.
On the other hand, if the value of the parameter **psave_best_only** passed to the function **train_model** is false, the model of every epoch will be saved. In this case, the files themselves give no indication of which one was the best result, so the user has to take note of the loss values printed during the run and pick the epoch with the lowest loss as the best model.

In [81]:
# passed to ModelCheckpoint as save_best_only by train_model
psave_best_only = True
# learning rate handed to the Adam optimizer
learning_rate = 1.0e-4
# reduced values for a quick sandbox run; the larger ones are kept commented out
#samples_per_epoch = 20000
samples_per_epoch = 5000
#nb_epoch = 10
nb_epoch = 1
# batch size handed to batch_generator
batch_size = 40
# compile and fit the model; checkpoints are written as modelSandBox-<epoch>.h5
train_model(model, psave_best_only, learning_rate, samples_per_epoch, nb_epoch, batch_size, 
            X_train, X_valid, y_train, y_valid)

Epoch 1/1


## 4 - Operate the robotic gripper using gestures

Import python modules

In [10]:
import cv2
from IPython.display import clear_output
import time
from datetime import datetime
import os
import numpy as np
import shutil
import serial
from keras.models import load_model
import sandboxUtils

The function **startOperation** generates commands and sends them to the Arduino board via USB.
It has a boolean parameter named **check_predictions_only**, which is false by default. If it is passed as true, the function does its work skipping all Arduino communication. This allows testing the result of a model without the need to assemble the robotic gripper part of the project.

In [None]:
def startOperation(model, check_predictions_only = False):
    """
    Show the webcam image and, about every two seconds, classify the current
    frame with `model` through predict().

    model                  - trained model handed to predict()
    check_predictions_only - when True no serial connection is opened and
                             predict() is told not to send anything; when
                             False (default) a serial connection to
                             '/dev/ttyACM0' at 9600 baud is opened first.

    The loop runs until a frame cannot be read or the kernel is interrupted.
    The video device and the serial connection (if any) are always released
    on exit.
    """
    ser = None
    if not check_predictions_only:
        # start serial
        ser = serial.Serial('/dev/ttyACM0', 9600, timeout=1)
        print('Serial connection: ', ser.name)
    try:
        # variables to define play warning sound
        frequency = 100  # Hertz
        duration = 50    # milliseconds
        # seconds between two predictions
        prediction_interval = 2
        # last predicted gesture code, shown in the title
        gc = ' '
        # using webcam 0.
        # in some systems webcam may be under different numbers, i.e, 1 or 2 or 3 ...
        vid = cv2.VideoCapture(0)
        start_time = time.time()
        try:
            while True:
                # Capture frame-by-frame
                ret, frame = vid.read()
                if not ret:
                    # Release the Video Device if ret is false
                    vid.release()
                    print("Released Video Resource due to capture fail!")
                    break
                # Convert the image from OpenCV BGR format to matplotlib RGB format
                # to display the image
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # check if it is time to classify the frame
                elapsed_time = time.time() - start_time
                if elapsed_time > prediction_interval:
                    # make sound to indicate action (external `play` command)
                    os.system('play -n synth %s sin %s' % (duration/1000, frequency))
                    # predict
                    gc = predict(model, ser, frame, check_predictions_only)
                    # restart the timer
                    start_time = time.time()
                # axis/title/imshow/show come from the `%pylab inline` namespace
                axis('off')
                title("Gripper Gesture: (" + gc + ')')
                imshow(frame)
                show()
                # Display the frame until new frame is available
                clear_output(wait=True)
        except KeyboardInterrupt:
            print("keyboard interrupted!")
        finally:
            # Always give the camera back, even on unexpected errors
            vid.release()
    finally:
        # Close the serial port so it can be opened again on the next run
        if ser is not None:
            ser.close()

In [None]:
def predict(model, ser, frame, check_predictions_only = False):
    """
    Classify one frame and optionally send the resulting command over serial.

    model                  - object whose predict(frame, batch_size=1) result
                             is reduced with np.argmax to a class index
    ser                    - serial connection with a write() method; only
                             used when check_predictions_only is False
    frame                  - image handed to sandboxUtils.preprocess
    check_predictions_only - when True nothing is written to `ser`

    Returns the one-letter command of the predicted gesture, or ' ' when the
    prediction failed. Errors are printed, not raised, so that the capture
    loop of the caller keeps running.
    """
    # command returned when no prediction could be made
    gc = ' '
    # one-letter commands, in class-index order:
    # nothing, left, right, grip, loose, foward, back, up, down
    rlabels = ['n', 'l', 'r', 'g', 'o', 'f', 'b', 'u', 'd']
    try:
        frame = sandboxUtils.preprocess(frame)  # apply the preprocessing
        frame = np.array([frame])               # the model expects 4D array
        # the output layer is a softmax over 9 classes: pick the most likely
        gesture = int(np.argmax(model.predict(frame, batch_size=1)))
        print('gesture prediction: ', gesture)
        gc = rlabels[gesture]
        print('gesture: ', gc)
        if not check_predictions_only:
            ser.write(bytes(gc, 'utf-8'))
            time.sleep(.02)
    except Exception as e:
        # best effort: report the problem and fall through to the return
        print(e)
    return gc

Let's load the model we want to use to recognize the gestures. Remember, usually a bigger number in the name of a model file indicates that its loss minimization was better than the previous ones, so you should choose the file with the highest number.

In [None]:
# load a checkpoint written by train_model (file pattern modelSandBox-<epoch>.h5)
model = load_model('modelSandBox-000.h5')

Finally it is time to run the project, and control the robotic gripper.
Due to a limitation of Jupyter notebooks (IPython), when a cell is running, we can't interact with it via the keyboard.
So, to interrupt a running cell, one should press **Kernel** on the notebook menu, and then press **Interrupt**.

In [None]:
# True = check_predictions_only: no serial connection is opened, predictions are only printed
startOperation(model, True)

# Extra

Extra code to run small parts in order to test or understand them better

In [47]:
import cv2
from IPython.display import clear_output
import time
from datetime import datetime
import os
import numpy as np
import shutil
import serial
from keras.models import load_model
import sandboxUtils
from sklearn.model_selection import train_test_split
from sandboxUtils import INPUT_SHAPE, batch_generator


In [48]:
# load a checkpoint written by train_model (file pattern modelSandBox-<epoch>.h5)
model = load_model('modelSandBox-000.h5')

In [49]:
def load_images_from_path(path, result, images, results):
    '''
    load_images_from_path

    Append the path of every regular file found directly inside `path` to
    `images`, and append `result` to `results` once per file, so both lists
    stay aligned. Only file paths are collected; no image is read here.

    parameters:
    path    - folder holding the images of one gesture
    result  - value stored in `results` for each file of this folder
    images  - list extended in place with the file paths
    results - list extended in place with `result`

    returns the same (images, results) lists that were passed in.
    '''
    # sorted(): os.listdir gives no ordering guarantee, and a stable order is
    # needed for a seeded train/validation split to be reproducible
    for filename in sorted(os.listdir(path)):
        img = os.path.join(path, filename)
        # skip sub-directories: only regular files can be images
        if os.path.isfile(img):
            images.append(img)
            results.append(result)
    return images, results

In [50]:
def load_data():
    """
    Gather the captured image paths with their class ids and split them.

    Every gesture has its own folder under sandboxCapture/. The position of
    a gesture name in `labels` is the class id stored for the files of that
    folder. 20% of the samples go to the validation split; the split is
    shuffled with a fixed random_state.

    returns X_train, X_valid, y_train, y_valid
    """
    labels = ('nothing', 'left', 'right', 'grip', 'loose', 'foward', 'back', 'up', 'down')

    # load a list of image paths and a corresponding list of results (images=640x480)
    images, results = [], []
    for class_id, label in enumerate(labels):
        folder = 'sandboxCapture/' + label + '/'
        images, results = load_images_from_path(folder, class_id, images, results)

    X_train, X_valid, y_train, y_valid = train_test_split(
        images, results, test_size=0.2, shuffle=True, random_state=0)

    return X_train, X_valid, y_train, y_valid

In [51]:
# Let's load the data; only the validation split is needed here.
# The throw-away names avoid `__` and `___`, which IPython uses for its
# output history (assigning to them stops IPython from updating it).
_X_train_unused, X_valid, _y_train_unused, y_valid = load_data()

# gesture names in class-id order (same order as used in load_data)
labels = ['nothing', 'left', 'right', 'grip', 'loose', 'foward', 'back', 'up', 'down']

print("Validation Images: ", len(X_valid))
print("Validation Results: ", len(y_valid))

Validation Images:  20
Validation Results:  20


In [72]:
# Classify one randomly chosen validation image and compare with its ground truth.
import random

# random index into the validation split (randint includes both ends)
idx = random.randint(0, len(X_valid)-1)
# X_valid holds file paths, so the image is loaded first and then preprocessed
image = sandboxUtils.load_image(X_valid[idx])
image = sandboxUtils.preprocess(image)
image = np.array([image])       # the model expects 4D array
# index of the highest value in the model output is the predicted class id
gesture = np.argmax(model.predict(image, batch_size=1))
print('ground truth:', y_valid[idx], ' : ', labels[y_valid[idx]], ' -- prediction: ', labels[gesture], '(', gesture, ')')

ground truth: 8  :  down  -- prediction:  down ( 8 )
