# Object recognition in video using Deep Learning

This notebook was created to practhice object recognition in video stream using Convolutional Neural Networks (CNN). We can do this break the the problem in some steps:

1. capture a video stream
2. make pre processing and clean up data on frame image
3. crop the region of an image
4. use the cropped image in a CNN Model
5. make predictions
6. put a text with the correct label in the video

We are using video streams, and make this predictions in the main loop is not a good idea. It can impact on the frame rates of the video.
To solve this problem, we will put the recognition part in a thread.


### Import libraries

For this project we will use numpy, Keras, tensorflow (with keras), and OpenCV.

In [1]:
import numpy as np
import cv2
import tensorflow as tf
from keras.applications.resnet50 import ResNet50
from keras.applications.vgg16 import VGG16

import image_utils
import time
import threading


Using TensorFlow backend.


### Define an object for Video capture

The `VideoStream` object is the object responsible to get the video and processing all the video frames. For each frame captured, it will preprocess the frame and keep the current frame available to predictions.

In [2]:
class VideoStream(object):

    def __init__(self):
        self.frame = None
        self.original = None
        self.current_label = None

    def start_video(self):
        cap = cv2.VideoCapture('objects.avi')

        while(cap.isOpened()):
            time.sleep(0.1)
            _, self.original = cap.read()
            self.frame = cv2.resize(self.original, (224, 224))

            # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # h,w,channels = self.original.shape
            # self.frame = cv2.getRectSubPix(self.original, patchSize=(h,h), center=(w/2,h/2))
            # self.frame = cv2.cvtColor(self.frame, cv2.COLOR_BGR2GRAY)

            cv2.putText(self.original, 
                        "Object: {}".format(self.current_label), 
                        (10, 60), 
                        cv2.FONT_HERSHEY_SIMPLEX, 
                        0.9, 
                        (0, 255, 0), 
                        2
            )
            cv2.imshow('Classification',self.original)
            
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            
        cap.release()
        cv2.destroyAllWindows()


    def current_frame(self):
        return self.frame


    def write_label(self, label):
        self.current_label = label


### Define a thread task to object recognition

As said before, the recognition taks will be perfomed by a Thread. The `RecognitionTask` will colaborate with `VideoStream` to get the current frame and apply it into the CNN model. For now, we will *Transfer Learning* with  pre trained imagenet model.

In [3]:
class RecognitionTask(threading.Thread):
    def __init__(self, video_stream):
        threading.Thread.__init__(self)
        self.video_stream = video_stream
        self.model = VGG16(weights='imagenet')
        self.graph = tf.get_default_graph()

        # self.model = ResNet50(weights='imagenet')

    def run(self):
        frame = self.video_stream.current_frame()
        while (~(frame is None)):
            predictions = self.predict(frame)
            label = predictions[1]
            self.video_stream.write_label(label)
            frame = self.video_stream.current_frame()
    
    def predict(self,frame):
        if frame is None:
            return '', ''
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.float32)
        print image.shape
        # image = image.transpose((2,0,1))
        image = image.reshape((1,) + image.shape)
        print image.shape
        with self.graph.as_default():
            image = image_utils.preprocess_input(image)
            predictions = self.model.predict(image)
            values = image_utils.decode_predictions(predictions)[0]
            print values[0]
            return values[0]


### Creates a Command object

The `Recognition` is a object that coordinate all the steps. It will be the object that starts the program.

In [4]:
class Recognition(object):

    def start(self):
        video_stream = VideoStream()
        recognition_task = RecognitionTask(video_stream)
        recognition_task.start()
        video_stream.start_video()

### Run it!

In [5]:
recognition = Recognition()
recognition.start()

(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.32429728)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.41292092)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.31177422)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.53773099)
(224, 224, 3)
(1, 224, 224, 3)
(u'n04264628', u'space_bar', 0.45629096)
(224, 224, 3)
(1, 224, 224, 3)
(u'n04264628', u'space_bar', 0.5214861)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.38137668)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.53222728)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.4133876)
(224, 224, 3)
(1, 224, 224, 3)
(u'n04264628', u'space_bar', 0.40255386)
(224, 224, 3)
(1, 224, 224, 3)
(u'n04264628', u'space_bar', 0.38903517)
(224, 224, 3)
(1, 224, 224, 3)
(u'n04264628', u'space_bar', 0.40524644)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03085013', u'computer_keyboard', 0.37815109)
(2

KeyboardInterrupt: 

(u'n03777754', u'modem', 0.1261213)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
(u'n03481172', u'hammer', 0.18077481)
(224, 224, 3)
(1, 224, 224, 3)
