Using a live camera feed as input? #6

Open
Zeit42 opened this issue Aug 27, 2019 · 0 comments

Zeit42 commented Aug 27, 2019

Just for a little background: I trained a custom object detector using your train.py code. After that I tested it with inference.py, filling out the necessary terminal flags to make sure my model was used.

After that, I edited your inference.py so that instead of going through the photos in a folder, it uses the input from the Raspberry Pi camera that I have. I know the Raspberry Pi camera works because I can open it via OpenCV. I made a lot of code alterations, but essentially I added a camera flag that, when set to True, uses the camera feed instead.
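For reference, the standalone check I used to confirm the camera opens is roughly this (a minimal sketch; the GStreamer pipeline string is the same one I pass to cv2.VideoCapture in the script below, and OpenCV has to be built with GStreamer support):

```python
import cv2

# Same nvarguscamerasrc pipeline string used in the inference script below.
GST_PIPELINE = (
    "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720, "
    "format=(string)NV12, framerate=(fraction)60/1 ! nvvidconv flip-method=2 ! "
    "video/x-raw, format=(string)BGRx, width=(int)960, height=(int)616 ! "
    "videoconvert ! appsink"
)

cap = cv2.VideoCapture(GST_PIPELINE)
if not cap.isOpened():
    raise RuntimeError("camera open failed")

while True:
    ret, frame = cap.read()
    if not ret:
        break
    cv2.imshow("camera check", frame)
    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
```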

Here is the code that I have:

```python
import numpy as np
import time
import plac
import os
import cv2

import gi

from model import MobileDetectNetModel

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

gi.require_version('Gst', '1.0')

@plac.annotations(
    inference_type=("Type of inference to test (TF, FP32, FP16, INT8)", 'option', 'T', str),
    batch_size=("Size of the TensorRT batch", 'option', 'B', int),
    weights=("Model weights", 'option', 'W', str),
    test_path=("Test images path", 'option', 'I', str),
    merge=("Test images only: Merge detected regions", 'flag', 'm', bool),
    stage=("Test images only: Augmentation training stage", 'option', 's', str),
    limit=("Test images only: Max number of images to run inference on", 'option', 'l', int),
    confidence=("Test images only: Minimum confidence in coverage to draw bbox", "option", "c", float),
    visualize=("Visualize the inference", "option", "V", bool),
    camera=("Use camera feed. Ignores test_path. Boolean.", "option", "C", bool)
)



# Set inference_type to FP16 to use TensorRT
def main(inference_type: str = "FP16",
         batch_size: int = 1,
         test_path: str = None,
         weights: str = None,
         merge: bool = False,
         stage: str = "test",
         limit: int = 20,
         confidence: float = 0.1,
         visualize: bool = True,
         camera: bool = False):

    keras_model = MobileDetectNetModel.complete_model()

    if weights is not None:
        keras_model.load_weights(weights, by_name=True)

    images_done = 0

    if test_path is not None:
        # import cv2

        if stage != 'test':
            from generator import MobileDetectNetSequence
            seq = MobileDetectNetSequence.create_augmenter(stage)
        else:
            seq = None

        images_full = []
        images_input = []
        images_scale = []

        for r, d, f in os.walk(test_path):
            for file in f:
                image_full = cv2.imread(os.path.join(r, file))
                image_input = cv2.resize(image_full, (224, 224))

                scale_width = image_full.shape[1] / 224
                scale_height = image_full.shape[0] / 224
                images_scale.append((scale_width, scale_height))

                if stage != 'test':
                    seq_det = seq.to_deterministic()
                    image_aug = (seq_det.augment_image(image_input).astype(np.float32) / 127.5) - 1.
                else:
                    image_aug = image_input.astype(np.float32) / 127.5 - 1.

                images_full.append(image_full)
                images_input.append(image_aug)

                images_done += 1

                if images_done == limit:
                    break

            if images_done == limit:
                break

        x_test = np.array(images_input)
    else:
        #x_test = np.random.random((limit, 224, 224, 3))
        x_test = np.random.random((224, 224, 3))
    
        
    # x_test = np.random.random((224, 224, 3))

    x_cold = np.random.random((batch_size, 224, 224, 3))

    print(f'Inference Type is {inference_type}')

    if inference_type == 'K':
        keras_model.predict(x_cold)
        t0 = time.time()
        model_outputs = keras_model.predict(x_test)
        t1 = time.time()
    elif inference_type == 'TF':
        tf_engine = keras_model.tf_engine()
        tf_engine.infer(x_cold)
        t0 = time.time()
        model_outputs = tf_engine.infer(x_test)
        t1 = time.time()
    elif inference_type == 'FP32':
        tftrt_engine = keras_model.tftrt_engine(precision='FP32', batch_size=batch_size)
        tftrt_engine.infer(x_cold)
        t0 = time.time()
        model_outputs = tftrt_engine.infer(x_test)
        t1 = time.time()
    	
    # WE ARE USING THIS INFERENCE TYPE, TFTRT
    elif inference_type == 'FP16':
        tftrt_engine = keras_model.tftrt_engine(precision='FP16', batch_size=batch_size)
        tftrt_engine.infer(x_cold)
        #t0 = time.time()
        #model_outputs = tftrt_engine.infer(x_test)
        #t1 = time.time()
        
    elif inference_type == 'INT8':
        tftrt_engine = keras_model.tftrt_engine(precision='INT8', batch_size=batch_size)
        tftrt_engine.infer(x_cold)
        t0 = time.time()
        model_outputs = tftrt_engine.infer(x_test)
        t1 = time.time()
    else:
        raise ValueError("Invalid inference type")

    #print('Time: ', t1 - t0)
    #print('FPS: ', x_test.shape[0]/(t1 - t0))

    if not visualize:
        return

#    if len(model_outputs) == 2:
#        classes, bboxes = model_outputs

    # TF / TensorRT models won't output regions (not useful for production)
    #elif len(model_outputs) == 3:
    #    regions, bboxes, classes = model_outputs
    #else:
    #    raise ValueError("Invalid model length output")


    if test_path is not None and camera is False:
        import matplotlib.pyplot as plt
        from matplotlib.colors import LinearSegmentedColormap

        # get colormap
        ncolors = 256
        color_array = plt.get_cmap('viridis')(range(ncolors))

        # change alpha values
        color_array[:, -1] = np.linspace(0.0, 1.0, ncolors)

        # create a colormap object
        map_object = LinearSegmentedColormap.from_list(name='viridis_alpha', colors=color_array)

        # register this new colormap with matplotlib
        plt.register_cmap(cmap=map_object)

        for idx in range(0, len(images_full)):

            rectangles = []

            # Does this only get the first 7 items? 
            for y in range(0, 7):
                for x in range(0, 7):

                    if classes[idx, y, x, 0] >= confidence:
                        rect = [
                            int(bboxes[idx, int(y), int(x), 0] * 224),
                            int(bboxes[idx, int(y), int(x), 1] * 224),
                            int(bboxes[idx, int(y), int(x), 2] * 224),
                            int(bboxes[idx, int(y), int(x), 3] * 224)]
                        rectangles.append(rect)

            if merge:
                rectangles, merges = cv2.groupRectangles(rectangles, 1, eps=0.75)

            scale_width, scale_height = images_scale[idx]

            for rect in rectangles:
                cv2.rectangle(images_full[idx],
                              (int(rect[0]*scale_width), int(rect[1]*scale_height)),
                              (int(rect[2]*scale_width), int(rect[3]*scale_height)),
                              (0, 255, 0), 5)

            plt.imshow(cv2.cvtColor(images_full[idx], cv2.COLOR_BGR2RGB), alpha=1.0, aspect='auto')
            plt.imshow(
                cv2.resize(classes[idx].reshape((7, 7)),
                           (images_full[idx].shape[1], images_full[idx].shape[0])),
                interpolation='nearest', alpha=0.5, cmap='viridis_alpha', aspect='auto')
            plt.show()


    font = cv2.FONT_HERSHEY_SIMPLEX
    bottomLeftCornerOfText = (10, 500)
    fontScale = 1
    fontColor = (255, 255, 255)
    lineType = 2

    if camera is True:
        print('camera flag detected!')
        
        #cap = cv2.VideoCapture("nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720, format=(string)NV12, framerate=(fraction)21/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx, width=(int)960, height=(int)616 ! videoconvert ! video/x-raw, format=(string)BGR ! appsink")

        cap = cv2.VideoCapture("nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)1280, height=(int)720, format=(string)NV12, framerate=(fraction)60/1 ! nvvidconv flip-method=2 ! video/x-raw, format=(string)BGRx, width=(int)960, height=(int)616 ! videoconvert ! appsink")
        
        if cap.isOpened():
            cv2.namedWindow("demo")
            while True:
                ret_val, image_np = cap.read()
                image_raw = image_np
                
                #print(f'*** original shape is {image_np.shape}')
                
                # Expand the dimensions
                image_np_expanded = np.expand_dims(image_np, axis=0)

                #print(f'*** image expanded shape is {image_np_expanded.shape}')


                images_full = []
                images_input = []
                images_scale = []
                
                dim = (224, 224)
                
                image_input = cv2.resize(image_raw, (224, 224))
                #image_input = image_np_expanded

                #print(f'image_raw shape is = {image_input.shape}')
                image_full = np.expand_dims(image_input, axis=0)

                #print(f'image_full shape after expanding is = {image_full.shape}')
                                
                #scale_width = image_full.shape[1] / 224
                #scale_height = image_full.shape[0] / 224
                #images_scale.append((scale_width, scale_height))

                if stage != 'test':
                    seq_det = seq.to_deterministic()
                    image_aug = (seq_det.augment_image(image_input).astype(np.float32) / 127.5) - 1.
                else:
                    image_aug = image_input.astype(np.float32) / 127.5 - 1.

                #images_full.append(image_full)
                #images_full.append(image_aug)
                
                t0 = time.time()
                #print(f'shape of image full before sending to ')
                model_outputs = tftrt_engine.infer(image_full)

                t1 = time.time()

                rectangles = []

                #print(f'length of model_outputs is = {len(model_outputs)}')

                if len(model_outputs) == 2:
                    classes, bboxes = model_outputs

                # TF / TensorRT models won't output regions (not useful for production)
                elif len(model_outputs) == 3:
                    regions, bboxes, classes = model_outputs
                else:
                    raise ValueError("Invalid model length output")


                framerate = 1.0/(t1 - t0)

                #print('Time: ', t1 - t0)
                #print('FPS: ', framerate)

                print()


                for y in range(0, 7):
                    for x in range(0, 7):
                        #print(f'confidence is = {classes[0, y, x, 0]}')
                        if classes[0, y, x, 0] >= confidence:
                            #print('confidence is enough!')
                            rect = [
                                int(bboxes[0, int(y), int(x), 0] * 224),
                                int(bboxes[0, int(y), int(x), 1] * 224),
                                int(bboxes[0, int(y), int(x), 2] * 224),
                                int(bboxes[0, int(y), int(x), 3] * 224)]
                            
                            print(f'rectangle is = {rect}')
                            
                            rectangles.append(rect)
                        
                        #else:
                        #    print('confidence not high enough')

                rectangles, merges = cv2.groupRectangles(rectangles, 1, eps=0.75)

                #scale_width, scale_height = images_scale[idx]

                if len(rectangles) > 0:
                    print(f'rectangle count is = {len(rectangles)}')
                    
                for rect in rectangles:
                    cv2.rectangle(image_raw,
                          (int(rect[0]), int(rect[1])),
                          (int(rect[2]), int(rect[3])),
                          (0, 255, 0), 5)
                
                cv2.putText(image_raw, "FPS: {0:.2f}".format(framerate), bottomLeftCornerOfText, font, fontScale, fontColor, lineType)
                
                cv2.imshow("demo", image_raw)
		
                if cv2.waitKey(1) == ord('q'):
                    break
        else:
            print('camera open failed')

        cv2.destroyAllWindows()



if __name__ == '__main__':
    plac.call(main)

```

Basically, I grab the captured frame and run it through inference. When I run the script (again, with my own model), the camera feed opens just fine, BUT when I point the camera at a photo of the object I trained on (the same photos from the folder I test with), it no longer detects my object.

Basically, I'm trying to use my model and your base code in inference.py to run object detection on the camera feed, but I haven't had any luck.
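For clarity, this is the per-frame flow I'm trying to get with the camera flag (a condensed sketch; I'm assuming each camera frame should get the same 224x224 resize and /127.5 - 1 normalization that the folder images get, and `engine` here stands for the TF-TRT engine built earlier in the script):

```python
import numpy as np
import cv2

def infer_frame(engine, frame, confidence=0.1):
    """Run one camera frame through the detector and return bounding boxes."""
    # Resize to the 224x224 network input and normalize to [-1, 1],
    # the same preprocessing the folder images receive.
    image_input = cv2.resize(frame, (224, 224))
    image_norm = image_input.astype(np.float32) / 127.5 - 1.
    batch = np.expand_dims(image_norm, axis=0)

    outputs = engine.infer(batch)
    if len(outputs) == 2:
        classes, bboxes = outputs
    elif len(outputs) == 3:
        regions, bboxes, classes = outputs
    else:
        raise ValueError("Invalid model output length")

    # Collect boxes from the 7x7 coverage grid, scaled back to input pixels.
    rectangles = []
    for y in range(7):
        for x in range(7):
            if classes[0, y, x, 0] >= confidence:
                rectangles.append([int(bboxes[0, y, x, i] * 224) for i in range(4)])
    return rectangles
```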
