In [4]:
import mediapipe as mp
from ipywebrtc import CameraStream, ImageRecorder, VideoRecorder
from imageio import v3 as iio
from IPython.display import Image, Video
import cv2
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
import tensorflow as tf

# Record a Video

In [5]:
# Webcam stream widget: user-facing camera, no audio, 640x480 video.
camera = CameraStream(
    constraints=dict(
        facing_mode="user",
        audio=False,
        video=dict(width=640, height=480),
    )
)
# Bare expression so the notebook renders the live preview widget.
camera

CameraStream(constraints={'facing_mode': 'user', 'audio': False, 'video': {'width': 640, 'height': 480}})

In [3]:
# Recorder widget attached to the camera stream; the recorded bytes are read
# later in the notebook from `recorder.video.value`.
recorder = VideoRecorder(stream=camera)
# Bare expression so the notebook renders the recorder widget.
recorder

VideoRecorder(stream=CameraStream(constraints={'facing_mode': 'user', 'audio': False, 'video': {'width': 640, …

In [14]:
# Close the camera stream widget. The recorder above was built on this stream,
# so run this only after the recording is finished.
camera.close()

In [7]:
# Get Frames
# Fail early with a readable message when nothing has been recorded yet,
# instead of letting the decoder choke on an empty payload.
if not recorder.video.value:
    raise RuntimeError(
        "No recording available: record a video with the recorder widget "
        "before running this cell."
    )

# Decode the recorded video (held in memory by the recorder widget) into a
# single array of frames.
with iio.imopen(recorder.video.value, "r", format="ffmpeg") as video_file:
    frames = video_file.read()

# Display the shape of the decoded frame array as the cell output.
frames.shape

(442, 480, 640, 3)

# Model Predictions

In [8]:
# Function to extract the hand bounding box via mediapipe
def getHandCoordinates(image):
    """Return a padded bounding box around the hand(s) detected in ``image``.

    Args:
        image: Image array whose ``shape`` starts with ``(height, width)``.
            It is passed through ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)``
            before being handed to mediapipe, so it is expected in BGR order.

    Returns:
        A tuple ``(min_x, min_y, max_x, max_y)`` of integer pixel coordinates
        enclosing the landmarks of every detected hand, clamped to the image
        bounds so it can be used directly for slicing. Returns
        ``(-1, -1, -1, -1)`` when no hand is detected or when the box does not
        overlap the image at all.
    """
    handsModule = mp.solutions.hands

    x_landmarks = []
    y_landmarks = []

    with handsModule.Hands(static_image_mode=True) as hands:
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        h, w = image.shape[:2]

        # In case no hand is detected (None, or an empty list)
        if not results.multi_hand_landmarks:
            return -1, -1, -1, -1

        # In case a hand is detected: landmarks of all hands are pooled, so
        # the resulting box covers every detected hand.
        for handLandmarks in results.multi_hand_landmarks:
            for landmark in handLandmarks.landmark:
                x_landmarks.append(landmark.x)
                y_landmarks.append(landmark.y)

    # Add 5% padding to the hand.
    # NOTE: the factor scales the absolute coordinate, so the padding grows
    # with the distance from the top-left corner of the image.
    min_x = int(min(x_landmarks) * w * 0.95)
    min_y = int(min(y_landmarks) * h * 0.95)
    max_x = int(max(x_landmarks) * w * 1.05)
    max_y = int(max(y_landmarks) * h * 1.05)

    # Landmarks are normalized but may fall outside [0, 1] when the hand is
    # partly out of frame. Clamp to the image so callers never slice with
    # negative or oversized indices, and so a real detection can never be
    # mistaken for the -1 "no hand" sentinel.
    min_x = max(0, min(w, min_x))
    max_x = max(0, min(w, max_x))
    min_y = max(0, min(h, min_y))
    max_y = max(0, min(h, max_y))

    # A box with no area inside the image cannot be cropped: report "no hand".
    if min_x >= max_x or min_y >= max_y:
        return -1, -1, -1, -1

    # returns the coordinates of the hand + padding
    return min_x, min_y, max_x, max_y

In [9]:
# Restore the saved Keras model from "./model".
model = tf.keras.models.load_model("./model")

# Rebuild the label encoder from its saved class array.
# NOTE(security): allow_pickle=True unpickles the file contents, which can
# execute arbitrary code. Only load a "labelencoder.npy" you trust.
le = preprocessing.LabelEncoder()
le.classes_ = np.load("labelencoder.npy", allow_pickle=True)

In [10]:
new_video = []

# Haar cascade used to locate faces so they can be blurred in the output.
face_detect = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
if face_detect.empty():
    # An unreadable/missing cascade file otherwise only fails later, inside
    # detectMultiScale, with a much less helpful error.
    raise OSError("Could not load cascade file 'haarcascade_frontalface_alt.xml'")


for source_frame in frames:
    # Work on a copy (costs one extra frame of memory per iteration) so the
    # decoded `frames` array is not drawn on and this cell can be re-run.
    frame = source_frame.copy()

    # getHandCoordinates swaps the channel order again before calling
    # mediapipe, so it receives the channel-swapped frame here.
    frame_conv = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    min_x, min_y, max_x, max_y = getHandCoordinates(frame_conv)

    # Classify the hand crop first, while the frame is still unmodified.
    letter = None
    if min_x != -1:
        hand_frame = frame[min_y:max_y, min_x:max_x]
        # An empty crop (box outside the image) would crash cv2.resize;
        # treat it like "no hand detected".
        if hand_frame.size > 0:
            hand_frame = cv2.resize(hand_frame, dsize=(32, 32))

            # Predict output (verbose=0: no progress bar per frame)
            hand_tensor = tf.convert_to_tensor([hand_frame])
            output = model.predict(hand_tensor, verbose=0)
            letter = le.inverse_transform([tf.argmax(output, axis=1)[0].numpy()])[0]

    # Blur faces on EVERY frame, including frames without a detected hand,
    # so no unblurred face ends up in the output video.
    face_data = face_detect.detectMultiScale(frame, 1.3, 5)
    for (fx, fy, fw, fh) in face_data:
        roi = frame[fy:fy + fh, fx:fx + fw]
        # Replace the face region with its gaussian-blurred version.
        frame[fy:fy + fh, fx:fx + fw] = cv2.GaussianBlur(roi, (23, 23), 30)

    if letter is not None:
        # Draw the hand rectangle and the last character of the predicted label.
        cv2.rectangle(frame, (min_x, min_y), (max_x, max_y), (255, 0, 0), 2)
        cv2.putText(frame, letter[-1], (50, 50), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (255, 0, 0), 4, cv2.LINE_AA)
    new_video.append(frame)


In [11]:
# Combine the processed frames into one array along a new leading axis.
frames = np.stack(new_video, axis=0)

In [15]:
# Write the frame array to "output.mov" at 15 frames per second.
# NOTE(review): 15 fps is hard-coded; confirm it matches the recording's frame
# rate, otherwise playback speed will differ from the original.
iio.imwrite("output.mov", frames, fps=15)
# Show the written file inline as the cell output.
Video("output.mov")