In [1]:
import cv2
import dlib
import numpy as np
from scipy.ndimage import zoom
from time import time

from tensorflow.keras.models import load_model
from imutils import face_utils

def process_video(input_file, output_file):
    shape_x = 48
    shape_y = 48

    def eye_aspect_ratio(eye):
        A = distance.euclidean(eye[1], eye[5])
        B = distance.euclidean(eye[2], eye[4])
        C = distance.euclidean(eye[0], eye[3])
        ear = (A + B) / (2.0 * C)
        return ear

    def detect_face(frame):
        # ... (unchanged)
        # Cascade classifier pre-trained model
        cascPath = '/Users/prithvika/Downloads/face_landmarks.dat'
        faceCascade = cv2.CascadeClassifier(cascPath)

        # BGR -> Gray conversion
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Cascade MultiScale classifier
        detected_faces = faceCascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=6,
                                                      minSize=(shape_x, shape_y),
                                                      flags=cv2.CASCADE_SCALE_IMAGE)
        coord = []

        for x, y, w, h in detected_faces:
            if w > 100:
                sub_img = frame[y:y + h, x:x + w]
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 255), 1)
                coord.append([x, y, w, h])

        return gray, detected_faces, coord

    def extract_face_features(faces, offset_coefficients=(0.075, 0.05)):
        gray = faces[0]
        detected_face = faces[1]

        new_face = []

        for det in detected_face:
            # Regions of the face are detected
            x, y, w, h = det
            # a and y correspond to the gray conversion wheras w corresponds to the height

            # Offset coefficient, np.floor takes the lowest integer (delete border of the image)
            horizontal_offset = np.int(np.floor(offset_coefficients[0] * w))
            vertical_offset = np.int(np.floor(offset_coefficients[1] * h))

            # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # image is transformed to gray
            extracted_face = gray[y + vertical_offset:y + h, x + horizontal_offset:x - horizontal_offset + w]

            # Zoom on the extracted face
            new_extracted_face = zoom(extracted_face, (shape_x / extracted_face.shape[0], shape_y / extracted_face.shape[1]))
            # cast type float
            new_extracted_face = new_extracted_face.astype(np.float32)
            # scale
            new_extracted_face /= float(new_extracted_face.max())
            # print(new_extracted_face)

            new_face.append(new_extracted_face)

        return new_face

    #using the cascade model, the facial features are recognized
    (lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
    (rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

    (nStart, nEnd) = face_utils.FACIAL_LANDMARKS_IDXS["nose"]
    (mStart, mEnd) = face_utils.FACIAL_LANDMARKS_IDXS["mouth"]
    (jStart, jEnd) = face_utils.FACIAL_LANDMARKS_IDXS["jaw"]

    (eblStart, eblEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eyebrow"]
    (ebrStart, ebrEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eyebrow"]

    # keras model is loaded, this model is for videos
    model = load_model('/Users/prithvika/Downloads/video.h5')
    face_detect = dlib.get_frontal_face_detector()
    predictor_landmarks = dlib.shape_predictor("/Users/prithvika/Downloads/face_landmarks.dat")

    # input the captured video
    video_capture = cv2.VideoCapture(input_file)
    out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), 20.0, (640, 480))

    start_time = time()
    emotion_start_time = time()

    emotion_durations = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
    max_emotion_duration = 0
    max_emotion = None

    while True:
        # Capture frame-by-frame
        ret, frame = video_capture.read()
        if not ret:
            # Break the loop if the video is finished
            break

        face_index = 0

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        rects = face_detect(gray, 1)

        for (i, rect) in enumerate(rects):
            shape = predictor_landmarks(gray, rect)
            shape = face_utils.shape_to_np(shape)

            # Identify face coordinates
            (x, y, w, h) = face_utils.rect_to_bb(rect)
            face = gray[y:y + h, x:x + w]

            # Zoom on extracted face
            face = zoom(face, (shape_x / face.shape[0], shape_y / face.shape[1]))

            # Cast type float
            face = face.astype(np.float32)

            # Scale
            face /= float(face.max())
            face = np.reshape(face.flatten(), (1, 48, 48, 1))

            # Make Prediction
            prediction = model.predict(face)
            prediction_result = np.argmax(prediction)

            # Rectangle around the face
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            cv2.putText(frame, "Face #{}".format(i + 1), (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        (0, 255, 0), 2)

            for (j, k) in shape:
                cv2.circle(frame, (j, k), 1, (0, 0, 255), -1)

            # 1. Add prediction probabilities
            cv2.putText(frame, "----------------", (40, 100 + 180 * i), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 0)
            cv2.putText(frame, "Emotional report : Face #" + str(i + 1), (40, 120 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 0)
            cv2.putText(frame, "Angry : " + str(round(prediction[0][0], 3)), (40, 140 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 0)
            cv2.putText(frame, "Disgust : " + str(round(prediction[0][1], 3)), (40, 160 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 0)
            cv2.putText(frame, "Fear : " + str(round(prediction[0][2], 3)), (40, 180 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 1)
            cv2.putText(frame, "Happy : " + str(round(prediction[0][3], 3)), (40, 200 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 1)
            cv2.putText(frame, "Sad : " + str(round(prediction[0][4], 3)), (40, 220 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 1)
            cv2.putText(frame, "Surprise : " + str(round(prediction[0][5], 3)), (40, 240 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 1)
            cv2.putText(frame, "Neutral : " + str(round(prediction[0][6], 3)), (40, 260 + 180 * i),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, 155, 1)

            # Calculate and print duration of each emotion
            current_time = time()
            emotion_duration = current_time - emotion_start_time
            print(f"Duration of Emotion #{prediction_result + 1}: {emotion_duration} seconds")

            # Update emotion start time
            emotion_start_time = current_time

            # Add emotion duration to the cumulative total
            emotion_durations[prediction_result] += emotion_duration

            # 2. Annotate main image with a label
            if prediction_result == 0:
                cv2.putText(frame, "Angry", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            elif prediction_result == 1:
                cv2.putText(frame, "Confusion", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            elif prediction_result == 2:
                cv2.putText(frame, "Fear", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            elif prediction_result == 3:
                cv2.putText(frame, "Happy", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            elif prediction_result == 4:
                cv2.putText(frame, "Sad", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            elif prediction_result == 5:
                cv2.putText(frame, "Surprise", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            else:
                cv2.putText(frame, "Neutral", (x + w - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.putText(frame, 'Number of Faces : ' + str(len(rects)), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, 155, 1)
        cv2.imshow('Video', frame)
        out.write(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # Calculate and print total duration of the video
    total_duration = time() - start_time
    print(f"Total Duration of the Video: {total_duration} seconds")

    # Print cumulative duration of each emotion
    for emotion, duration in emotion_durations.items():
#         print(f"Duration of Emotion #{emotion + 1} (seconds): {duration}")
        if (emotion == 0):
            print(f"Duration of Anger is (seconds): {duration}")
        elif (emotion == 1):
            print(f"Duration of Confusion is (seconds): {duration}")
        elif (emotion == 2):
            print(f"Duration of Fear is (seconds): {duration}")
        elif (emotion == 3):
            print(f"Duration of Happiness is (seconds): {duration}")
        elif (emotion == 4):
            print(f"Duration of Sadness is (seconds): {duration}")
        elif (emotion == 5):
            print(f"Duration of Surprise is (seconds): {duration}")
        elif (emotion == 6):
            print(f"Duration of Neutral is (seconds): {duration}")
        else:
            print("")

    # Find emotion with the maximum duration
    max_emotion = max(emotion_durations, key=emotion_durations.get)
    if (max_emotion == 0):
        print(f"The emotion that was observed most is: Anger")
    elif (max_emotion == 1):
        print(f"The emotion that was observed most is: Confusion")
    elif (max_emotion == 2):
        print(f"The emotion that was observed most is: Fear")
    elif (max_emotion == 3):
        print(f"The emotion that was observed most is: Happiness")
    elif (max_emotion == 4):
        print(f"The emotion that was observed most is: Sadness")
    elif (max_emotion == 5):
        print(f"The emotion that was observed most is: Surprise")
    elif (max_emotion == 6):
        print(f"The emotion that was observed most is: Neutral")
    else:
        print("")
#     print(f"The emotion that was observed most is: Emotion #{max_emotion + 1}")

    # When everything is done, release the capture
    video_capture.release()
    out.release()
    cv2.destroyAllWindows()

def main():
    input_video_file = '/Users/prithvika/Downloads/Emotion Videos/WhatsApp Video 2023-11-26 at 20.32.15.mp4'
    output_video_file = '/Users/prithvika/Downloads/output_video.mp4'

    process_video(input_video_file, output_video_file)

if __name__ == "__main__":
    main()


Duration of Emotion #5: 0.4086570739746094 seconds
Duration of Emotion #5: 0.2947068214416504 seconds
Duration of Emotion #5: 0.1537332534790039 seconds
Duration of Emotion #5: 0.19908785820007324 seconds
Duration of Emotion #5: 0.18146300315856934 seconds
Duration of Emotion #5: 0.14353609085083008 seconds
Duration of Emotion #5: 0.13518381118774414 seconds
Duration of Emotion #5: 0.12812328338623047 seconds
Duration of Emotion #5: 0.12760186195373535 seconds
Duration of Emotion #5: 0.12952804565429688 seconds
Duration of Emotion #5: 0.12897682189941406 seconds
Duration of Emotion #5: 0.14263296127319336 seconds
Duration of Emotion #5: 0.127946138381958 seconds
Duration of Emotion #5: 0.1295778751373291 seconds
Duration of Emotion #5: 0.12822389602661133 seconds
Duration of Emotion #5: 0.12692523002624512 seconds
Duration of Emotion #5: 0.13045597076416016 seconds
Duration of Emotion #5: 0.12795019149780273 seconds
Duration of Emotion #5: 0.13544178009033203 seconds
Duration of Emotio