# Course-end Project 1 - EdTech
## by Ernie Aparicio ea@launchoc.com

### Video Segmentation 
#### The first task is to divide the video into keyframes using Uniform Time Sampling.

In [4]:
import cv2
import os

def process_video_for_keyframes(video_path, sampling_rate):
    cap = cv2.VideoCapture(video_path)

    # Check if video opened successfully
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        return

    # Get frame rate
    frame_rate = cap.get(cv2.CAP_PROP_FPS)

    # Calculate sampling interval in frames
    sampling_interval = int(frame_rate * sampling_rate)  # sampling_rate in seconds

    frame_count = 0
    keyframe_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % sampling_interval == 0:
            # Folder for keyframes
            base_folder = os.path.splitext(os.path.basename(video_path))[0]
            keyframe_folder = f'keyframes/{base_folder}'
            os.makedirs(keyframe_folder, exist_ok=True)

            # Save keyframe
            keyframe_file = f'{keyframe_folder}/keyframe_{keyframe_count}.jpg'
            cv2.imwrite(keyframe_file, frame)
            keyframe_count += 1

        frame_count += 1

    cap.release()

# List of video paths
video_paths = ['nptel_ai/howToLearn.mp4', 'nptel_ml/ML.mp4']
sampling_rate = 1  # One keyframe every second

# Process each video for keyframes
for video_path in video_paths:
    process_video_for_keyframes(video_path, sampling_rate)

print("Done processing videos.")


Done processing videos.


### Assessment of Instructor Presence and Interaction:
#### We'll use a pre-trained model https://tfhub.dev/tensorflow/ssd_mobilenet_v2 for object detection and check instructor presense.  This model classifies an object as a person.

In [11]:
import os
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Load the pre-trained SSD MobileNet model from TensorFlow Hub
model_handle = 'https://tfhub.dev/tensorflow/ssd_mobilenet_v2/fpnlite_640x640/1'
model = hub.load(model_handle).signatures['serving_default']

def detect_instructor(frame, presence_threshold=0.5):
    # Convert the frame to uint8 and process for model input
    frame = tf.image.convert_image_dtype(frame, tf.uint8)
    input_tensor = tf.expand_dims(frame, 0)

    # Model inference
    result = model(input_tensor)

    # Parse the results
    result = {key:value.numpy() for key,value in result.items()}
    detection_scores = result["detection_scores"]
    detection_classes = result["detection_classes"]
    detection_boxes = result["detection_boxes"]

    # Check for instructor presence (class 1 is 'person')
    for score, clss, box in zip(detection_scores[0], detection_classes[0], detection_boxes[0]):
        if score >= presence_threshold and clss == 1:
            return True, box
    return False, None

def is_full_screen(bbox, frame_shape):
    frame_height, frame_width, _ = frame_shape
    bbox_width = bbox[3] - bbox[1]  # Calculate width of bounding box
    bbox_height = bbox[2] - bbox[0]  # Calculate height of bounding box

    # Example thresholds: 50% of frame width and 50% of frame height for full-screen
    min_full_screen_width = frame_width * 0.5
    min_full_screen_height = frame_height * 0.5

    if bbox_width >= min_full_screen_width and bbox_height >= min_full_screen_height:
        return True
    return False

def process_video_segments(segment_folder):
    instructor_presence_count = 0
    total_frames = 0
    full_screen_count = 0
    pip_count = 0

    for segment_file in os.listdir(segment_folder):
        segment_path = os.path.join(segment_folder, segment_file)
        frame = cv2.imread(segment_path)
        if frame is not None:
            total_frames += 1
            present, bbox = detect_instructor(frame)
            if present:
                instructor_presence_count += 1
                if is_full_screen(bbox, frame.shape):
                    full_screen_count += 1
                else:
                    pip_count += 1

    return instructor_presence_count, total_frames, full_screen_count, pip_count

def analyze_video(video_folder):
    instructor_presence, total_frames, full_screen, pip = process_video_segments(video_folder)

    if total_frames > 0:
        fraction_visible = instructor_presence / total_frames
        fraction_full_screen = full_screen / total_frames
        fraction_pip = pip / total_frames
        print(f"Instructor Visibility Fraction in {video_folder}: {fraction_visible}")
        print(f"Fraction of Full-Screen Presence: {fraction_full_screen}")
        print(f"Fraction of PIP Presence: {fraction_pip}")
    else:
        print(f"No frames to analyze in {video_folder}")

# Analyze both videos
analyze_video('keyframes/ML')
analyze_video('keyframes/howToLearn')


Instructor Visibility Fraction in keyframes/ML: 0.9463414634146341
Fraction of Full-Screen Presence: 0.0
Fraction of PIP Presence: 0.9463414634146341
Instructor Visibility Fraction in keyframes/howToLearn: 0.1963882618510158
Fraction of Full-Screen Presence: 0.0
Fraction of PIP Presence: 0.1963882618510158


#### We can see from the results above our model did a good job scoring instructor visibility, full-screen presence, and partial for each video.  In the ML video, majority of keyframes are the instructor and in the How To Learn video, majority of screen is the powerpoint presentation.

#### Next, lets analyze interaction of instructor by using a facial expression model to classify between emotions.  For this I found a pre-trained emotion recognition model from https://github.com/joh-fischer/emotion-recognition-CNN

In [9]:
import os
import cv2
import numpy as np
import tensorflow as tf


# Facial Emotion Recognition Model Setup
from emotion_recognition_model import get_base_model
from emotion_recognition_utils import preprocess_fer, get_labels_fer

IMG_SHAPE = (100, 100, 3)
model = get_base_model(IMG_SHAPE)
model.add(tf.keras.layers.Dense(7, activation='softmax', name="softmax"))

model_name = 'FERplus_0124-1040_weights.h5'
model.load_weights('./models/' + model_name)


def analyze_facial_expression(frame):
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    x = cv2.resize(img, dsize=IMG_SHAPE[:-1])
    x = np.expand_dims(x, axis=0)
    x = preprocess_fer(x)

    output = model.predict(x)
    label = get_labels_fer(output)[0]
    confidence = np.argmax(output[0])

    return label, confidence

def process_keyframes(keyframe_folder):
    expression_counts = {emotion: 0 for emotion in ['neutral', 'happiness', 'surprise', 'sadness', 'anger', 'disgust', 'fear', 'contempt']}
    total_frames = 0

    for keyframe_file in os.listdir(keyframe_folder):
        keyframe_path = os.path.join(keyframe_folder, keyframe_file)
        keyframe = cv2.imread(keyframe_path)
        
        if keyframe is not None:
            total_frames += 1
            label, _ = analyze_facial_expression(keyframe)
            expression_counts[label] += 1

    # Calculate expression percentages
    if total_frames > 0:
        for expression in expression_counts:
            expression_counts[expression] = expression_counts[expression] / total_frames

    return expression_counts

def analyze_videos(video_folders):
    for video_folder in video_folders:
        print(f"Analyzing keyframes in {video_folder}")
        expression_percentages = process_keyframes(video_folder)
        print(f"Facial Expression Percentages for {video_folder}:")
        for expression, percentage in expression_percentages.items():
            print(f"{expression.capitalize()}: {percentage * 100:.2f}%")
        print(f"Finished analyzing {video_folder}")

# List of video keyframe folders to analyze
video_keyframe_folders = ['keyframes/ML', 'keyframes/howToLearn']

# Analyze the videos
analyze_videos(video_keyframe_folders)


Analyzing keyframes in keyframes/ML
Facial Expression Percentages for keyframes/ML:
Neutral: 4.39%
Happiness: 69.27%
Surprise: 0.00%
Sadness: 0.00%
Anger: 26.34%
Disgust: 0.00%
Fear: 0.00%
Contempt: 0.00%
Finished analyzing keyframes/ML
Analyzing keyframes in keyframes/howToLearn




Facial Expression Percentages for keyframes/howToLearn:
Neutral: 80.36%
Happiness: 17.61%
Surprise: 0.00%
Sadness: 0.00%
Anger: 2.03%
Disgust: 0.00%
Fear: 0.00%
Contempt: 0.00%
Finished analyzing keyframes/howToLearn


#### Below is a recap of emotional expression for each video. 

#### Facial Expression Percentages for keyframes/ML: Neutral: 4.39% Happiness: 69.27% Surprise: 0.00% Sadness: 0.00% Anger: 26.34% Disgust: 0.00% Fear: 0.00% Contempt: 0.00%

#### Facial Expression Percentages for keyframes/howToLearn: Neutral: 80.36% Happiness: 17.61% Surprise: 0.00% Sadness: 0.00% Anger: 2.03% Disgust: 0.00% Fear: 0.00% Contempt: 0.00%

#### This model reveals that the ML video has an overwhelming happiness score where the howToLearn is more neutral.  This suggests the ML video may provide a more pleasing experience.

#### We're starting to see from our two previous models that the ML video has a happier more visible instructor.  This should lead to greater student experience.  Let's continue.

### Assessment of use of blackboard, slides
#### Let's 

In [10]:
import os
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Load the pre-trained EfficientDet model from TensorFlow Hub
#model_handle = 'https://tfhub.dev/google/efficientdet/lite2/detection/1'
#model = hub.load(model_handle).signatures['serving_default']

def detect_text(frame):
    # Convert the frame to a PIL Image
    pil_img = Image.fromarray(frame)

    # Use pytesseract to do OCR on the image
    text = pytesseract.image_to_string(pil_img)

    # For now, just check if any text was detected
    if text.strip():  # If there's any text detected
        return True
    return False


def detect_instructor_and_text(frame, presence_threshold=0.5):
    # Convert the frame to uint8 and process for model input
    frame = tf.image.convert_image_dtype(frame, tf.uint8)
    input_tensor = tf.expand_dims(frame, 0)

    # Model inference for person detection
    #result = person_model(input_tensor)
    result = 0
    
    # Parse the results
    result = {key:value.numpy() for key,value in result.items()}
    detection_scores = result["detection_scores"]
    detection_classes = result["detection_classes"]
    detection_boxes = result["detection_boxes"]

    # Text detection
    text_detected = detect_text(frame)

    return present, person_bbox, text_detected

def is_interacting(person_bbox, text_boxes):
    # Implement interaction logic here
    # This is a placeholder implementation
    return False  # Returns True if interaction is detected, otherwise False

def classify_frame(person_present, person_bbox, text_detected):
    if person_present and text_detected:
        return 'blackboard'
    elif text_detected:
        return 'powerpoint'
    else:
        return 'neither'


def process_video_segments(segment_folder):
    # Counters for blackboard, PowerPoint, and neither
    blackboard_count = 0
    powerpoint_count = 0
    neither_count = 0
    total_frames = 0

    for segment_file in os.listdir(segment_folder):
        segment_path = os.path.join(segment_folder, segment_file)
        frame = cv2.imread(segment_path)
        if frame is not None:
            total_frames += 1
            person_present, person_bbox, text_boxes = detect_instructor_and_text(frame)
            classification = classify_frame(person_present, person_bbox, text_boxes)

            if classification == 'blackboard':
                blackboard_count += 1
            elif classification == 'powerpoint':
                powerpoint_count += 1
            else:
                neither_count += 1

    return blackboard_count, powerpoint_count, neither_count, total_frames

def analyze_video(video_folder):
    blackboard, powerpoint, neither, total_frames = process_video_segments(video_folder)

    if total_frames > 0:
        fraction_blackboard = blackboard / total_frames
        fraction_powerpoint = powerpoint / total_frames
        fraction_neither = neither / total_frames
        print(f"Fraction of Blackboard in {video_folder}: {fraction_blackboard}")
        print(f"Fraction of PowerPoint in {video_folder}: {fraction_powerpoint}")
        print(f"Fraction of Neither in {video_folder}: {fraction_neither}")
    else:
        print(f"No frames to analyze in {video_folder}")

# Example usage
analyze_video('keyframes/ML')
analyze_video('keyframes/howToLearn')


OSError: https://tfhub.dev/google/efficientdet/lite2/detection/1 does not appear to be a valid module.