### All imports

In [3]:
import cv2
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import collections
import cv2


from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision


### Engine for pose estimation

In [4]:
# Build the MediaPipe pose landmarker from the bundled .task model file.
# The path is relative to the notebook's working directory.
base_options = python.BaseOptions(
    model_asset_path='../models/pose_landmarker.task',
)

# Segmentation masks are requested alongside the landmarks.
options = vision.PoseLandmarkerOptions(
    base_options=base_options,
    output_segmentation_masks=True,
)

# `detector` is the object the main loop calls `.detect(...)` on.
detector = vision.PoseLandmarker.create_from_options(options)

In [5]:
def draw_landmarks_on_image(rgb_image, detection_result):
  """Return a copy of `rgb_image` with every detected pose drawn on it.

  Args:
    rgb_image: image array to annotate; it is copied, never modified.
    detection_result: object exposing `pose_landmarks`, an iterable with one
      landmark sequence per detected pose. Each landmark provides `x`, `y`
      and `z` attributes.

  Returns:
    A new array holding the input pixels plus the drawn landmarks and
    connections. When no pose was detected it is simply a copy of the input.
  """
  canvas = np.copy(rgb_image)

  for person_landmarks in detection_result.pose_landmarks:
    # drawing_utils expects a NormalizedLandmarkList proto, so repackage the
    # landmarks of this pose into one.
    normalized_points = [
      landmark_pb2.NormalizedLandmark(x=point.x, y=point.y, z=point.z)
      for point in person_landmarks
    ]
    landmark_list = landmark_pb2.NormalizedLandmarkList()
    landmark_list.landmark.extend(normalized_points)

    solutions.drawing_utils.draw_landmarks(
      canvas,
      landmark_list,
      solutions.pose.POSE_CONNECTIONS,
      solutions.drawing_styles.get_default_pose_landmarks_style())

  return canvas


In [6]:
# Video IDs this notebook has been pointed at:
#   xwyPjhRoeNc
#   nhoikoUEI8U
video_id = "nhoikoUEI8U"

# Transcript entries are read later through their 'start', 'duration' and
# 'text' keys.
subtitles = YouTubeTranscriptApi.get_transcript(video_id)
print(len(subtitles))

# Pick the highest-resolution progressive mp4 stream of the video.
yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
if stream is None:
    # .first() yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on `None.download`.
    raise RuntimeError(f"No progressive mp4 stream available for video {video_id}")

destination_path = "../videos"

video_file = stream.download(output_path=destination_path)

cap = cv2.VideoCapture(video_file)
if not cap.isOpened():
    raise RuntimeError(f"Could not open downloaded video: {video_file}")

# The main loop computes timestamps as frame_index / fps, so a missing or zero
# frame rate must be rejected here rather than surfacing as a division error.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps or fps <= 0:
    cap.release()
    raise RuntimeError(f"Video reports an invalid frame rate ({fps}): {video_file}")

cv2.namedWindow('Video with Subtitles', cv2.WINDOW_NORMAL)
cv2.resizeWindow('Video with Subtitles', 800, 600)

# Playback state shared with the main loop.
current_frame = 0   # index of the frame currently shown
curr_sub_start = 0  # index of the next subtitle that has not started yet

# Indices (into `subtitles`) of the subtitles currently on screen, oldest first.
dq = collections.deque()

def cv2_to_mediapipe_image(cv2_image):
    """Convert an OpenCV BGR frame into a MediaPipe SRGB image.

    Args:
        cv2_image: uint8 image array in OpenCV's BGR channel order.

    Returns:
        An `mp.Image` in `mp.ImageFormat.SRGB` format wrapping the
        RGB-converted pixels, suitable for `detector.detect(...)`.
    """
    # MediaPipe expects RGB channel order, OpenCV delivers BGR.
    rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)

    # Use the same public constructor as the rest of the notebook; the previous
    # `solution_base.Image(width=..., height=..., rgb_data=...)` call does not
    # match that constructor's signature.
    return mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)

129


### Main loop of the program

In [7]:
# Frame-stepping viewer: shows the current frame with its active subtitles and
# the detected pose drawn on top. ESC quits; right arrow / 'd' advances one frame.
try:
    while True:
        # Seek explicitly so the displayed frame always matches `current_frame`,
        # even though read() would otherwise advance the stream on every pass.
        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
        current_time = current_frame / fps

        ret, frame = cap.read()

        if not ret:
            break

        # Activate every subtitle whose start time has passed. The bounds check
        # stops the scan after the last subtitle instead of raising IndexError.
        while (curr_sub_start < len(subtitles)
               and subtitles[curr_sub_start]['start'] < current_time):
            print(subtitles[curr_sub_start]['text'])
            dq.append(curr_sub_start)
            curr_sub_start += 1

        # Retire subtitles whose display window has ended. `dq` is re-checked on
        # every pass so popping the last entry cannot index an empty deque.
        while dq and subtitles[dq[0]]['start'] + subtitles[dq[0]]['duration'] < current_time:
            dq.popleft()

        # Edge map of the frame; only consumed by the disabled debug view below.
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray_frame, threshold1=100, threshold2=200)

        # Stack the active subtitles top-down, 50 px apart.
        for sub_index, x in enumerate(dq):
            cv2.putText(frame, subtitles[x]['text'], (50, 50+50*sub_index), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)

        # OpenCV frames are BGR; convert before labelling the data as SRGB so the
        # detector sees correct colours and the RGB->BGR conversion below
        # restores the original channel order for display.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)

        detection_result = detector.detect(img)

        annotated_image = draw_landmarks_on_image(img.numpy_view(), detection_result)
        bgr_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

        # Display the annotated frame (swap in `edges` to view the edge map).
        #cv2.imshow('Video with Subtitles', edges)
        cv2.imshow('Video with Subtitles', bgr_image)

        # Poll the keyboard for up to 30 ms.
        key = cv2.waitKey(30)
        if key == 27:  # ESC key to exit
            break
        elif key == 83 or key == 100:  # Right arrow key or 'd' key to move to the next frame
            current_frame += 1
finally:
    # Release resources even if the loop is interrupted or raises.
    cap.release()
    cv2.destroyAllWindows()
