### All imports

In [None]:
import cv2
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import collections
import cv2


from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import nltk  


### engine for pose estimation

In [None]:
base_options = python.BaseOptions(model_asset_path='../models/pose_landmarker.task')
options = vision.PoseLandmarkerOptions(
    base_options=base_options,
    output_segmentation_masks=True)
mp_pose = mp.solutions.pose
detector = mp_pose.Pose()
#detector = vision.PoseLandmarker.create_from_options(options)

In [40]:
from mediapipe.python.solutions import drawing_utils, pose


def draw_landmarks_on_image(rgb_image, detection_result):
    pose_landmarks_list = detection_result.pose_landmarks.landmark
    annotated_image = np.copy(rgb_image)

    # Create NormalizedLandmarkList to hold the detected landmarks
    pose_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    pose_landmarks_proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z)
        for landmark in pose_landmarks_list
    ])

    # Draw landmarks on the image
    drawing_utils.draw_landmarks(
        annotated_image,
        pose_landmarks_proto,
        pose.POSE_CONNECTIONS,  # Ensure POSE_CONNECTIONS matches the number of landmarks detected
        drawing_utils.DrawingSpec(color=(255, 0, 0), thickness=2, circle_radius=2),
        drawing_utils.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2)
    )

    return annotated_image


def calculate_angle(p1, p2):
    x_diff = p2[0] - p1[0]
    y_diff = p2[1] - p1[1]
    return np.degrees(np.arctan2(y_diff, x_diff))

# Funkcja do obliczania kąta obrotu sylwetki
def calculate_body_rotation_angle(landmarks):
    if landmarks is None:
      return None
    # Wykryte punkty charakterystyczne dla ramion (np. 11 i 12 dla lewego i prawego ramienia)
    left_shoulder = [landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].x,
                     landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER].y]
    right_shoulder = [landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER].x,
                      landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER].y]

    # Oblicz kąt nachylenia linii ramion
    angle = calculate_angle(left_shoulder, right_shoulder)
    return angle

In [46]:
#xwyPjhRoeNc
#nhoikoUEI8U
video_id = "nhoikoUEI8U"
subtitles = YouTubeTranscriptApi.get_transcript(video_id)
print(len(subtitles))

yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
stream = yt.streams.filter(res="720p").first()
print(yt.streams.filter(res="720p").first())
destination_path = "../videos" 

video_file = stream.download(output_path=destination_path)


def cv2_to_mediapipe_image(cv2_image):
    rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
    image = mp.solutions.mediapipe.python.solution_base.Image(
        width=rgb_image.shape[1],
        height=rgb_image.shape[0],
        rgb_data=np.frombuffer(rgb_image.tobytes(), dtype=np.uint8)
    )

    return image

129
<Stream: itag="136" mime_type="video/mp4" res="720p" fps="24fps" vcodec="avc1.4d401f" progressive="False" type="video">


### This is optional tool to run main loop faster without processing text

In [41]:
cap = cv2.VideoCapture(video_file)
dq = collections.deque()
cv2.namedWindow('Video with Subtitles', cv2.WINDOW_NORMAL)
cv2.resizeWindow('Video with Subtitles', 800, 600)
last_frame = 0 
current_frame = 0 
fps = cap.get(cv2.CAP_PROP_FPS)
curr_sub_start = 0

try:
    while True:
        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
        current_time = current_frame / fps

        ret, frame = cap.read()

        if not ret:
            break
        if last_frame != current_frame:
            while subtitles[curr_sub_start]['start'] < current_time:
                print(subtitles[curr_sub_start]['text'])
                dq.append(curr_sub_start)
                curr_sub_start += 1
            if len(dq) > 0:
                while subtitles[dq[0]]['start'] + subtitles[dq[0]]['duration'] < current_time:
                    dq.popleft()

            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            edges = cv2.Canny(gray_frame, threshold1=100, threshold2=200)  

            sub_index = 0
            for x in dq:
                cv2.putText(frame, subtitles[x]['text'], (50, 50 + 50 * sub_index), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                sub_index += 1

            #img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
            detection_result = detector.process(frame)
            if(detection_result.pose_landmarks):
                #print(detection_result.pose_landmarks)
                #print(detection_result.pose_landmarks.landmark[0])
                
                #print(detection_result.pose_landmarks[11])
                body_angle = calculate_body_rotation_angle(detection_result.pose_landmarks.landmark)
                print(body_angle)
                #cv2.putText(frame, body_angle, (50, 500), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                annotated_image = draw_landmarks_on_image(frame, detection_result)
                bgr_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
                cv2.imshow('Video with Subtitles', bgr_image)
            else:
               cv2.imshow('Video with Subtitles', frame) 

        #cv2.imshow('Video with Subtitles', frame)

        #cv2.imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))    # Wait for user input (right arrow key to go to the next frame)
        key = cv2.waitKey(30)  # Adjust the delay as needed (milliseconds)
        last_frame = current_frame
        if key == 27:  # ESC key to exit
            break
        elif key == 83 or key == 100:
            current_frame += 1
finally:
    cap.release()
    cv2.destroyAllWindows()

-159.95314546650425
-101.32241813607328
-83.49624206538516
-65.19486770546543
-43.7041466005945
15.326723721672215
-43.194385308915635
-68.08071794647772
-34.75050769173432
-31.925907001003896
-59.12850380115535
-62.05253149986459
-74.5069941174697
-52.89103637028931
-71.05167634308162
-70.79403335686183
-74.32549610794187
-78.49193856504841
-89.18665388758376
-86.74710803086082
-86.00601382699583
-89.81017025720249
-81.09202078665241
-75.51980141094994
-63.30707023294511
-65.72870326851661
-71.03531189717611
-70.55011614876639
-40.757738355695395
-95.01456357794628
84.78841358204654
81.80470627518856
81.31143989494406
82.82644881842903
85.10119499850238
87.60298166775024
88.04968985502019
88.56812447933748
91.6374012113704
80.27986225306535
79.55774385714595
-10.283285365093223
-3.9429571772672762
-17.237859760329453
-20.15624499762196
-10.395824541920852
77.89170675488701
71.41998144415254
73.9717810516022
74.54619423515305
76.89707625425153
77.97324516573943
77.38479755767705
90.867

### divide text into sentences and add punctuation with ml model

In [6]:

text =''
for obj in subtitles:
    text+=obj['text']

print(len(text))
print(text)


from deepmultilingualpunctuation import PunctuationModel
model = PunctuationModel()

result = model.restore_punctuation(text)
print(len(result))



4705
we will approach the squat in two phasesfirst unloaded to solve problemsassociated with the bottom position andthen loaded to learn how to apply thebottom position to the hip drive usedfor heavier weights since the majorityof the problems with the squat happenedat the bottom this method expedites theprocess quite effectively we will use afairly neutral foot placement with theheels about shoulder width apart and thetoes pointed out at about 30 degreesmany people will assume a stance withtoes pointed too forward so you may needto point them out more than you want tonext you're going to assume the positionyou will be in at the bottom of a squatwithout the barsquat down all the way to a position inwhich the apex of the hip crease dropsjust below the top of the patella putyour elbows against your knees with thepalms of your hands together and shoveyour knees out notice your feet are flaton the floor your knees are shoved outto where they are in a parallel linewith your feet and just a 

  from .autonotebook import tqdm as notebook_tqdm







4790


### removing artificial connections in words (auto generating subtitles from yt isn't ideal)
### also pos-tags are added here

In [7]:
sents = nltk.sent_tokenize(result)

import wordsegment
from wordsegment import load, segment
load()
from nltk.tokenize import word_tokenize

# segment powoduje również tokenizacje zdania dlatego ten etap(tokenizacji) zostanie pominięty.
sents = [(segment(sent)) for sent in sents]
#porter = nltk.PorterStemmer()
#sents = [[porter.stem(t) for t in sent] for sent in sents]
sents = [nltk.pos_tag(sent) for sent in sents]
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>} 
      {<NNP>+}               
"""
# grammar = r"""
#   NP: {<DT>?<JJ>*<NN>}
#   VP: {<VB.*><NP|PP>*}
#   PP: {<IN><NP>}
#   ADJP: {<JJ>}
#   ADVP: {<RB.*>}
# """
cp = nltk.RegexpParser(grammar) 
	
# class ConsecutiveNPChunkTagger(nltk.TaggerI): 

#     def __init__(self, train_sents):
#         train_set = []
#         for tagged_sent in train_sents:
#             untagged_sent = nltk.tag.untag(tagged_sent)
#             history = []
#             for i, (word, tag) in enumerate(tagged_sent):
#                 featureset = npchunk_features(untagged_sent, i, history) 
#                 train_set.append( (featureset, tag) )
#                 history.append(tag)
#         self.classifier = nltk.MaxentClassifier.train( 
#             train_set, algorithm='megam', trace=0)

#     def tag(self, sentence):
#         history = []
#         for i, word in enumerate(sentence):
#             featureset = npchunk_features(sentence, i, history)
#             tag = self.classifier.classify(featureset)
#             history.append(tag)
#         return zip(sentence, history)

# class ConsecutiveNPChunker(nltk.ChunkParserI):
#     def __init__(self, train_sents):
#         tagged_sents = [[((w,t),c) for (w,t,c) in
#                          nltk.chunk.tree2conlltags(sent)]
#                         for sent in train_sents]
#         self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

#     def parse(self, sentence):
#         tagged_sents = self.tagger.tag(sentence)
#         conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
#         return nltk.chunk.conlltags2tree(conlltags)
    
# def npchunk_features(sentence, i, history):
#      word, pos = sentence[i]
#      return {"pos": pos}
# chunker = ConsecutiveNPChunker(train_sents)
# print(chunker.evaluate(test_sents))


# sents = [cp.parse(sent) for sent in sents]





In [8]:
# nltk.download('maxent_ne_chunker')
# nltk.download('treebank')
# print(sents[30])
sent = nltk.corpus.treebank.tagged_sents()[22]
#print(sent)
print(sents[15])
#print(nltk.ne_chunk(sent))
print(nltk.ne_chunk(sents[25]))

# sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
# ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

# grammar = "NP: {<DT>?<JJ>*<NN>}" 

# cp = nltk.RegexpParser(grammar) 
# result = cp.parse(sentence) 
# print(result) 
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
# print(cp.evaluate(sents))
# result.draw() 

[('measured', 'VBN'), ('from', 'IN'), ('the', 'DT'), ('markings', 'NNS'), ('placed', 'VBN'), ('on', 'IN'), ('the', 'DT'), ('bar', 'NN'), ('for', 'IN'), ('this', 'DT'), ('purpose', 'NN'), ('a', 'DT'), ('standard', 'JJ'), ('powerbar', 'NN'), ('has', 'VBZ'), ('16to17', 'CD'), ('inches', 'NNS'), ('between', 'IN'), ('the', 'DT'), ('ends', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('inside', 'JJ'), ('neural', 'JJ'), ('and', 'CC'), ('32', 'CD'), ('inches', 'NNS'), ('between', 'IN'), ('the', 'DT'), ('finger', 'NN'), ('marks', 'NNS')]
(S
  again/RB
  heels/NNS
  should/MD
  be/VB
  about/IN
  shoulder/NN
  width/NNS
  apart/RB
  with/IN
  toes/NNS
  pointed/VBN
  out/RP
  about/IN
  30/CD
  degrees/NNS
  at/IN
  this/DT
  point/NN
  you/PRP
  are/VBP
  ready/JJ
  to/TO
  squat/VB
  with/IN
  the/DT
  empty/JJ
  bar/NN)


### Finding sentences with technique rules(unfinished) (regexp: noun(body part) and verb)

In [9]:
text = "When performing squats with a barbell, ensure your back is straight, knees do not extend beyond your toes, and the barbell rests securely on your shoulders."

from nltk.corpus import wordnet as wn
import nltk 
nltk.download('wordnet')
part = wn.synsets('body_part')[0]

def is_body_part(candidate):
    for ss in wn.synsets(candidate):
        # only get those where the synset matches exactly
        name = ss.name().split(".", 1)[0]
        if name != candidate:
            continue
        hit = part.lowest_common_hypernyms(ss)
        if hit and hit[0] == part:
            return True
    return False

# for word in sents[0]:
#     print(is_body_part(word[0]), word[0], sep="\t")

# Procesowanie każdego zdania
# for sentence in sents:
#     if any(is_body_part(t[0].lower()) for t in sentence):
#         print(f"Zdanie zawiera część ciała: {sentence}")

import nltk
from nltk import CFG
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.parse import ChartParser

# Lista części ciała
body_parts = ["head", "arm", "leg", "hand", "foot", "eye", "ear", "nose", "mouth", "shoulder", "knee", "elbow"]

# Definicja gramatyki bezkontekstowej z użyciem POS tags
grammar = CFG.fromstring("""
  S -> NP VP
  NP -> DT JJNN | JJNN
  VP -> VBZ NP | VBZ ADJP | VBZ PP
  DT -> 'the' | 'a' | 'his' | 'her'
  JJNN -> JJ NN | JJNN JJ NN
  JJ -> 'badly' | 'quickly' | 'slowly' | 'fast'
  NN -> 'head' | 'arm' | 'leg' | 'hand' | 'foot' | 'eye' | 'ear' | 'nose' | 'mouth' | 'shoulder' | 'knee' | 'elbow'
  VBZ -> VB
  ADJP -> JJ NP
  NP -> DT JJNN
  PP -> IN NP
  IN -> 'in'
""")

# Tworzenie parsera
parser = ChartParser(grammar)

# Tokenizacja tekstu na zdania
for sentence in sents:
    # Sprawdzanie czy zdanie pasuje do gramatyki
    #print(sentence)
    words = [word for word, tag in sentence]
    try:
        for tree in parser.parse(words):
            # # Sprawdzanie czy pierwsza fraza rzeczownikowa jest częścią ciała
            # np = tree[0]
            # if np.label() == 'NP' and np[0][0].lower() in body_parts:
            #     print(f"Zdanie zawiera część ciała jako podmiot: {sentence}")
            tree.pretty_print()
    except ValueError:
            # Jeżeli parser nie znajdzie pasującego drzewa, przechodzi do następnego zdania
        #print("nie ma drzewa")    
        continue
import re 

#pattern = r'(head|arm|leg|hand|foot|eye|ear|nose|back|mouth|shoulder|knee|elbow)'
pattern = r'\b(head|arm|leg|hand|foot|eye|ear|nose|back|mouth|shoulder|knee|elbow)\b\s+(\w+)'

sents_for_regexp = nltk.sent_tokenize(result)
sents_for_regexp = [(segment(sent)) for sent in sents_for_regexp]
for sentence in sents_for_regexp:
    #print(sentence)
    sem = ' '.join([str(elem) for elem in sentence])
    #print(sem)
    match = re.search(pattern,sem)
    if match:
        print(sem)
        #print(match.string)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Damian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


we will use a fairly neutral foot placement with the heels about shoulder width apart and the toes pointed out at about 30 degrees many people will assume a stance with toes pointed too forward so you may need to point them out more than you want
your back should be as flat as you can get it
also notice that your back is inclined at about a45 degree angle not at all vertical and your eyes are looking down at the floor a few feet in front of you
this movement keeps your weight solidly over the whole foot instead of letting it shift to the toes
grip width for the squat will vary with shoulder width and flexibility but in general the hands will be between these two markings
the elbows should be lifted up to trap the bar between the hands and the back elbows should be up but not high
with your grip in place and your hands and thumbs on top of the bar dip your head under the bar and come up into position with the bar on your back just below the spine of the scapula the bone you feel at the 

In [10]:
for sent in sents:
    for tuple in sent:
        print(tuple[0],end = " ")
    print("\n")    

we will approach the squat in two phases first unloaded to solve problems associated with the bottom position and then loaded to learn how to apply the bottom position to the hip drive used for heavier weights 

since the majority of the problems with the squat happened at the bottom this method expedites the process quite effectively 

we will use a fairly neutral foot placement with the heels about shoulder width apart and the toes pointed out at about 30 degrees many people will assume a stance with toes pointed too forward so you may need to point them out more than you want 

to next your e going to assume the position you will be in at the bottom of a squat without the bar squat down all the way to a position in which the apex of the hip crease drops just below the top of the patella 

put your elbows against your knees with the palms of your hands together and shove your knees out 

notice your feet are flat on the floor 

your knees are shoved out to where they are in a paralle

### main loop of the program

In [15]:
cap = cv2.VideoCapture(video_file)
dq = collections.deque()
cv2.namedWindow('Video with Subtitles', cv2.WINDOW_NORMAL)
cv2.resizeWindow('Video with Subtitles', 800, 600)

current_frame = 0 
fps = cap.get(cv2.CAP_PROP_FPS)
curr_sub_start = 0
while True:
    cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
    current_time = current_frame / fps

    ret, frame = cap.read()

    if not ret:
        break


    while(subtitles[curr_sub_start]['start']<current_time):
        print(subtitles[curr_sub_start]['text'])
        dq.append(curr_sub_start)
        curr_sub_start=curr_sub_start+1
    if(len(dq) >0):
        while(subtitles[dq[0]]['start'] + subtitles[dq[0]]['duration']<current_time):
            dq.popleft()
    

    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    edges = cv2.Canny(gray_frame, threshold1=100, threshold2=200)  

    sub_index=0
    for x in dq:
        cv2.putText(frame, subtitles[x]['text'], (50, 50+50*sub_index), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
        sub_index+=1


    img = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)

    detection_result = detector.detect(img)
    # body_angle = calculate_body_rotation_angle(detection_result.pose_landmarks.landmark)
    # cv2.putText(frame, body_angle, (50, 500), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)


    annotated_image = draw_landmarks_on_image(img.numpy_view(), detection_result)
    bgr_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

    # Display the image using OpenCV
    #cv2.imshow('Video with Subtitles', frame)
    cv2.imshow('Video with Subtitles', bgr_image)
    #cv2.imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))    # Wait for user input (right arrow key to go to the next frame)
    key = cv2.waitKey(30)  # Adjust the delay as needed (milliseconds)
    if key == 27:  # ESC key to exit
        break
    elif key == 83 or key == 100:
        current_frame += 1
cap.release()
cv2.destroyAllWindows()


AttributeError: 'list' object has no attribute 'landmark'