In [2]:
from Levenshtein import distance as levenshtein_distance
import cv2
import numpy as np
import os
import tqdm
from yt_dlp import YoutubeDL
from collections import defaultdict
import re
import requests
from PIL import Image
from io import BytesIO
from fpdf import FPDF
import easyocr

In [None]:
video_url = 'https://www.youtube.com/watch?v=Bm1MJgBsL2w&t'
ydl_opts={}
ydl=YoutubeDL(ydl_opts)
info_dict=ydl.extract_info(video_url, download=False)
formats = info_dict.get('formats', None)

[youtube] Extracting URL: https://www.youtube.com/watch?v=Bm1MJgBsL2w&t
[youtube] Bm1MJgBsL2w: Downloading webpage
[youtube] Bm1MJgBsL2w: Downloading ios player API JSON
[youtube] Bm1MJgBsL2w: Downloading mweb player API JSON
[youtube] Bm1MJgBsL2w: Downloading m3u8 information




In [4]:

def distance(p1, p2):
    """Function to calculate the distance between two points."""
    return np.sqrt((p2[0] - p1[0]) ** 2 + (p2[1] - p1[1]) ** 2)

def is_nearly_valid_bbox(bbox, side_length_tolerance=10, angle_tolerance=50):
    """Function to check if the bounding box is nearly valid."""
    if len(bbox) != 4:
        return False  
    
    for point in bbox:
        if not isinstance(point[0], (np.float64, np.int32)) or not isinstance(point[1], (np.float64, np.int32)):
            return False

    side_lengths = [
        distance(bbox[0], bbox[1]), 
        distance(bbox[1], bbox[2]),  
        distance(bbox[2], bbox[3]),  
        distance(bbox[3], bbox[0])   
    ]
    
    if not (abs(side_lengths[0] - side_lengths[2]) < side_length_tolerance and 
            abs(side_lengths[1] - side_lengths[3]) < side_length_tolerance):
        return False

    def angle_between_vectors(v1, v2):
        dot_product = np.dot(v1, v2)
        return abs(dot_product) < angle_tolerance

    v1 = [bbox[1][0] - bbox[0][0], bbox[1][1] - bbox[0][1]]
    v2 = [bbox[2][0] - bbox[1][0], bbox[2][1] - bbox[1][1]]
    v3 = [bbox[3][0] - bbox[2][0], bbox[3][1] - bbox[2][1]]
    v4 = [bbox[0][0] - bbox[3][0], bbox[0][1] - bbox[3][1]]

    if not (angle_between_vectors(v1, v2) and angle_between_vectors(v2, v3) and 
            angle_between_vectors(v3, v4) and angle_between_vectors(v4, v1)):
        return False

    return True

def sort_text_boxes(result, y_threshold=10):
    """" Function to sort text boxes."""
    valid_boxes = [box for box in result if is_nearly_valid_bbox(box[0])]

    result_sorted = sorted(valid_boxes, key=lambda x: x[0][0][1])

    sorted_result = []
    current_group = []

    for i, box in enumerate(result_sorted):
        if not current_group:
            current_group.append(box)
        else:
            prev_box = current_group[-1]
            if abs(box[0][0][1] - prev_box[0][0][1]) <= y_threshold:
                current_group.append(box)
            else:
                sorted_result.extend(sorted(current_group, key=lambda x: x[0][0][0]))
                current_group = [box]

    sorted_result.extend(sorted(current_group, key=lambda x: x[0][0][0]))
    
    return sorted_result

reader = easyocr.Reader(['pl'])

output_folder = "data/marchewka"
os.makedirs(output_folder, exist_ok=True)

label_file = "data/marchewka.txt"
with open(label_file, 'w', encoding='utf-8') as f_labels:
    f_labels.write("")

for f in formats:
    if f.get('format_note', None) == '360p':  
        url = f.get('url', None)
        cap = cv2.VideoCapture(url)

        if not cap.isOpened():
            print("Wrong url:", url)
            continue

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        print(f"Frames in video: {total_frames}, FPS: {fps}")

        pbar = tqdm.tqdm(desc='Processing', total=total_frames // 70)

        frame_number = 0
        x = 0

        while True:
            ret, frame = cap.read()
            if not ret:  
                break

            # OCR
            result = reader.readtext(frame, detail=1)  
            boxes = [r[0] for r in result]  
            texts = [r[1] for r in result]  
            scores = [r[2] for r in result]  

            if result:  
                sorted_result = sort_text_boxes(list(zip(boxes, texts, scores)))

                combined_text = " ".join([text for _, text, _ in sorted_result])

                if combined_text.strip(): 
                    shot_name = f"shot{x}.png"
                    frame_filename = os.path.join(output_folder, shot_name)
                    cv2.imwrite(frame_filename, frame)

                    with open(label_file, 'a', encoding='utf-8') as f_labels:
                        f_labels.write(f"{shot_name}, {combined_text}\n")

                    x += 1

            frame_number += 70
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

            pbar.update(1)

            if frame_number >= total_frames:
                break

        cap.release()
        pbar.close()
        print(f"Frames downloaded to {output_folder}")
        print(f"Results in {label_file}")
        break


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Frames in video: 23332, FPS: 25.0


Processing: 334it [14:28,  2.60s/it]                         

Frames downloaded to data/marchewka
Results in data/marchewka.txt





In [6]:
def filter_file(file_path, output_file_path):
    filtered_lines = []
    previous_line = None
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().lower()
            
            #Różne
            line = re.sub(r'^shot\d+\.png, \s*', '', line)  #usuń shot1, 2 itd.
            line = re.sub(r'\s+:', ':', line) #usuń spacje przed :
            line = re.sub(r'\boec\b', 'dec', line) #oec na dec
            line = re.sub(r'\bist\b', '1st', line) #ist na 1st
            line = re.sub(r'\bznd\b', '2nd', line) #znd na 2nd
            line = re.sub(r'\(6rj:', '(6r):', line) #(6rj: na (6r):
            line = re.sub(r'\(3rj:', '(3r):', line) #(3rj: na (3r): 
            line = re.sub(r'\bssc\b', '5sc', line)  #ssc na 5sc
            line = re.sub(r'\bisc\b', '1sc', line)
            line = re.sub(r'\b1\s?sc\b', 'sc', line) #1sc i 1 sc na sc
            line = re.sub(r'\b1\s?inc\b', 'inc', line) #1inc i 1 inc na inc
            line = re.sub(r'\bxo\b', 'x6', line)  #xo na x6
            #ogarniecie jak np jest r 1:, r 1 0: itp
            line = re.sub(r'(?<!\S)r\s*(\d+(?:\s*\d+)*)\s*(?=[:\s])', lambda m: 'r' + ''.join(m.group(1).split()), line)
            
            #Bez dwukropków
            line = re.sub(r'\bchi\b', 'ch1', line)  # chi na ch1
            line = re.sub(r'\bcht\b', 'ch1', line)  # cht na ch1
            line = re.sub(r'\brz\b', 'r2', line) #rz na r2
            line = re.sub(r'\bry\b', 'r4', line) #ry na r4
            line = re.sub(r'\brs\b', 'r5', line) #rs na r5
            line = re.sub(r'\bir5t\b', 'irst', line) #rs na r5
            line = re.sub(r'\br1o\b', 'r10', line) #r1o na r10
            line = re.sub(r'\bris\b', 'r15', line) #ris na r15
            line = re.sub(r'\br2o\b', 'r20', line) #r2o na r20
            line = re.sub(r'\br2z\b', 'r22', line) #r2z na r22
            line = re.sub(r'\brs1\b', 'r51', line)  #rs1 na r51
            line = re.sub(r'\brss\b', 'r55', line) #rss na r55
            #line = re.sub(r'\bi1\b', '11', line)
            line = re.sub(r'\bjsc\b', '3sc', line)
            #line = re.sub(r'\bri2\b', 'r12', line)
            line = re.sub(r'\bró\b', 'r6', line)
            line = re.sub(r'\brio\b', 'r10', line)
            
            line = re.sub(r'\bri(\d)\b', lambda m: 'r' + ('1' if int(m.group(1)) != 0 else '') + m.group(1), line)
            
            #Z dwukropkami
            line = re.sub(r'(?<!\w)ri:(?=\s|\(|\[|,|$)', 'r1:', line)  # ri: na r1:
            line = re.sub(r'(?<!\w)rz:(?=\s|\(|\[|,|$)', 'r2:', line) #rz: na r2:
            line = re.sub(r'(?<!\w)ry:(?=\s|\(|\[|,|$)', 'r4:', line)  # ry: na r4:
            line = re.sub(r'(?<!\w)rs:(?=\s|\(|\[|,|$)', 'r5:', line)  # rs: na r5:
            line = re.sub(r'(?<!\w)ro:(?=\s|\(|\[|,|$)', 'r6:', line)  #ro: na r6:
            line = re.sub(r'(?<!\w)rg:(?=\s|\(|\[|,|$)', 'r9:', line) #rg: na r9:
            line = re.sub(r'(?<!\w)rio:(?=\s|\(|\[|,|$)', 'r10:', line) #rio: na r10:
            line = re.sub(r'(?<!\w)rii:(?=\s|\(|\[|,|$)', 'r11:', line) #rii: na r11:
            line = re.sub(r'(?<!\w)r1i:(?=\s|\(|\[|,|$)', 'r11:', line) #r1i: na r11:
            line = re.sub(r'(?<!\w)r1z:(?=\s|\(|\[|,|$)', 'r13:', line) #r1z: na r13:
            line = re.sub(r'(?<!\w)ris:(?=\s|\(|\[|,|$)', 'r15:', line) #ris: na r15:
            line = re.sub(r'(?<!\w)r1s:(?=\s|\(|\[|,|$)', 'r15:', line)  #r1s: na r15:
            line = re.sub(r'(?<!\w)r1g:(?=\s|\(|\[|,|$)', 'r19:', line) #r1g: na r19:
            line = re.sub(r'(?<!\w)r2i:(?=\s|\(|\[|,|$)', 'r21:', line) #r2i: na r21:
            line = re.sub(r'(?<!\w)r2s:(?=\s|\(|\[|,|$)', 'r25:', line) #r2s: na r25:
            line = re.sub(r'(?<!\w)r2g:(?=\s|\(|\[|,|$)', 'r29:', line) #r2g: na r29:
            line = re.sub(r'(?<!\w)r5y:(?=\s|\(|\[|,|$)', 'r54:', line) #r5y: na r54:
            line = re.sub(r'(?<!\w)rs1:(?=\s|\(|\[|,|$)', 'r51:', line) #rs1: na r51:
            line = re.sub(r'(?<!\w)ry0:(?=\s|\(|\[|,|$)', 'r40:', line) #ry0: na r40:
    
            #Końcowe usuwanie niepotrzebnych spacji
            line = re.sub(r'\s*,\s*', ', ', line)  #spacje wokół przecinków
            line = re.sub(r'\s*\(\s*', ' (', line) #spacje wokół nawiasów i cyfr
            line = re.sub(r'\s*\)\s*', ') ', line).strip()
            line = re.sub(r'\s+', ' ', line) #wielokrotne spacji
            #line = re.sub(r'\b\s**\s*\b', 'x', line) #zamień samodzielne x na * i usuń spacje wokół
            #line = re.sub(r'\s*\*\s*', '*', line) #usuń spacje wokół *
            
            filtered_lines.append(line)
            
            previous_line = line
    
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for line in filtered_lines:
                output_file.write(line + '\n')
    return filtered_lines

In [7]:
filter_file('data/marchewka.txt', 'data/marchewka_regex.txt')

['korzeń marchewki',
 'r1: 3 in mr (3)',
 'r1: 3 in mr (3)',
 'r1: 3 in mr (3)',
 'r1: 3 in mr (3)',
 'r1: 3 in mr (3)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r2: inc x3 (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r3: sc in each (6)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r4: (sc, inc) x3 (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in each (9)',
 'r5: sc in ea

In [9]:
def extract_r_number(line):
    """Function to extract the row number."""
    match = re.match(r"^(r\d+(?:-\d+)?)[;:]", line)
    return match.group(1) if match else None

def find_pattern_start(groups):
    for i, group in enumerate(groups):
        for line in group:
            if re.match(r"^(r1|row1)[;:]", line, re.IGNORECASE):
                return max(0, i - 1)  # Keep one group before the pattern start
    return None

def group_similar_lines(lines, threshold=3):
    """Function to group similar lines using levensthein distance."""
    groups = []
    current_group = []
    
    for i, line in enumerate(lines):
        r_number = extract_r_number(line)
        
        # Start a new group if it's a new r_number or if the current group is empty
        if current_group:
            # should change the levenstein first then the r_number
            prev_r_number = extract_r_number(current_group[-1])
            if r_number and r_number != prev_r_number:
                groups.append(current_group)
                current_group = []
        
        current_group.append(line)
        
        if len(current_group) > 1:
            prev_line = current_group[-2]
            if levenshtein_distance(prev_line, line) > threshold:
                groups.append(current_group[:-1])
                current_group = [line]
    
    if current_group:
        groups.append(current_group)
    
    return groups

def detect_outliers(group, threshold=3):
    """Function to detect outliers in a group of lines using levensthein distance."""
    if len(group) < 2:
        return []  # No outliers in single-line groups
    
    outliers = []
    for i, line in enumerate(group):
        for j, other in enumerate(group):
            if i != j and levenshtein_distance(line, other) > threshold:
                outliers.append(line)
                break  
    
    return outliers

def filter_and_simplify_groups(groups, threshold=3):
    filtered_groups = []
    
    for group in groups:
        if len(group) > 2:
            outliers = detect_outliers(group, threshold)
            filtered_group = [line for line in group if line not in outliers]
            if len(filtered_group) > 2:
                filtered_group = filtered_group[1:-1]  # Remove first and last occurrence
            if filtered_group:
                filtered_groups.append(filtered_group[0])  # Keep only one representative line per group
        elif group:
            filtered_groups.append(group[0])
    
    return filtered_groups

def process_txt_file(input_filename, output_filename, threshold=3):
    """Function to process the txt file."""
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file.readlines() if line.strip()]
    
    groups = group_similar_lines(lines, threshold)
    pattern_start = find_pattern_start(groups)
    
    if pattern_start is not None:
        groups = groups[pattern_start:]
    
    filtered_groups = filter_and_simplify_groups(groups, threshold)
    
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        for line in filtered_groups:
            output_file.write(line + '\n')


In [10]:
process_txt_file("data/marchewka_regex.txt", "data/patterns/marchewka_pattern.txt")

In [11]:
def txt_to_pdf_with_thumbnail_and_title(txt_filename, pdf_filename, video_url):
    """Extract video info (thumbnail & title)"""
    ydl_opts = {}
    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)
        thumbnail_url = info_dict.get('thumbnail', None)
        video_title = info_dict.get('title', 'Untitled Video')

    img_path = None
    if thumbnail_url:
        response = requests.get(thumbnail_url)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img_path = "thumbnail.jpg"  
            img.save(img_path, "JPEG")
        else:
            print("Failed to download thumbnail.")
    
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    pdf.add_font('Roboto', '', 'Roboto/static/Roboto-SemiBold.ttf', uni=True)
    pdf.set_font('Roboto', size=10)
    
    pdf.add_page()

    if img_path:
        page_width = pdf.w 
        img_width = 100  
        x_centered = (page_width - img_width) / 2 

        pdf.image(img_path, x=x_centered, y=10, w=img_width) 
        pdf.ln(60)  
    #video_title = video_title.encode('utf-8').decode('utf-8')
    #pdf.set_font('Roboto', size=14)
    #pdf.cell(0, 10, video_title, ln=True, align='C')
    #pdf.ln(10)  

    pdf.set_font('Roboto', size=8)

    checkbox_size = 4
    x_offset = 5
    y_offset = pdf.get_y() + 5  
    line_height = 5
    page_height = 297
    margin_bottom = 15

    with open(txt_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if y_offset + line_height > page_height - margin_bottom:
                pdf.add_page()
                y_offset = 10  
            
            pdf.rect(x_offset, y_offset, checkbox_size, checkbox_size)
            pdf.set_xy(x_offset + checkbox_size + 2, y_offset)
            pdf.multi_cell(0, line_height, line.strip())
            y_offset += line_height

    pdf.output(pdf_filename)



In [12]:
txt_to_pdf_with_thumbnail_and_title('data/patterns/marchewka_pattern.txt', 'data/patterns/marchewka_pattern.pdf', 'https://www.youtube.com/watch?v=Bm1MJgBsL2w&t')

[youtube] Extracting URL: https://www.youtube.com/watch?v=Bm1MJgBsL2w&t
[youtube] Bm1MJgBsL2w: Downloading webpage
[youtube] Bm1MJgBsL2w: Downloading ios player API JSON
[youtube] Bm1MJgBsL2w: Downloading mweb player API JSON
[youtube] Bm1MJgBsL2w: Downloading m3u8 information


