### Remark: all of the code still needs improvments

In [1]:
from Levenshtein import distance as levenshtein_distance
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import tqdm
import glob
from yt_dlp import YoutubeDL
from collections import defaultdict
import re
import spacy
import requests
from PIL import Image
from io import BytesIO
from fpdf import FPDF

In [2]:
def load_spacy():
    """Load the spaCy Polish and English models."""
    nlp_pl = spacy.load("pl_core_news_sm")  # Polish
    nlp_en = spacy.load("en_core_web_md")   # English
    return nlp_pl, nlp_en

def is_valid_word(word, nlp_pl, nlp_en, crochet_terms, short_words):
    """Check if a word is a valid Polish/English word or a crochet-related term."""
    word = word.strip()
    
    # Allow crochet terms
    if word in crochet_terms:
        return True  

    # Allow CH{number} pattern
    if re.fullmatch(r'CH\d+', word, re.IGNORECASE):
        return True  

    # Reject if word contains numbers or special characters
    if not re.fullmatch(r'[a-zA-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ]+', word):
        return False  

    # Allow common short words like "in", "on", "to", etc.
    if len(word) < 3 and word.lower() not in short_words:
        return False

    # Check in Polish NLP
    doc_pl = nlp_pl(word)
    is_polish = any(token.is_alpha for token in doc_pl)

    # Check in English NLP
    doc_en = nlp_en(word)
    is_english = any(token.has_vector and not token.is_oov for token in doc_en)

    return is_polish or is_english

def test_lines():
    """Sample lines to test."""
    test_lines = [
        'LALard 2882',
        '2252',
        'eaby',
        'ara',
        'an',
        'Piudse ULrards 2ore',
        'tale Vao foot Zor un o7uo dol',
        'Cleg rord un erecon y',
        '~tner borg Mo7d7 Zox 32 dOl',
        'Xou',
        '8 Plla R0} Hll u@',
        'Puenn4 Ui Vnk Ml7uioi Zo} 70 dol'
        '2 dec, 1 sc'
    ]

    # Crochet-related terms
    crochet_terms = {"chain", "single crochet", "double crochet", "slip stitch", "sl st", "crochet",
                     "hook", "yarn", "needle", "scissors", "polyfill", "stitch", "markers", "pattern", "round", 
                     "repeat", "increase", "decrease", "magic ring", "magic circle", "loop", "tail", "beginning",
                     "back loop", "front loop", "back post", "front post", "except", "attach", "mark", "dluzsza", 
                     "krotsza"}

    # List of common short words to allow (e.g., "in", "on", "to", etc.)
    short_words = {"in", "on", "to", "a", "an", "of", "for", "by", "and", "or", "the"}

    # Load NLP models once here
    nlp_pl, nlp_en = load_spacy()

    valid_lines = []

    for line in test_lines:
        # Split the line into words
        words = re.findall(r'\b\w+\b', line)
        
        # Collect valid words in the line
        valid_words = [word for word in words if is_valid_word(word, nlp_pl, nlp_en, crochet_terms, short_words)]
        
        if valid_words:
            valid_lines.append(line)
            print(f"Valid words in line: '{line}' -> {valid_words}")

    return valid_lines


In [3]:

print("Filtered lines and valid words:")
valid_lines = test_lines()
print("\nLines that will be saved:")
print(valid_lines)


Filtered lines and valid words:
Valid words in line: 'LALard 2882' -> ['LALard']
Valid words in line: 'eaby' -> ['eaby']
Valid words in line: 'ara' -> ['ara']
Valid words in line: 'an' -> ['an']
Valid words in line: 'Piudse ULrards 2ore' -> ['Piudse', 'ULrards']
Valid words in line: 'tale Vao foot Zor un o7uo dol' -> ['tale', 'Vao', 'foot', 'Zor', 'dol']
Valid words in line: 'Cleg rord un erecon y' -> ['Cleg', 'rord', 'erecon']
Valid words in line: '~tner borg Mo7d7 Zox 32 dOl' -> ['tner', 'borg', 'Zox', 'dOl']
Valid words in line: 'Xou' -> ['Xou']
Valid words in line: '8 Plla R0} Hll u@' -> ['Plla', 'Hll']
Valid words in line: 'Puenn4 Ui Vnk Ml7uioi Zo} 70 dol2 dec, 1 sc' -> ['Vnk', 'dec']

Lines that will be saved:
['LALard 2882', 'eaby', 'ara', 'an', 'Piudse ULrards 2ore', 'tale Vao foot Zor un o7uo dol', 'Cleg rord un erecon y', '~tner borg Mo7d7 Zox 32 dOl', 'Xou', '8 Plla R0} Hll u@', 'Puenn4 Ui Vnk Ml7uioi Zo} 70 dol2 dec, 1 sc']


# Levensthein distance

In [None]:
import Levenshtein
import numpy as np

def read_file(file_path):
    """Reads a text file and extracts lines."""
    with open(file_path, "r", encoding="utf-8") as file:
        lines = [line.strip() for line in file if line.strip()]
    return lines

def extract_text(lines):
    """Extracts only the text part from 'filename, extracted_text' format."""
    extracted_texts = []
    for line in lines:
        parts = line.split(",", 1)  # split only at the first comma
        if len(parts) > 1:
            extracted_texts.append(parts[1].strip())  # keep only the text part
    return extracted_texts

def group_consecutive_similar_texts(texts, threshold=5):
    """
    Groups consecutive similar texts based on Levenshtein distance.
    """
    grouped_texts = []
    temp_group = [texts[0]]  

    for i in range(1, len(texts)):
        if levenshtein_distance(texts[i], texts[i - 1]) <= threshold:
            temp_group.append(texts[i])
        else:
            grouped_texts.append(temp_group)
            temp_group = [texts[i]]

    grouped_texts.append(temp_group)
    return grouped_texts

def detect_outliers_in_group(group):
    """
    Identifies the most odd/outlier lines in a group using Levenshtein distance.
    - If group has <= 4 lines, returns an empty list (skips filtering).
    - For larger groups, calculates the average distance of each line to others.
    - Returns the list of lines qualified for deletion.
    """
    if len(group) <= 4:
        return [] 

    distances = np.zeros((len(group), len(group)))

    for i in range(len(group)):
        for j in range(len(group)):
            if i != j:
                distances[i][j] = Levenshtein.distance(group[i], group[j])

    avg_distances = np.mean(distances, axis=1)

    mean_dist = np.mean(avg_distances)
    std_dev = np.std(avg_distances)
    threshold = mean_dist + std_dev

    # Identify outliers
    outliers = [group[i] for i in range(len(group)) if avg_distances[i] > threshold]

    return outliers


In [None]:
file_path = ""

lines = read_file(file_path)
texts = extract_text(lines)

similar_text_groups = group_consecutive_similar_texts(texts)

for i, group in enumerate(similar_text_groups, 1):
    outliers = detect_outliers_in_group(group)

    print(f"\n🔹 Pattern Section {i}:")
    for item in group:
        status = "❌ (OUTLIER)" if item in outliers else "✅"
        print(f"  {status} - {item}")


In [None]:
def extract_r_number(line):
    match = re.match(r"^(r\d+(?:-\d+)?)[;:]", line)
    return match.group(1) if match else None

def find_pattern_start(groups):
    for i, group in enumerate(groups):
        for line in group:
            if re.match(r"^(r1|row1)[;:]", line, re.IGNORECASE):
                return max(0, i - 1)  # Keep one group before the pattern start
    return None

def group_similar_lines(lines, threshold=3):
    groups = []
    current_group = []
    
    for i, line in enumerate(lines):
        r_number = extract_r_number(line)
        
        # Start a new group if it's a new r_number or if the current group is empty
        if current_group:
            prev_r_number = extract_r_number(current_group[-1])
            if r_number and r_number != prev_r_number:
                groups.append(current_group)
                current_group = []
        
        current_group.append(line)
        
        if len(current_group) > 1:
            prev_line = current_group[-2]
            if levenshtein_distance(prev_line, line) > threshold:
                groups.append(current_group[:-1])
                current_group = [line]
    
    if current_group:
        groups.append(current_group)
    
    return groups

def detect_outliers(group, threshold=3):
    if len(group) < 2:
        return []  # No outliers in single-line groups
    
    outliers = []
    for i, line in enumerate(group):
        for j, other in enumerate(group):
            if i != j and levenshtein_distance(line, other) > threshold:
                outliers.append(line)
                break  
    
    return outliers

def filter_and_simplify_groups(groups, threshold=3):
    filtered_groups = []
    
    for group in groups:
        if len(group) > 2:
            outliers = detect_outliers(group, threshold)
            filtered_group = [line for line in group if line not in outliers]
            if len(filtered_group) > 2:
                filtered_group = filtered_group[1:-1]  # Remove first and last occurrence
            if filtered_group:
                filtered_groups.append(filtered_group[0])  # Keep only one representative line per group
        elif group:
            filtered_groups.append(group[0])
    
    return filtered_groups

def process_txt_file(input_filename, output_filename, threshold=3):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file.readlines() if line.strip()]
    
    groups = group_similar_lines(lines, threshold)
    pattern_start = find_pattern_start(groups)
    
    if pattern_start is not None:
        groups = groups[pattern_start:]
    
    filtered_groups = filter_and_simplify_groups(groups, threshold)
    
    # Write filtered output to a new file
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        for line in filtered_groups:
            output_file.write(line + '\n')


In [None]:
process_txt_file("data/fox_f.txt", "data/patterns/fox_pattern.txt")

# PDF File

In [None]:
def txt_to_pdf_with_thumbnail_and_title(txt_filename, pdf_filename, video_url):
    """Extract video info (thumbnail & title)"""
    ydl_opts = {}
    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)
        thumbnail_url = info_dict.get('thumbnail', None)
        video_title = info_dict.get('title', 'Untitled Video')

    img_path = None
    if thumbnail_url:
        response = requests.get(thumbnail_url)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img_path = "thumbnail.jpg"  
            img.save(img_path, "JPEG")
        else:
            print("Failed to download thumbnail.")
    
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    pdf.add_font('Roboto', '', 'Roboto/static/Roboto-SemiBold.ttf', uni=True)
    pdf.set_font('Roboto', size=10)
    
    pdf.add_page()

    if img_path:
        page_width = pdf.w 
        img_width = 100  
        x_centered = (page_width - img_width) / 2 

        pdf.image(img_path, x=x_centered, y=10, w=img_width) 
        pdf.ln(60)  
    #video_title = video_title.encode('utf-8').decode('utf-8')
    #pdf.set_font('Roboto', size=14)
    #pdf.cell(0, 10, video_title, ln=True, align='C')
    #pdf.ln(10)  

    pdf.set_font('Roboto', size=8)

    checkbox_size = 4
    x_offset = 5
    y_offset = pdf.get_y() + 5  
    line_height = 5
    page_height = 297
    margin_bottom = 15

    with open(txt_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if y_offset + line_height > page_height - margin_bottom:
                pdf.add_page()
                y_offset = 10  
            
            pdf.rect(x_offset, y_offset, checkbox_size, checkbox_size)
            pdf.set_xy(x_offset + checkbox_size + 2, y_offset)
            pdf.multi_cell(0, line_height, line.strip())
            y_offset += line_height

    pdf.output(pdf_filename)

