In [14]:
import os
import re
import shutil
import tempfile

import cv2
import easyocr
import numpy as np
from PIL import Image

In [1]:
def compare_files_by_prefix(file1, file2):
    """
    Compute the exact-match accuracy between two label files.

    Every non-blank line is keyed by its prefix: the text before the first
    comma (the name of the .png frame the line belongs to). Only prefixes
    present in both files are scored. A pair counts as correct when the two
    full lines are equal after stripping surrounding whitespace and
    lowercasing. If a prefix occurs several times in one file, the last
    occurrence wins.

    :param file1: Path to the first file.
    :param file2: Path to the second file.
    :return: Percentage (0-100) of shared prefixes whose lines match, or 0
        when the files share no prefix. The value is also printed.
    """
    lines_by_file = []
    for path in (file1, file2):
        lines = {}
        with open(path, 'r', encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    # Blank lines would all share the empty prefix and be
                    # scored as one bogus, always-matching frame.
                    continue
                prefix = text.split(',')[0].strip()  # PNG filename
                lines[prefix] = text.lower()  # case-insensitive comparison
        lines_by_file.append(lines)
    lines_dict1, lines_dict2 = lines_by_file

    shared_prefixes = [prefix for prefix in lines_dict1 if prefix in lines_dict2]
    frames_num = len(shared_prefixes)
    errors = sum(
        1 for prefix in shared_prefixes
        if lines_dict1[prefix] != lines_dict2[prefix]
    )

    accuracy = (frames_num - errors) / frames_num * 100 if frames_num > 0 else 0
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy


# EasyOCR plain accuracy

In [2]:
# Baseline: exact-match accuracy between labels.txt and labels_fixed.txt, lines paired by PNG filename.
compare_files_by_prefix('data/test/labels.txt','data/test/labels_fixed.txt')

Accuracy: 56.96%


56.957928802588995

___________

# Image pre-processing

In [4]:
def extract_text_from_frames(input_folder, text_output_file, lang='en'):
    """
    Run EasyOCR on every PNG in a folder and write one result line per image.

    Each output line has the form ``<filename>, <text>``, where ``<text>`` is
    the recognized fragments joined with single spaces, or the literal
    ``No text detected`` when OCR returns nothing. Images that OpenCV cannot
    load are reported on stdout and skipped.

    :param input_folder: Folder containing input images (PNG).
    :param text_output_file: File to store extracted text (overwritten).
    :param lang: Language for OCR (default: 'en').
    """
    reader = easyocr.Reader([lang])

    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the same output file.
    png_names = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith('.png')
    )

    with open(text_output_file, 'w', encoding='utf-8') as file:
        for filename in png_names:
            image_path = os.path.join(input_folder, filename)
            image = cv2.imread(image_path)

            if image is None:
                print(f"Error loading image: {filename}")
                continue

            # detail=0 makes readtext return plain strings only
            results = reader.readtext(image, detail=0)

            extracted_text = "No text detected" if not results else " ".join(results)

            file.write(f"{filename}, {extracted_text}\n")


### normalized

In [None]:

# Min-max normalize every PNG in `input_folder` and save the result, under the
# same file name, into its "normalized" subfolder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_folder = r"C:\Users\julek\Downloads\test"
output_folder = os.path.join(input_folder, "normalized")

os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Case-insensitive extension check, consistent with the other
    # pre-processing cells (".PNG" files were previously ignored here).
    if not filename.lower().endswith(".png"):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    img = cv2.imread(input_path, cv2.IMREAD_UNCHANGED)
    if img is None:
        print(f"Skipping {filename}, unable to read image.")
        continue

    # Rescale pixel values to the 0-255 range; norm_img is passed as the
    # dst argument of cv2.normalize.
    norm_img = np.zeros_like(img, dtype=np.float32)
    img = cv2.normalize(img, norm_img, 0, 255, cv2.NORM_MINMAX)
    img = np.uint8(img)

    cv2.imwrite(output_path, img)

In [6]:
# Exact-match accuracy of normalized.txt against labels_fixed.txt (presumably the OCR of the normalized frames — confirm).
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/normalized.txt')

Accuracy: 48.92%


48.92008639308855

### image scaling

In [None]:
# Source folder of PNG frames and the "scaled" subfolder that receives the results.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_folder = r"C:\Users\julek\Downloads\test" 
output_folder = os.path.join(input_folder, "scaled")

# Create the output subfolder; no error if it already exists.
os.makedirs(output_folder, exist_ok=True)

def process_image_scale(file_path):
    """
    Re-save an image as a temporary PNG, downscaling it when its DPI is below 300.

    Images whose DPI metadata is below 300 (72 is assumed when the metadata is
    missing) are shrunk so that their width is at most 1024 px — they are
    never enlarged — and saved with a DPI tag of 300. Images at 300 DPI or
    more are saved at their original size, with their horizontal DPI used for
    both axes.

    :param file_path: Path to the source image.
    :return: Path of the temporary ``.png`` file holding the result. The
        caller is responsible for moving or deleting it.
    """
    # Context managers close both the source image and the temp-file handle;
    # previously they were left for the garbage collector.
    with Image.open(file_path) as im:
        dpi = im.info.get("dpi", (72, 72))[0]

        if dpi < 300:
            length_x, width_y = im.size
            # factor <= 1: only ever shrink, capping the width at 1024 px
            factor = min(1, float(1024.0 / length_x))
            size = int(factor * length_x), int(factor * width_y)
            result = im.resize(size, Image.LANCZOS)
            save_dpi = (300, 300)
        else:
            # DPI is already 300 or more: keep the pixels and the DPI
            result = im
            save_dpi = (dpi, dpi)

        # Reserve a unique temp file name, then close the handle so the image
        # can be written to that path by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            temp_filename = temp_file.name
        result.save(temp_filename, dpi=save_dpi)

    return temp_filename


In [None]:

# Write a scaled copy of every PNG in `input_folder` into `output_folder`.
for filename in os.listdir(input_folder):
    # Case-insensitive extension check, consistent with the other cells
    if not filename.lower().endswith(".png"):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    temp_filename = process_image_scale(input_path)

    if temp_filename:
        # shutil.move falls back to copy + delete when the temp directory is
        # on a different filesystem/drive than the output folder, where
        # os.replace would raise OSError.
        shutil.move(temp_filename, output_path)


In [7]:
# Exact-match accuracy of scaled.txt against labels_fixed.txt (presumably the OCR of the scaled frames — confirm).
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/scaled.txt')

Accuracy: 48.70%


48.704103671706264

### noise removal

In [None]:
def remove_noise(image):
    """Denoise a colour image with OpenCV's non-local-means filter and return the result."""
    denoised = cv2.fastNlMeansDenoisingColored(
        image,
        None,
        h=10,
        hColor=10,
        templateWindowSize=7,
        searchWindowSize=15,
    )
    return denoised

def process_images(input_folder):
    """
    Denoise every PNG in ``input_folder`` and save the results, under the same
    file names, to its ``without_noise`` subfolder (created if missing).

    Images that OpenCV cannot load are reported on stdout and skipped.

    :param input_folder: Folder containing the input PNG images.
    """
    output_folder = os.path.join(input_folder, 'without_noise')
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.lower().endswith('.png'):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        image = cv2.imread(input_path)
        if image is None:
            # Report the frame instead of dropping it silently
            print(f"Skipping {filename}, unable to read image.")
            continue

        cleaned_image = remove_noise(image)
        cv2.imwrite(output_path, cleaned_image)

# Run the noise-removal pass defined in this cell over the local frame folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_directory = r"C:\Users\julek\Downloads\test"  
process_images(input_directory)


In [5]:
# OCR the PNGs in the without_noise folder and write one "<filename>, <text>" line per image to without_noise.txt.
extract_text_from_frames('data/test/test_Jula/test/without_noise', 'data/test/test_Jula/test/txt/without_noise.txt')

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [8]:
# Exact-match accuracy of the denoised-frame OCR output (without_noise.txt) against labels_fixed.txt.
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/without_noise.txt')

Accuracy: 46.65%


46.652267818574515

### gray scale

In [None]:
def get_grayscale(image):
    """Convert a BGR image to grayscale with OpenCV and return the result."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

def process_images(input_folder):
    """
    Convert every PNG in ``input_folder`` to grayscale and save the results,
    under the same file names, to its ``gray`` subfolder (created if missing).

    Images that OpenCV cannot load are reported on stdout and skipped.

    :param input_folder: Folder containing the input PNG images.
    """
    output_folder = os.path.join(input_folder, 'gray')
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.lower().endswith('.png'):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        image = cv2.imread(input_path)
        if image is None:
            # Report the frame instead of dropping it silently
            print(f"Skipping {filename}, unable to read image.")
            continue

        grayscale_image = get_grayscale(image)
        cv2.imwrite(output_path, grayscale_image)

# Run the grayscale pass defined in this cell over the local frame folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_directory = r"C:\Users\julek\Downloads\test" 
process_images(input_directory)

In [9]:
# Exact-match accuracy of gray.txt against labels_fixed.txt (presumably the OCR of the grayscale frames — confirm).
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/gray.txt')

Accuracy: 49.35%


49.35205183585313

### threshold

In [None]:
def get_grayscale(image):
    """Return the grayscale version of a BGR image (OpenCV colour conversion)."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

def thresholding(image):
    """Binarize an image to 0/255 using OpenCV's binary threshold with the Otsu flag."""
    # cv2.threshold returns (threshold_used, thresholded_image); only the image is needed
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

def process_images(input_folder):
    """
    Binarize every PNG in ``input_folder`` (grayscale conversion followed by
    Otsu thresholding) and save the results, under the same file names, to
    its ``binary`` subfolder (created if missing).

    Images that OpenCV cannot load are reported on stdout and skipped.

    :param input_folder: Folder containing the input PNG images.
    """
    output_folder = os.path.join(input_folder, 'binary')
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.lower().endswith('.png'):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        image = cv2.imread(input_path)
        if image is None:
            # Report the frame instead of dropping it silently
            print(f"Skipping {filename}, unable to read image.")
            continue

        grayscale_image = get_grayscale(image)
        thresholded_image = thresholding(grayscale_image)
        cv2.imwrite(output_path, thresholded_image)

# Run the thresholding pass defined in this cell over the local frame folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_directory = r"C:\Users\julek\Downloads\test" 
process_images(input_directory)

In [10]:
# Exact-match accuracy of binary.txt against labels_fixed.txt (presumably the OCR of the thresholded frames — confirm).
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/binary.txt')

Accuracy: 32.18%


32.18142548596112

## All together

In [None]:

def remove_noise(image):
    """Return a denoised copy of a colour image (OpenCV non-local-means filter)."""
    return cv2.fastNlMeansDenoisingColored(
        image, None, h=10, hColor=10, templateWindowSize=7, searchWindowSize=15
    )

def get_grayscale(image):
    """Convert a BGR image to grayscale."""
    converted = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return converted

def thresholding(image):
    """Return the 0/255 binarization of an image, thresholded with the Otsu flag."""
    result = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # result is (threshold_used, thresholded_image)
    thresholded = result[1]
    return thresholded

def set_image_dpi(file_path):
    """
    Return the path of a 300-DPI-tagged version of the image.

    When the image's DPI metadata is below 300 (or missing), the image is
    shrunk so that its width is at most 1024 px — never enlarged — and saved
    with a DPI tag of 300 to a new temporary ``.png`` file, whose path is
    returned. Otherwise the original ``file_path`` is returned unchanged.

    :param file_path: Path to the source image.
    :return: Path of a temporary file (the caller should delete it) or
        ``file_path`` itself when no rescaling was needed.
    """
    # The context manager closes the source image; it was previously leaked.
    with Image.open(file_path) as im:
        if im.info.get("dpi", (0, 0))[0] < 300:
            length_x, width_y = im.size
            # factor <= 1: only ever shrink, capping the width at 1024 px
            factor = min(1, float(1024.0 / length_x))
            size = int(factor * length_x), int(factor * width_y)
            im_resized = im.resize(size, Image.LANCZOS)

            # Reserve a unique temp file name, then close the handle so the
            # image can be written to that path by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
                temp_filename = temp_file.name
            im_resized.save(temp_filename, dpi=(300, 300))
            return temp_filename
    return file_path

def normalize_image(image):
    """Min-max normalize an image to the 0-255 range with OpenCV and return the result."""
    height = image.shape[0]
    width = image.shape[1]
    # Scratch array handed to cv2.normalize as its dst argument
    scratch = np.zeros((height, width))
    normalized = cv2.normalize(image, scratch, 0, 255, cv2.NORM_MINMAX)
    return normalized

def process_images(input_folder):
    """
    Apply the full pre-processing chain to every PNG in ``input_folder``.

    Each image goes through DPI-based rescaling, min-max normalization, noise
    removal, grayscale conversion and Otsu thresholding, and is saved under
    the same file name in the ``all_combined`` subfolder (created if
    missing). Images that OpenCV cannot load are reported on stdout and
    skipped.

    :param input_folder: Folder containing the input PNG images.
    """
    output_folder = os.path.join(input_folder, 'all_combined')
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.lower().endswith('.png'):
            continue

        input_path = os.path.join(input_folder, filename)
        temp_path = set_image_dpi(input_path)
        try:
            image = cv2.imread(temp_path)
        finally:
            # set_image_dpi returns the original path when no rescaling was
            # needed; only remove the temporary copies it created.
            if temp_path != input_path:
                os.remove(temp_path)

        if image is None:
            # Report the frame instead of dropping it silently
            print(f"Skipping {filename}, unable to read image.")
            continue

        image = normalize_image(image)
        image = remove_noise(image)
        image = get_grayscale(image)
        image = thresholding(image)

        output_path = os.path.join(output_folder, filename)
        cv2.imwrite(output_path, image)

# Run the combined pre-processing chain defined in this cell over the local frame folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_directory = r"C:\Users\julek\Downloads\test" 
process_images(input_directory)

In [11]:
# Exact-match accuracy of all_combined.txt against labels_fixed.txt (presumably the OCR of the fully pre-processed frames — confirm).
compare_files_by_prefix('data/test/labels_fixed.txt','data/test/test_Jula/test/txt/all_combined.txt')

Accuracy: 33.05%


33.04535637149028

# Regular Expression

In [16]:
def filter_file(file_path, output_file_path):
    """
    Fix common OCR mistakes that can be handled by regular expressions and
    save the result to a new file.

    Every input line is stripped and lowercased, run through an ordered list
    of substitutions (the order matters: later rules see the output of earlier
    ones), and finally has the spacing around commas, parentheses and ``*``
    normalized.

    :param file_path: Path of the text file to clean (UTF-8).
    :param output_file_path: Path of the file that receives the cleaned
        lines, one per line (overwritten).
    :return: List of the cleaned lines, without trailing newlines.
    """
    # Ordered (pattern, replacement) rules. Replacements may be callables.
    rules = [
        # --- Miscellaneous ---
        (r'\s+:', ':'),            # drop whitespace before ':'
        (r'\boec\b', 'dec'),       # oec -> dec
        (r'\bist\b', '1st'),       # ist -> 1st
        (r'\bznd\b', '2nd'),       # znd -> 2nd
        (r'\(6rj:', '(6r):'),      # (6rj: -> (6r):
        (r'\(3rj:', '(3r):'),      # (3rj: -> (3r):
        (r'\bssc\b', '5sc'),       # ssc -> 5sc
        (r'\bisc\b', '1sc'),       # isc -> 1sc (turned into 'sc' by the next rule)
        (r'\b1\s?sc\b', 'sc'),     # 1sc / 1 sc -> sc
        (r'\b1\s?inc\b', 'inc'),   # 1inc / 1 inc -> inc
        (r'\bxo\b', 'x6'),         # xo -> x6
        # Join a row label whose digits got separated, e.g. 'r 1:' or 'r 1 0:'
        (r'(?<!\S)r\s*(\d+(?:\s*\d+)*)\s*(?=[:\s])',
         lambda m: 'r' + ''.join(m.group(1).split())),

        # --- Whole words, no colon required ---
        (r'\bchi\b', 'ch1'),       # chi -> ch1
        (r'\bcht\b', 'ch1'),       # cht -> ch1
        (r'\brz\b', 'r2'),         # rz -> r2
        (r'\bry\b', 'r4'),         # ry -> r4
        (r'\brs\b', 'r5'),         # rs -> r5
        (r'\bir5t\b', 'irst'),     # ir5t -> irst
        (r'\br1o\b', 'r10'),       # r1o -> r10
        (r'\bris\b', 'r15'),       # ris -> r15
        (r'\br2o\b', 'r20'),       # r2o -> r20
        (r'\br2z\b', 'r22'),       # r2z -> r22
        (r'\brs1\b', 'r51'),       # rs1 -> r51
        (r'\brss\b', 'r55'),       # rss -> r55
        (r'\bjsc\b', '3sc'),       # jsc -> 3sc
        (r'\bró\b', 'r6'),         # ró -> r6
        (r'\brio\b', 'r10'),       # rio -> r10
        # ri<d> -> r1<d> for a non-zero digit; 'ri0' becomes 'r0'
        (r'\bri(\d)\b',
         lambda m: 'r' + ('1' if int(m.group(1)) != 0 else '') + m.group(1)),
    ]

    # --- Row labels followed by a colon, e.g. 'ri:' -> 'r1:' ---
    # Each label must not be preceded by a word character and the colon must
    # be followed by whitespace, '(', '[', ',' or the end of the line.
    colon_label_fixes = [
        ('ri', 'r1'), ('rz', 'r2'), ('ry', 'r4'), ('rs', 'r5'), ('ro', 'r6'),
        ('rg', 'r9'), ('rio', 'r10'), ('rii', 'r11'), ('r1i', 'r11'),
        ('r1z', 'r13'), ('ris', 'r15'), ('r1s', 'r15'), ('r1g', 'r19'),
        ('r2i', 'r21'), ('r2s', 'r25'), ('r2g', 'r29'), ('r5y', 'r54'),
        ('rs1', 'r51'), ('ry0', 'r40'),
    ]
    rules += [
        (r'(?<!\w)' + wrong + r':(?=\s|\(|\[|,|$)', right + ':')
        for wrong, right in colon_label_fixes
    ]

    compiled_rules = [(re.compile(pattern), repl) for pattern, repl in rules]

    filtered_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip().lower()

            for pattern, repl in compiled_rules:
                line = pattern.sub(repl, line)

            # Final whitespace clean-up
            line = re.sub(r'\s*,\s*', ', ', line)           # spacing around commas
            line = re.sub(r'\s*\(\s*', ' (', line)          # spacing around '('
            line = re.sub(r'\s*\)\s*', ') ', line).strip()  # spacing around ')'
            line = re.sub(r'\s+', ' ', line)                # collapse repeated whitespace
            line = re.sub(r'\b\s*\*\s*\b', 'x', line)       # standalone '*' between words -> 'x'

            filtered_lines.append(line)

    # Write only after the input has been fully read and closed
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in filtered_lines:
            output_file.write(line + '\n')
    return filtered_lines

In [17]:
# Apply the regex clean-up to labels.txt and write the result to labels2.txt; the returned list is displayed below.
filter_file('data/test/labels.txt', 'data/test/labels2.txt')

['fox1_shot_0.png, g80',
 'fox1_shot_11.png, first leg only: close with sl st',
 'fox1_shot_25.png, crochet 2nd',
 'fox1_shot_26.png, the',
 'fox1_shot_27.png, same as the 1st',
 "fox1_shot_28.png, (don't close with sl st or cut the",
 'fox1_shot_29.png, yarn',
 'fox1_shot_52.png, crochet 2nd',
 'fox1_shot_53.png, the',
 'fox1_shot_54.png, same as the 1st',
 "fox1_shot_55.png, (don't close with sl st or",
 'fox1_shot_56.png, the',
 'fox1_shot_57.png, yarn,',
 'fox1_shot_58.png, cut',
 'fox1_shot_74.png, ch 3 after r7, join to 1st',
 'fox1_shot_75.png, with sc (mark this st',
 "fox1_shot_76.png, it's the 1st st of ri of belly)",
 'fox1_shot_77.png, leg',
 'fox1_shot_95.png, ch 3 after r7, join to 1st',
 'fox1_shot_96.png, with sc (mark this st',
 "fox1_shot_97.png, it's the 1st st of r1 of belly)",
 'fox1_shot_98.png, leg',
 'fox1_shot_114.png, ch 3 after r7, join to 1st',
 'fox1_shot_115.png, with sc (mark this st',
 "fox1_shot_116.png, it's the 1st st of r1 of belly)",
 'fox1_shot_117

In [19]:
# Exact-match score between labels.txt and its regex-filtered version labels2.txt.
# NOTE(review): labels2.txt was generated from labels.txt, so this measures the share of lines the filter
# left unchanged (ignoring case); comparing labels2.txt with labels_fixed.txt may have been intended — confirm.
compare_files_by_prefix('data/test/labels.txt','data/test/labels2.txt')

Accuracy: 79.94%


79.93527508090615

____________

## Another approach to accuracy

In [20]:
from Levenshtein import ratio


In [21]:

def compare_files_by_prefix2(file1, file2):
    """
    Score two label files by the average Levenshtein ratio of paired lines.

    Every non-blank line is keyed by its prefix (the text before the first
    comma). For each prefix present in both files, the similarity of the two
    stripped, lowercased lines is computed with ``ratio``; the result is the
    mean similarity expressed as a percentage.

    :param file1: Path to the first file.
    :param file2: Path to the second file.
    :return: Mean similarity in percent, or 0 when the files share no prefix.
        The value is also printed.
    """
    lines_by_file = []
    for path in (file1, file2):
        lines = {}
        with open(path, 'r', encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    # Blank lines would all share the empty prefix and be
                    # scored as one bogus frame.
                    continue
                lines[text.split(',')[0].strip()] = text.lower()
        lines_by_file.append(lines)
    lines_dict1, lines_dict2 = lines_by_file

    frames_num = 0
    total_similarity = 0
    for prefix, text1 in lines_dict1.items():
        if prefix not in lines_dict2:
            continue
        frames_num += 1
        total_similarity += ratio(text1, lines_dict2[prefix])

    accuracy = (total_similarity / frames_num) * 100 if frames_num > 0 else 0
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

In [22]:
# Average Levenshtein-ratio similarity between labels.txt and labels_fixed.txt.
compare_files_by_prefix2('data/test/labels.txt','data/test/labels_fixed.txt')

Accuracy: 98.17%


98.16646335087569

In [23]:
# Average Levenshtein-ratio similarity between labels.txt and its regex-filtered version labels2.txt.
compare_files_by_prefix2('data/test/labels.txt','data/test/labels2.txt')

Accuracy: 99.45%


99.4453991432068

In [24]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard index of the two texts' whitespace-separated word sets (0 if both are empty)."""
    words1 = set(text1.split())
    words2 = set(text2.split())

    all_words = words1.union(words2)
    if len(all_words) == 0:
        return 0

    common_words = words1.intersection(words2)
    return len(common_words) / len(all_words)

def compare_files_by_prefix_jac(file1, file2):
    """
    Score two label files by the average Jaccard word similarity of paired lines.

    Every non-blank line is keyed by its prefix (the text before the first
    comma). For each prefix present in both files, the stripped, lowercased
    lines are compared with ``jaccard_similarity``; the result is the mean
    similarity expressed as a percentage.

    :param file1: Path to the first file.
    :param file2: Path to the second file.
    :return: Mean similarity in percent, or 0 when the files share no prefix.
        The value is also printed.
    """
    lines_by_file = []
    for path in (file1, file2):
        lines = {}
        with open(path, 'r', encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    # Blank lines would all share the empty prefix and be
                    # scored as one bogus frame.
                    continue
                lines[text.split(',')[0].strip()] = text.lower()
        lines_by_file.append(lines)
    lines_dict1, lines_dict2 = lines_by_file

    frames_num = 0
    total_similarity = 0
    for prefix, text1 in lines_dict1.items():
        if prefix not in lines_dict2:
            continue
        frames_num += 1
        total_similarity += jaccard_similarity(text1, lines_dict2[prefix])

    accuracy = (total_similarity / frames_num) * 100 if frames_num > 0 else 0
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

In [25]:
# Average Jaccard word similarity between labels.txt and labels_fixed.txt.
compare_files_by_prefix_jac('data/test/labels.txt','data/test/labels_fixed.txt')

Accuracy: 79.39%


79.3930158897796

In [26]:
# Average Jaccard word similarity between labels.txt and its regex-filtered version labels2.txt.
compare_files_by_prefix_jac('data/test/labels.txt','data/test/labels2.txt')

Accuracy: 92.64%


92.64083511513543

## Combined

In [29]:
def compare_files_by_prefix_both(file1, file2):
    """
    Score two label files with both Levenshtein and Jaccard similarity.

    Every non-blank line is keyed by its prefix (the text before the first
    comma). For each prefix present in both files, the stripped, lowercased
    lines are compared with ``ratio`` and ``jaccard_similarity``. The two
    mean similarities (in percent) and their average are printed.

    :param file1: Path to the first file.
    :param file2: Path to the second file.
    :return: Average of the mean Levenshtein and mean Jaccard percentages, or
        0 when the files share no prefix.
    """
    lines_by_file = []
    for path in (file1, file2):
        lines = {}
        with open(path, 'r', encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if not text:
                    # Blank lines would all share the empty prefix and be
                    # scored as one bogus frame.
                    continue
                lines[text.split(',')[0].strip()] = text.lower()
        lines_by_file.append(lines)
    lines_dict1, lines_dict2 = lines_by_file

    frames_num = 0
    total_levenshtein = 0
    total_jaccard = 0
    for prefix, text1 in lines_dict1.items():
        if prefix not in lines_dict2:
            continue
        frames_num += 1
        text2 = lines_dict2[prefix]
        total_levenshtein += ratio(text1, text2)
        total_jaccard += jaccard_similarity(text1, text2)

    # Defaults keep the report below valid when nothing could be compared;
    # previously the prints raised UnboundLocalError in that case.
    avg_levenshtein = 0
    avg_jaccard = 0
    final_accuracy = 0
    if frames_num > 0:
        avg_levenshtein = (total_levenshtein / frames_num) * 100
        avg_jaccard = (total_jaccard / frames_num) * 100
        final_accuracy = (avg_levenshtein + avg_jaccard) / 2  # mean of both scores

    print(f"Levenshtein Accuracy: {avg_levenshtein:.2f}%")
    print(f"Jaccard Accuracy: {avg_jaccard:.2f}%")
    print(f"Final OCR Accuracy: {final_accuracy:.2f}%")

    return final_accuracy

In [30]:
# Levenshtein, Jaccard and combined scores between labels.txt and labels_fixed.txt.
compare_files_by_prefix_both('data/test/labels.txt','data/test/labels_fixed.txt')

Levenshtein Accuracy: 98.17%
Jaccard Accuracy: 79.39%
Final OCR Accuracy: 88.78%


88.77973962032765

In [31]:
# Levenshtein, Jaccard and combined scores between labels.txt and its regex-filtered version labels2.txt.
compare_files_by_prefix_both('data/test/labels.txt','data/test/labels2.txt')

Levenshtein Accuracy: 99.45%
Jaccard Accuracy: 92.64%
Final OCR Accuracy: 96.04%


96.04311712917112