In [16]:
import os
import glob

from PIL import Image

In [2]:
# Captcha layout, in pixel coordinates, as consumed by the letter-scoring loop.
# Top-left corner of the first letter's bounding box.
start_x = 5
start_y = 11
# Horizontal gap between two adjacent letter boxes.
spacing = 1
# Width and height of a single letter box.
letter_width = 8
letter_height = 10
# Number of letter boxes scored per image.
letter_cnt = 5

In [43]:
def _first_primes(count: int) -> list:
    """Return the first ``count`` prime numbers in ascending order."""
    primes = []
    candidate = 2
    while len(primes) < count:
        # Trial division by the primes found so far is plenty at this scale.
        if all(candidate % p for p in primes):
            primes.append(candidate)
        candidate += 1
    return primes


def calculate_letter_score(img: "Image.Image", x1: int, y1: int, x2: int, y2: int) -> int:
    """Compute an integer fingerprint for the pixel box [x1, x2) x [y1, y2).

    Each pixel value is multiplied by a prime weight chosen by its column
    offset inside the box (2 for the first column, 3 for the second, ...).
    The weighted sum of every row is then multiplied by ``11**y``, where ``y``
    is the row's absolute coordinate in the image, and all rows are added up.
    Because the absolute ``y`` is used, the same glyph at a different vertical
    position yields a different score.

    Args:
        img: Any object exposing ``getpixel((x, y))`` that returns a number.
        x1: Left edge of the box (inclusive).
        y1: Top edge of the box (inclusive).
        x2: Right edge of the box (exclusive).
        y2: Bottom edge of the box (exclusive).

    Returns:
        The fingerprint of the box; 0 when the box is empty.
    """
    # Generate exactly as many column weights as the box is wide, so boxes
    # wider than eight pixels no longer run off the end of a fixed table.
    column_weights = _first_primes(max(x2 - x1, 0))

    score = 0
    for y in range(y1, y2):
        row_sum = sum(
            img.getpixel((x, y)) * weight
            for x, weight in zip(range(x1, x2), column_weights)
        )
        score += row_sum * 11**y
    return score

In [19]:
def get_output_file_content(file_name: str) -> str:
    """Read a text file and return its content without surrounding whitespace.

    Args:
        file_name: Path of the file to read.

    Returns:
        The whole file content with leading and trailing whitespace removed.
    """
    with open(file_name, mode="r") as solution_file:
        raw_text = solution_file.read()
    return raw_text.strip()

In [44]:
def calculate_line_stats(image_path: str, output_path: str) -> dict:
    """Build a letter -> score table from labelled captcha images.

    Every image matching ``image_path`` is paired with a solution text file in
    ``output_path`` that has the same base name, with "input" replaced by
    "output" and a ``.txt`` extension. The image is converted to grayscale,
    thresholded, and each of the ``letter_cnt`` letter boxes is scored with
    ``calculate_letter_score``. The first score seen for a letter is stored;
    a later, different score for the same letter is reported on stdout and the
    stored value is kept.

    Args:
        image_path: Glob pattern selecting the captcha images.
        output_path: Directory containing the solution text files.

    Returns:
        Dict mapping each solution character to the first score observed for it.
    """
    score_table = {}

    for img_file_path in sorted(glob.glob(image_path)):
        jpg_file_name = os.path.basename(img_file_path)
        root_file_name, _ = os.path.splitext(jpg_file_name)
        output_file = os.path.join(output_path, f"{root_file_name}.txt".replace("input", "output"))

        try:
            output_text = get_output_file_content(output_file)
        except (OSError, UnicodeDecodeError):
            # No readable solution for this image: skip it. Anything else
            # (KeyboardInterrupt, programming errors) is allowed to surface.
            continue

        if len(output_text) < letter_cnt:
            # Indexing a short solution below would raise IndexError.
            print(f"skipping {img_file_path}: solution {output_text!r} has fewer than {letter_cnt} characters")
            continue

        # convert() loads the pixel data into a new image, so the underlying
        # file can be closed as soon as the grayscale copy exists.
        with Image.open(img_file_path) as raw_img:
            img = raw_img.convert('L')
        # Apply a threshold to convert the image to binary
        threshold = 60
        img = img.point(lambda p: p > threshold)

        for i in range(letter_cnt):
            # Left edge of the i-th letter box; the box is letter_width wide.
            x1 = start_x + i * (letter_width + spacing)
            score = calculate_letter_score(img, x1, start_y, x1 + letter_width, start_y + letter_height)
            solution_letter = output_text[i]
            if solution_letter not in score_table:
                print(score)
                score_table[solution_letter] = score
            elif score_table[solution_letter] != score:
                print("scores don't match...")
                print(f"old score: {score_table[solution_letter]}")
                print(f"new score: {score}")
        print(output_text)
    return score_table
        
# Build the letter -> score lookup from the labelled sample captchas and show it.
score_table = calculate_line_stats(
    image_path="sampleCaptchas/input/*.jpg",
    output_path="sampleCaptchas/output/",
)
score_table

17625899469123210706848
18383990523184784126915
43661467064742537274946
27084881711340547886880
34757150346768713896989
EGYK4
26680850994944247235937
18498956091371910939048
28384232741340936459703
30190891455050210039639
GRC35
30188208218048391900037
30188127055152141518256
25329878814827491115112
18096974570128686355849
6O5W1
39351426763227391830204
4613841854003088214530
53260894319477455608789
J627C
43551287103958226465895
17626049865178828212667
18096974851160681907684
VLI2C
17263341089396700500015
O1R7Q
43661474652606417174491
26622703831098726513855
27007854359823311888232
OYTAD
17624149399861213282502
26639989377891308108712
30188127044310298035038
ZRMQU
25760514696146198798335
30200880193813367500759
16580158695482559583472
N9DQS
ZGJS3
27008132336701494820644
GZMBA
J14DM
53281563160534943388286
PQ9AE
VWZDO
WGST7
26661261708197290581132
XKMS2
1D2KB
42436625011756130727132
26640749417368155386724
20BHQ
OAH0V
30188286929832080062717
5I8VE
Z97ME
HCE91
WELXV
53281649195127969452892

{'E': 17625899469123210706848,
 'G': 18383990523184784126915,
 'Y': 43661467064742537274946,
 'K': 27084881711340547886880,
 '4': 34757150346768713896989,
 'R': 26680850994944247235937,
 'C': 18498956091371910939048,
 '3': 28384232741340936459703,
 '5': 30190891455050210039639,
 '6': 30188208218048391900037,
 'O': 30188127055152141518256,
 'W': 25329878814827491115112,
 '1': 18096974570128686355849,
 'J': 39351426763227391830204,
 '2': 4613841854003088214530,
 '7': 53260894319477455608789,
 'V': 43551287103958226465895,
 'L': 17626049865178828212667,
 'I': 18096974851160681907684,
 'Q': 17263341089396700500015,
 'T': 43661474652606417174491,
 'A': 26622703831098726513855,
 'D': 27007854359823311888232,
 'Z': 17624149399861213282502,
 'M': 26639989377891308108712,
 'U': 30188127044310298035038,
 'N': 25760514696146198798335,
 '9': 30200880193813367500759,
 'S': 16580158695482559583472,
 'B': 27008132336701494820644,
 'P': 53281563160534943388286,
 'X': 26661261708197290581132,
 '0': 424

In [47]:
# Invert the lookup so that a computed score maps back to its letter.
{score: letter for letter, score in score_table.items()}

{17625899469123210706848: 'E',
 18383990523184784126915: 'G',
 43661467064742537274946: 'Y',
 27084881711340547886880: 'K',
 34757150346768713896989: '4',
 26680850994944247235937: 'R',
 18498956091371910939048: 'C',
 28384232741340936459703: '3',
 30190891455050210039639: '5',
 30188208218048391900037: '6',
 30188127055152141518256: 'O',
 25329878814827491115112: 'W',
 18096974570128686355849: '1',
 39351426763227391830204: 'J',
 4613841854003088214530: '2',
 53260894319477455608789: '7',
 43551287103958226465895: 'V',
 17626049865178828212667: 'L',
 18096974851160681907684: 'I',
 17263341089396700500015: 'Q',
 43661474652606417174491: 'T',
 26622703831098726513855: 'A',
 27007854359823311888232: 'D',
 17624149399861213282502: 'Z',
 26639989377891308108712: 'M',
 30188127044310298035038: 'U',
 25760514696146198798335: 'N',
 30200880193813367500759: '9',
 16580158695482559583472: 'S',
 27008132336701494820644: 'B',
 53281563160534943388286: 'P',
 26661261708197290581132: 'X',
 42436625