In [1]:
from difflib import SequenceMatcher

from dotenv import load_dotenv
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials
import os
from PIL import Image
import sys
import time

from rapidfuzz.distance.metrics_cpp import jaro_winkler_similarity

# load_dotenv() comes from python-dotenv; it is expected to copy settings from a
# local .env file into the process environment before they are read below.
load_dotenv()

# Either value is None when its environment variable is not set.
subscription_key = os.getenv('AZURE_SUBSCRIPTION_KEY')
endpoint = os.getenv('AZURE_ENDPOINT')

# Module-level client shared by the OCR code in this file.
cv_client = ComputerVisionClient(
    endpoint,
    CognitiveServicesCredentials(subscription_key),
)


def text_from_image(image_path, mode="Handwritten"):
    """Send an image file to the Read operation and return the recognized text.

    Args:
        image_path: Path of the image file; it is opened in binary mode and
            streamed to ``cv_client.read_in_stream``.
        mode: Recognition mode forwarded to ``read_in_stream``. Previously this
            argument was ignored and ``"Printed"`` was always sent.
            NOTE(review): confirm that the installed SDK version honours
            ``mode`` rather than silently discarding it.

    Returns:
        The text of every recognized line joined with single spaces, or
        ``None`` when the operation ends in any status other than succeeded.
    """
    with open(image_path, "rb") as image_file:
        read_response = cv_client.read_in_stream(
            image=image_file,
            mode=mode,
            raw=True,
        )

    # The operation id is the last path segment of the Operation-Location header.
    operation_id = read_response.headers['Operation-Location'].split('/')[-1]

    # Poll once per second until the status is neither 'notStarted' nor 'running'.
    while True:
        read_result = cv_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    if read_result.status != OperationStatusCodes.succeeded:
        # Kept as None (not an exception) so existing callers see the same value.
        return None

    return ' '.join(
        line.text
        for text_result in read_result.analyze_result.read_results
        for line in text_result.lines
    )
 
 
# Levenshtein distance recurrence:
#   D[i,j] = min(
#       D[i-1,j] + 1,            # deletion
#       D[i,j-1] + 1,            # insertion
#       D[i-1,j-1] + cost        # substitution, or no change when cost is 0
#   )
def multiple_character_quality_metrics(recognized, org):
    """Print character-level similarity metrics for recognized text vs. a reference.

    Both strings are stripped of surrounding whitespace first. The function
    then prints, in order: the Levenshtein distance, the value returned by
    ``jaro_winkler_similarity``, the ``SequenceMatcher`` ratio, the success
    rate (``1 - error``) and the error rate.

    Args:
        recognized: Text produced by recognition.
        org: Reference text to compare against.

    Returns:
        None. All results are written to standard output.
    """
    hypothesis = recognized.strip()
    reference = org.strip()
    ref_len = len(reference)

    # Rolling-row Levenshtein: `previous` holds the distances from the first
    # i-1 characters of `hypothesis` to every prefix of `reference`.
    previous = list(range(ref_len + 1))
    for i, hyp_char in enumerate(hypothesis, start=1):
        current = [i]
        for j, ref_char in enumerate(reference, start=1):
            cost = 0 if hyp_char == ref_char else 1
            current.append(
                min(
                    previous[j] + 1,         # deletion
                    current[j - 1] + 1,      # insertion
                    previous[j - 1] + cost,  # substitution or match
                )
            )
        previous = current
    lev_distance = previous[ref_len]

    # Error rate is the distance normalised by the reference length; an empty
    # reference is reported as a full error.
    if ref_len > 0:
        error = lev_distance / ref_len
    else:
        error = 1.0

    jaro = jaro_winkler_similarity(hypothesis, reference)
    sequence_ratio = SequenceMatcher(None, hypothesis, reference).ratio()

    print(f"Levenshtein distance: {lev_distance}")
    print(f"Jaro Winkler: {jaro}")
    print(f"Sequence Ratio: {sequence_ratio}")
    print(f"Rata de succes: {1 - error:.4f}")
    print(f"Rata de eroare: {error:.4f}")

def multiple_word_quality_metrics(recognized, org):
    """Print word-level similarity metrics for recognized text vs. a reference.

    Both strings are stripped and split on whitespace. The function then
    prints, in order: the word-level Levenshtein distance, the overlap
    coefficient and the Jaccard index of the two sets of distinct words, the
    success rate (``1 - error``) and the error rate.

    Args:
        recognized: Text produced by recognition.
        org: Reference text to compare against.

    Returns:
        None. All results are written to standard output.
    """
    hyp_words = recognized.strip().split()
    ref_words = org.strip().split()
    ref_count = len(ref_words)

    # Rolling-row Levenshtein over words: `previous` holds the distances from
    # the first i-1 recognized words to every prefix of the reference words.
    previous = list(range(ref_count + 1))
    for i, hyp_word in enumerate(hyp_words, start=1):
        current = [i]
        for j, ref_word in enumerate(ref_words, start=1):
            cost = 0 if hyp_word == ref_word else 1
            current.append(
                min(
                    previous[j] + 1,         # deletion
                    current[j - 1] + 1,      # insertion
                    previous[j - 1] + cost,  # substitution or match
                )
            )
        previous = current
    lev_distance = previous[ref_count]

    # Distance normalised by the number of reference words; an empty reference
    # is reported as a full error.
    if ref_count > 0:
        error = lev_distance / ref_count
    else:
        error = 1.0

    hyp_vocab = set(hyp_words)
    ref_vocab = set(ref_words)
    shared = len(hyp_vocab & ref_vocab)

    # Overlap coefficient: shared words over the smaller vocabulary
    # (1.0 when either vocabulary is empty).
    smaller = min(len(hyp_vocab), len(ref_vocab))
    overlap = shared / smaller if smaller > 0 else 1.0

    # Jaccard index: shared words over the combined vocabulary
    # (0.0 when both vocabularies are empty).
    combined = len(hyp_vocab | ref_vocab)
    jaccard = shared / combined if combined > 0 else 0.0

    print(f"Levenshtein distance: {lev_distance}")
    print(f"Overlap: {overlap}")
    print(f"Jaccard: {jaccard}")
    print(f"Rata de succes: {1 - error:.4f}")
    print(f"Rata de eroare: {error:.4f}")


# Reference sentence the recognized text is scored against.
original = "Succes în rezolvarea TEMELOR la LABORATOARELE de Inteligență Artificială!"

# Recognize the sample image, then print character- and word-level metrics.
recognized_text = text_from_image(image_path="test2.jpeg", mode="Printed")
multiple_character_quality_metrics(recognized=recognized_text, org=original)
multiple_word_quality_metrics(recognized=recognized_text, org=original)
    

Levenshtein distance: 14
Jaro Winkler: 0.7336414564126219
Sequence Ratio: 0.8163265306122449
Rata de succes: 0.8082
Rata de eroare: 0.1918
Levenshtein distance: 7
Overlap: 0.3333333333333333
Jaccard: 0.1875
Rata de succes: 0.2222
Rata de eroare: 0.7778
