In [26]:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials

import os
import time

from PIL import Image, ImageDraw, ImageFilter
import jiwer
import difflib
import Levenshtein

import numpy as np

'''
    Computes the Hamming distance between two strings.
    The Hamming distance is the number of positions at which the corresponding symbols are different.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Hamming distance between the two strings
    :rtype: int
'''
def HammingDistance(string1 : str, string2: str) -> int:
    if string1 == string2:
        return 0
    
    lenString1, lenString2 = len(string1), len(string2)
    
    # The strings need to be of same size -> -1 is error here
    if lenString1 != lenString2:
        return -1
     
    distance = 0
    
    for i in range(len(string1)):
        if string1[i] != string2[i]:
            distance+=1
    
    return distance

'''
    Computes the Jaro distance between two strings.
    The Jaro distance is a measure of similarity between two strings.
    The higher the Jaro distance for two strings is, the more similar the strings are.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Jaro distance between the two strings
    :rtype: float
'''
def JaroDistance(string1: str, string2: str) -> float:
    if string1 == string2:
        return 1.0
    
    lenStr1, lenStr2 = len(string1), len(string2)
    
    if lenStr1 == 0 or lenStr2 == 0:
        return 0.0

    # Distanta maxima pentru care se poate face match
    maxDist = (max(lenStr1, lenStr2) // 2) - 1
    
    # Numarul de match-uri
    matches = 0
    
    # Hash pentru match-uri
    hash_string1 = [0] * lenStr1
    hash_string2 = [0] * lenStr2
    
    # Parcurgem primul string
    for i in range(lenStr1):
        # Verificam daca exista match-uri in al doilea string
        for j in range(max(0, i - maxDist), min(lenStr2, i + maxDist + 1)):
            if string1[i] == string2[i] and hash_string2[j] == 0:
                # Update hash + matches
                hash_string1[i] = 1
                hash_string2[j] = 1
                matches += 1
                break 
            
    # Verificam daca am gasit vreun match
    if matches == 0:
        return 0.0
    
    # Numarul de transpozitii
    transpositions = 0
    point = 0
    
    # Numaram cate potriviri avem, pentru care mai exista un caracter intre ele care se potriveste
    for i in range(lenStr1):
        if hash_string1[i]:
            while hash_string2[point] == 0:
                point += 1
            
            if string1[i] != string2[point]:
                point += 1
                transpositions += 1
            else :
                point += 1
        
        transpositions /= 2
        
    # Jaro similarity
    return ((matches / lenStr1 + matches / lenStr2 + (matches - transpositions) / matches) / 3.0)

'''
    Computes the Jaro-Winkler similarity between two strings.
    The Jaro-Winkler similarity is a measure of similarity between two strings.
    The higher the Jaro-Winkler similarity for two strings is, the more similar the strings are.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Jaro-Winkler similarity between the two strings
    :rtype: float
'''
def Jaro_WinklerSimilarity(string1: str, string2: str) -> float:
    jaroDistance = JaroDistance(string1, string2)
    
    # Verificam daca similaritatea Jaro este peste un anumit procentaj
    if jaroDistance > 0.7:
        # Cautam lungimea celui mai lung prefix
        prefix = 0
        
        for i in range(min(len(string1), len(string2))):
            if string1[i] == string2[i]:
                prefix += 1
            else:
                break
        # Permitem doar un prefix de maximum 4 caractere
        prefix = min(4, prefix)
        jaroDistance += 0.1 * prefix * (1 - jaroDistance)
    return jaroDistance

'''
    Computes the Levenshtein distance between two strings.
    The Levenshtein distance is a measure of the similarity between two strings.
    The higher the Levenshtein distance for two strings is, the more different the strings are.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Levenshtein distance between the two strings
    :rtype: int
'''
def LevenshteinDistance(string1: str, string2: str) -> int:
    return Levenshtein.distance(string1, string2)

'''
    Computes the Jaro-Winkler similarity between two strings using the Levenshtein library.
    The Jaro-Winkler similarity is a measure of similarity between two strings.
    The higher the Jaro-Winkler similarity for two strings is, the more similar the strings are.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Jaro-Winkler similarity between the two strings
    :rtype: float

'''
def LevenshteinJaroWinkler(string1: str, string2: str) -> float:
    return Levenshtein.jaro_winkler(string1, string2)

'''
    Computes the Hamming distance between two strings using the Levenshtein library.
    The Hamming distance is the number of positions at which the corresponding symbols are different.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the Hamming distance between the two strings
    :rtype: int

'''
def LevenshteinHamming(string1: str, string2: str) -> int:
    return Levenshtein.hamming(string1, string2)

'''
    Computes the Word Error Rate (WER) between two strings.
    The Word Error Rate is a measure of the similarity between two strings.
    The lower the WER for two strings is, the more similar the strings are.
    
    :param reference: the first string
    :param hypothesis: the second string
    
    :return: the Word Error Rate between the two strings
    :rtype: float
'''
def computeWER(reference: str, hypothesis: str) -> float:
    return jiwer.wer(reference, hypothesis)

'''
    Computes the Character Error Rate (CER) between two strings.
    The Character Error Rate is a measure of the similarity between two strings.
    The lower the CER for two strings is, the more similar the strings are.
    
    :param reference: the first string
    :param hypothesis: the second string
    
    :return: the Character Error Rate between the two strings
    :rtype: float

'''
def computeCER(reference: str, hypothesis: str) -> float:
    return jiwer.cer(reference, hypothesis)

'''
    Computes the ratio of the Longest Common Subsequence (LCS) between two strings.
    The Longest Common Subsequence is a measure of the similarity between two strings.
    The higher the LCS for two strings is, the more similar the strings are.
    
    :param string1: the first string
    :param string2: the second string
    
    :return: the ratio of the Longest Common Subsequence between the two strings
    :rtype: float
'''
def LCS(string1: str, string2: str) -> float:
    sequence_matcher = difflib.SequenceMatcher(None, string1, string2)
    return sequence_matcher.ratio()

'''
    Authenticates the Computer Vision client using the key and endpoint from the environment variables.
    
    :return: the Computer Vision client
    :rtype: ComputerVisionClient

'''
def authenticate() -> ComputerVisionClient:
    key = os.environ["VISUAL_KEY"]
    endpoint = os.environ["VISUAL_ENDPOINT"]
    
    return ComputerVisionClient(endpoint=endpoint, credentials=CognitiveServicesCredentials(key))

'''
    Processes the image using the Computer Vision client and returns the extracted text.
    
    :param path: the path to the image
    :param computer_vision_client: the Computer Vision client
    
    :return: the extracted text
    :rtype: list

'''

def processImage(path: str, computer_vision_client: ComputerVisionClient) -> list:
    img = open(path, "rb")
    
    read_response = computer_vision_client.read_in_stream(
        image=img,
        mode="Printed",
        raw=True
    )
    
    operation_id = read_response.headers['Operation-Location'].split('/')[-1]
    
    while True:
        read_result = computer_vision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)
        
    result = []
    if read_result.status == OperationStatusCodes.succeeded:
        for text_result in read_result.analyze_result.read_results:
            for line in text_result.lines:
                result.append(line.text)
                
    return result

'''
    Computes the intersection over union between two lists of rectangles.
    The intersection over union is a measure of the similarity between two lists of rectangles.
    The higher the intersection over union for two lists of rectangles is, the more similar the lists are.
    
    :param rectangles: the first list of rectangles
    :param rectangles_truth: the second list of rectangles
    
    :return: the intersection over union between the two lists of rectangles
    :rtype: float
'''
def intersection_over_union(rectangles: list, rectangles_truth: list):
    iou = 0.0
    
    for i in range(len(rectangles)):
        x  =  max(rectangles[i][0], rectangles_truth[i][0])
        y  =  max(rectangles[i][1], rectangles_truth[i][1])
        xt =  min(rectangles[i][2], rectangles_truth[i][2])
        yt =  min(rectangles[i][3], rectangles_truth[i][3])
        
        interArea = max(0, xt - x + 1) * max(0, yt - y + 1)
        rectanglesArea = (rectangles[i][2] - rectangles[i][0] + 1) * (rectangles[i][3] - rectangles[i][1] + 1)
        rectangles_truthArea = (rectangles_truth[i][2] - rectangles_truth[i][0] + 1) * (rectangles_truth[i][3] - rectangles_truth[i][1] + 1)
        
        iou += interArea / float(rectanglesArea + rectangles_truthArea - interArea)
        
    return iou / len(rectangles)
        
'''
    Draws the bounding boxes around the words in the image using the Computer Vision client.
    
    :param path: the path to the image
    :param computer_vision_client: the Computer Vision client
'''
def drawBoundingBoxes(path: str, computer_vision_client: ComputerVisionClient):
    img = open(path, "rb")
    
    read_response = computer_vision_client.read_in_stream(
        image=img,
        mode="Printed",
        raw=True
    )
    
    operation_id = read_response.headers['Operation-Location'].split('/')[-1]
    
    while True:
        read_result = computer_vision_client.get_read_result(operation_id)
        if read_result.status not in ['notStarted', 'running']:
            break
        time.sleep(1)
    
    image_name = path.split("/")[-1]
    
    with Image.open(path) as image:
        draw = ImageDraw.Draw(image)
        
        rectangles = []
        
        for line in read_result.analyze_result.read_results[0].lines:
            for word in line.words:
                bounding_box = word.bounding_box
                
                x_values = bounding_box[::2]
                y_values = bounding_box[1::2]
                x_min = min(x_values)
                y_min = min(y_values)
                x_max = max(x_values)
                y_max = max(y_values)
                
                rectangles.append([x_min, y_min, x_max, y_max])
                draw.rectangle([x_min,y_min,x_max,y_max], outline='red')
                
        if image_name == "test1.png":
            rectangles_truth = [
                [176.0,43.3,318.5,105.5],
                [336.5,50.9,415.4,93.0],
                [236.6,112.8,348.7,151.1]
            ]
            draw.rectangle(rectangles_truth[0], outline='blue')
            draw.rectangle(rectangles_truth[1], outline='blue')
            draw.rectangle(rectangles_truth[2], outline='blue')
            
            print(f"Intersection over union = {intersection_over_union(rectangles, rectangles_truth)}")
        elif image_name == "test2.jpeg":
            rectangles_truth = [
                [76.0,296.94,405.7,475.45],
                [502.2,327.9,638.8,455.4],
                [698.9,306.0,1334.6,435.4],
                [117.89,588.4,722.6,726.8],
                [855.59,570.2,1050.5,717.7],
                [77.8,919.9,753.6,1021.9],
                [879.3,923.5,1015.9,1036.5],
                [99.7,1125.7,724.4,1366.2],
                [762.7,1143.9,1372.9,1371.6],
                [1385.65,1140.3,1456.7,1295.1]
            ]
            draw.rectangle(rectangles_truth[0], outline='blue')
            draw.rectangle(rectangles_truth[1], outline='blue')
            draw.rectangle(rectangles_truth[2], outline='blue')
            draw.rectangle(rectangles_truth[3], outline='blue')
            draw.rectangle(rectangles_truth[4], outline='blue')
            draw.rectangle(rectangles_truth[5], outline='blue')
            draw.rectangle(rectangles_truth[6], outline='blue')
            draw.rectangle(rectangles_truth[7], outline='blue')
            draw.rectangle(rectangles_truth[8], outline='blue')
            draw.rectangle(rectangles_truth[9], outline='blue')
            print(f"Intersection over union = {intersection_over_union(rectangles, rectangles_truth)}")
        
        image.save("output_" + image_name)

'''
    Grayscales an image.
    
    :param path: the path to the image
'''

def grayscale_image(path: str):
    image = Image.open(path)
    
    grayscaled_image = image.convert('L')
    
    paths = path.split("/")
    
    if "test1.png" in paths:
        grayscaled_image.save("images/grayscaled_images/test1.png")
    elif "test2.jpeg" in paths:
        grayscaled_image.save("images/grayscaled_images/test2.jpeg")

'''
    Resizes the image to a specified size.
    
    :param path: the path to the image
'''
def resize_image(path: str, width: int, height: int):
    image = Image.open(path)
    
    resized_image = image.resize((width, height))
    
    paths = path.split("/")
    
    if "test1.png" in paths:
        resized_image.save("images/resized_images/test1.png")
    elif "test2.jpeg" in paths:
        resized_image.save("images/resized_images/test2.jpeg")
    
'''
    Erodes the image using a specified filter.
    
    :param path: the path to the image
'''
def erode_image(path: str):
    image = Image.open(path)
    
    image_array = np.array(image)
    dilated_image = Image.fromarray(image_array).filter(ImageFilter.MaxFilter(5))
    eroded_image = dilated_image.filter(ImageFilter.MinFilter(5))
    
    paths = path.split("/")
    
    if "test1.png" in paths:
        eroded_image.save("images/eroded_images/test1.png")
    elif "test2.jpeg" in paths:
        eroded_image.save("images/eroded_images/test2.jpeg")
    
    
'''
    Binarizes the image using a specified threshold.
    
    :param path: the path to the image

'''
def binarize_image(path: str):
    image = Image.open(path).convert('L')
    
    binarized_image = image.point(lambda x: 0 if x < 128 else 255, '1')
    
    paths = path.split("/")
    
    if "test1.png" in paths:
        binarized_image.save("images/binarized_images/test1.png")
    elif "test2.jpeg" in paths:
        binarized_image.save("images/binarized_images/test2.jpeg")

'''
   Reduces the noise of an image.
   
    :param path: the path to the image 
'''
def noise_reduction_image(path: str):
    image = Image.open(path)
    
    reduced_noise_image = image.filter(ImageFilter.MedianFilter(size=5))
    
    paths = path.split("/")
    
    if "test1.png" in paths:
        reduced_noise_image.save("images/reduced_noise_images/test1.png")
    elif "test2.jpeg" in paths:
        reduced_noise_image.save("images/reduced_noise_images/test2.jpeg")

'''
    Tests the OCR model using the Computer Vision client.
    
    :param path: the path to the image
    :param computer_vision_client: the Computer Vision client
    :param groundTruth: the ground truth for the extracted text

'''
def test(path: str, computer_vision_client: ComputerVisionClient, groundTruth: list):
    result = processImage(path, computer_vision_client)
    
    if len(result) != len(groundTruth):
        print("Modelul OCR nu a extras numarul de propozitii asteptate")
        return
    
    for i in range(len(result)):
        reference = groundTruth[i]
        hypothesis = result[i]
        
        print(f"Ipoteza: {hypothesis} | Referinta: {reference}")
        print(f"CER = {computeCER(reference, hypothesis)}")
        print(f"WER = {computeWER(reference, hypothesis)}")
        print(f"Hamming distance (propriu) = {HammingDistance(reference, hypothesis)}")
        print(f"Hamming Distance (Levenshtein library) = {LevenshteinHamming(reference, hypothesis)}")
        print(f"Jaro-Winkler Similarity (propriu) = {Jaro_WinklerSimilarity(reference, hypothesis)}")
        print(f"Jaro-Winkler similarity (Levenshtein library) = {LevenshteinJaroWinkler(reference, hypothesis)}")
        print(f"Levenshtein distance = {LevenshteinDistance(reference, hypothesis)}")
        print(f"Longest common subsequence = {LCS(reference, hypothesis)}")
        print()

def main():
    computer_vision_client = authenticate()

    path1 = "images/test1.png"
    groundTruth1 = ["Google Cloud", "Platform"]
    
    path2 = "images/test2.jpeg"
    groundTruth2 = ["Succes in rezolvarea", "tEMELOR la", "LABORAtoarele de", "Inteligenta Artificiala!"]
    
    test(path1, computer_vision_client, groundTruth1)
    drawBoundingBoxes(path1, computer_vision_client)
    print()
    
    test(path2, computer_vision_client, groundTruth2)
    drawBoundingBoxes(path2, computer_vision_client)
    print()
    
    # grayscale_image(path2)
    # test("images/grayscaled_images/test2.jpeg", computer_vision_client, groundTruth2)
    # drawBoundingBoxes("images/grayscaled_images/test2.jpeg", computer_vision_client)
    
    # resize_image(path1, 800, 800)
    # test("images/resized_images/test1.png", computer_vision_client, groundTruth2)
    # drawBoundingBoxes("images/resized_images/test1.png", computer_vision_client)
    
    # erode_image(path1)
    # test("images/eroded_images/test1.png", computer_vision_client, groundTruth2)
    # drawBoundingBoxes("images/eroded_images/test1.png", computer_vision_client)
    
    # binarize_image(path1)
    # test("images/binarized_images/test1.png", computer_vision_client, groundTruth2)
    # drawBoundingBoxes("images/binarized_images/test1.png", computer_vision_client)
    
    # noise_reduction_image(path2)
    # test("images/reduced_noise_images/test2.jpeg", computer_vision_client, groundTruth2)
    # drawBoundingBoxes("images/reduced_noise_images/test2.jpeg", computer_vision_client)

main()

Ipoteza: Google Cloud | Referinta: Google Cloud
CER = 0.0
WER = 0.0
Hamming distance (propriu) = 0
Hamming Distance (Levenshtein library) = 0
Jaro-Winkler Similarity (propriu) = 1.0
Jaro-Winkler similarity (Levenshtein library) = 1.0
Levenshtein distance = 0
Longest common subsequence = 1.0

Ipoteza: Platform | Referinta: Platform
CER = 0.0
WER = 0.0
Hamming distance (propriu) = 0
Hamming Distance (Levenshtein library) = 0
Jaro-Winkler Similarity (propriu) = 1.0
Jaro-Winkler similarity (Levenshtein library) = 1.0
Levenshtein distance = 0
Longest common subsequence = 1.0

Intersection over union = 0.8037711900222942

Ipoteza: Lucces in resolvarea | Referinta: Succes in rezolvarea
CER = 0.1
WER = 0.6666666666666666
Hamming distance (propriu) = 2
Hamming Distance (Levenshtein library) = 2
Jaro-Winkler Similarity (propriu) = 0.9148873293841326
Jaro-Winkler similarity (Levenshtein library) = 0.9333333333333332
Levenshtein distance = 2
Longest common subsequence = 0.9

Ipoteza: TEMELOR la | 