In [70]:
## Load all the supporting modules
import matplotlib
import datetime
import json
import re
import pandas as pd
import numpy as np
from typing import List, Union
from jiwer import compute_measures, transforms

In [71]:
!pip install jiwer



In [72]:
!pip show jiwer

Name: jiwer
Version: 3.0.3
Summary: Evaluate your speech-to-text system with similarity measures such as word error rate (WER)
Home-page: https://github.com/jitsi/jiwer
Author: Nik Vaessen
Author-email: nikvaes@gmail.com
License: Apache-2.0
Location: c:\users\deviy\anaconda3\envs\ocr_39\lib\site-packages
Requires: click, rapidfuzz
Required-by: 


In [73]:
from typing import List, Union

## Character-level transform pipeline, passed to compute_measures for both the
## reference and the hypothesis when scoring CER: collapse repeated spaces, strip
## the ends, reduce the input to a single sentence (joined with ""), then split it
## into a list of characters so that edit operations are counted per character.
cer_transform = transforms.Compose([transforms.RemoveMultipleSpaces(), transforms.Strip(), transforms.ReduceToSingleSentence(""), transforms.ReduceToListOfListOfChars()])

def cer_compute(predictions: Union[str, List[str]], ground_truths: Union[str, List[str]]) -> float:
    """Compute the character error rate (CER) of predictions against references.

    Edit counts are accumulated over all (prediction, ground truth) pairs and
    divided once at the end, so the result is a corpus-level rate rather than
    a mean of per-pair rates. Pairs are formed with ``zip``, so surplus items
    in the longer list are ignored.

    Args:
        predictions (Union[str, List[str]]): a predicted string, or a list of them.
        ground_truths (Union[str, List[str]]): the reference string(s), in the
            same order as ``predictions``.

    Returns:
        float: (substitutions + deletions + insertions) /
            (substitutions + deletions + hits).

    Raises:
        ZeroDivisionError: if no reference characters were counted
            (for example when both inputs are empty lists).
    """
    # Wrap bare strings so that zip pairs whole strings, not single characters.
    if isinstance(predictions, str):
        predictions = [predictions]

    if isinstance(ground_truths, str):
        ground_truths = [ground_truths]

    incorrect = 0
    total = 0

    for prediction, truth in zip(predictions, ground_truths):
        measures = compute_measures(truth, prediction, truth_transform=cer_transform, hypothesis_transform=cer_transform)
        incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
        # Reference length = every reference character that was hit, substituted or deleted.
        total += measures["substitutions"] + measures["deletions"] + measures["hits"]

    return incorrect / total


In [74]:
def wer_compute(predictions: Union[str, List[str]], ground_truths: Union[str, List[str]]) -> float:
    """Compute the word error rate (WER) of predictions against references.

    Edit counts are accumulated over all (prediction, ground truth) pairs and
    divided once at the end, so the result is a corpus-level rate rather than
    a mean of per-pair rates. Pairs are formed with ``zip``, so surplus items
    in the longer list are ignored.

    Args:
        predictions (Union[str, List[str]]): a predicted string, or a list of them.
        ground_truths (Union[str, List[str]]): the reference string(s), in the
            same order as ``predictions``.

    Returns:
        float: (substitutions + deletions + insertions) /
            (substitutions + deletions + hits).

    Raises:
        ZeroDivisionError: if no reference words were counted
            (for example when both inputs are empty lists).
    """
    # Wrap bare strings so that zip pairs whole strings, not single characters.
    if isinstance(predictions, str):
        predictions = [predictions]

    if isinstance(ground_truths, str):
        ground_truths = [ground_truths]

    incorrect = 0
    total = 0

    for prediction, truth in zip(predictions, ground_truths):
        measures = compute_measures(truth, prediction)
        incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
        # Reference length = every reference word that was hit, substituted or deleted.
        total += measures["substitutions"] + measures["deletions"] + measures["hits"]

    return incorrect / total


In [75]:
def compute_matrics(df: pd.DataFrame, column_name):
    """Score one predicted field against its ground truth, row by row.

    For every row the prediction in ``df[column_name]`` is compared with the
    reference in ``df[f'{column_name}_act']`` (both lower-cased) using
    ``cer_compute`` and ``wer_compute``. The scores are written to the columns
    'cer', 'wer' and 'match_acc' of ``df``; existing columns with those names
    are overwritten.

    Args:
        df (pd.DataFrame): frame holding the prediction and ``_act`` columns.
            It is modified in place.
        column_name (str): name of the prediction column to score.

    Returns:
        pd.DataFrame: the same ``df`` object, with the three score columns set.
    """
    cer_scores = []
    wer_scores = []
    match_flags = []

    # Read the two columns directly and collect results by position. Writing
    # back through a `df['filename'] == filename` mask would make rows that
    # share a filename overwrite each other, and costs a full scan per row.
    for ref, output in zip(df[f'{column_name}_act'], df[column_name]):
        if pd.isna(ref):
            # NOTE(review): a missing reference ends up as the literal text
            # "none" below (str(None).lower()); kept as-is, confirm intent.
            ref = None
        if pd.isna(output):
            # A missing prediction is scored as an empty string.
            output = ""

        ref = str(ref).lower()
        output = str(output).lower()
        cer = cer_compute([output], [ref])
        wer = wer_compute([output], [ref])

        cer_scores.append(round(cer, 2))
        wer_scores.append(round(wer, 2))
        # Test the unrounded WER so a tiny non-zero error is not counted as a match.
        match_flags.append(1.0 if wer == 0 else 0.0)

    df['cer'] = cer_scores
    df['wer'] = wer_scores
    df['match_acc'] = match_flags
    return df




In [76]:
def get_metrics(df: pd.DataFrame):
    """Summarise the per-row scores of ``df`` as percentages.

    Takes the mean of the 'cer', 'wer' and 'match_acc' columns, multiplies each
    by 100 and formats it with two decimals. The summary line is printed and
    the three formatted strings are returned.

    Args:
        df (pd.DataFrame): frame containing 'cer', 'wer' and 'match_acc' columns.

    Returns:
        tuple: (mean CER, mean WER, mean accuracy), each a string such as "12.50".
    """
    mean_cer, mean_wer, acc = (
        f"{df[score_column].mean() * 100:.2f}"
        for score_column in ('cer', 'wer', 'match_acc')
    )

    print(f'Mean CER = {mean_cer}%, Mean WER = {mean_wer}%, Mean Acc = {acc}%')

    return mean_cer, mean_wer, acc

In [77]:
def eval_ocr(df_case, column_name):
    """Score a single field and report its mean metrics.

    Runs ``compute_matrics`` on ``df_case`` for ``column_name``, then passes the
    scored frame to ``get_metrics``.

    Args:
        df_case: frame handed to ``compute_matrics``.
        column_name: name of the field to score.

    Returns:
        tuple: (scored frame, mean CER, mean WER, mean accuracy), where the
        three means are whatever ``get_metrics`` returns.
    """
    scored = compute_matrics(df_case, column_name=column_name)
    cer_pct, wer_pct, acc_pct = get_metrics(scored)

    return scored, cer_pct, wer_pct, acc_pct

In [78]:
def eval_all(df_case, columns=None):
    """Evaluate every field of a merged actual/prediction frame.

    Args:
        df_case (pd.DataFrame): frame passed to ``eval_ocr`` once per field.
            ``eval_ocr`` scores it in place, so after this call ``df_case``
            carries the score columns of the last evaluated field.
        columns (list, optional): field names to evaluate. Defaults to the
            notebook-level ``cols`` list.

    Returns:
        dict: one entry per field holding a copy of the scored frame for that
        field, plus an "acc_fields" entry holding a summary DataFrame with the
        columns 'field', 'mean_cer_ai', 'mean_wer_ai' and 'mean_acc_ai'.
    """
    if columns is None:
        columns = cols

    result = {}
    metrics_df = {"field": [], "mean_cer_ai": [], "mean_wer_ai": [], "mean_acc_ai": [], }
    for col in columns:
        df, mean_cer, mean_wer, mean_acc = eval_ocr(df_case, col)
        # eval_ocr hands back the frame it scored in place. Store a snapshot,
        # otherwise every entry would alias one frame and only show the scores
        # of the field evaluated last.
        result[col] = df.copy()
        metrics_df['field'].append(col)
        metrics_df['mean_cer_ai'].append(mean_cer)
        metrics_df['mean_wer_ai'].append(mean_wer)
        metrics_df['mean_acc_ai'].append(mean_acc)
        print ("======"*10)

    df_sum = pd.DataFrame.from_dict(metrics_df)
    result["acc_fields"] = df_sum

    return result

In [79]:
def combined_act_pred(act, pred, actual_columns=None, pred_columns=None):
    """Join ground truth and predictions on 'filename' and normalise the text columns.

    The merge uses pandas' default join type, so only filenames present in both
    frames are kept. Ground-truth columns are converted to upper-case strings
    (a missing value therefore becomes the text "NAN"); missing predictions are
    replaced by empty strings.

    Args:
        act (pd.DataFrame): ground-truth frame with a 'filename' column.
        pred (pd.DataFrame): prediction frame with a 'filename' column.
        actual_columns (list, optional): ground-truth columns to upper-case.
            Defaults to the notebook-level ``act_cols`` list.
        pred_columns (list, optional): prediction columns whose missing values
            are filled with ''. Defaults to the notebook-level ``cols`` list.

    Returns:
        pd.DataFrame: the merged, normalised frame.
    """
    if actual_columns is None:
        actual_columns = act_cols
    if pred_columns is None:
        pred_columns = cols

    df_combined = pd.merge(act, pred, on='filename')
    # Per-column Series.map does the same element-wise conversion as the
    # deprecated DataFrame.applymap, without the FutureWarning.
    df_combined[actual_columns] = df_combined[actual_columns].apply(
        lambda column: column.map(lambda x: str(x).upper())
    )
    df_combined[pred_columns] = df_combined[pred_columns].fillna('')
    return df_combined

In [80]:
# First definition of the evaluated field names. A later cell redefines `cols`
# with 'phone_number' in place of 'phone' before any evaluation is run.
cols=['name','member_id','phone','address','district','district_code']

In [81]:
df_actual=pd.read_csv('test_actual.csv',quotechar='"')
df_actual.head()

Unnamed: 0,filename,name,member_id,phone,address,district,district_code
0,1.png,Lily Johnson,123-789-4560,+123-999-7890,"123 Imaginary Avenue, Dreamland, DL 00000",Dreamland,DL
1,10.png,Michael Taylor,123-876-5430,+123-333-4444,"808 Willow Way, Riverside, RS 77777",Riverside,RS
2,12.png,Benjamin William,123-234-5681,+123-111-3322,"101 Cedar Lane, Hilltop, HT 44444",Hilltop,HT
3,14.png,Jacob Miller,123-024-8190,+123-001-1224,"321 Oak Street, Riverdale, RD 11111",Riverdale,RD
4,6.png,Mia Wilson,123-876-5430,+123-333-4444,"808 Rainbow Lane, Unicorn Valley, UV, 66666",Unicorn Valley,UV


In [82]:
df_actual.columns

Index(['filename', 'name', 'member_id', 'phone', 'address', 'district',
       'district_code'],
      dtype='object')

In [83]:
# Rename the ground-truth columns positionally, giving each field an `_act`
# suffix (and mapping 'phone' to 'phone_number_act') so they stay distinct from
# the prediction columns once both frames are merged on 'filename'.
df_actual.columns=['filename','name_act','member_id_act','phone_number_act','address_act','district_act','district_code_act']

In [84]:
cols=['name','member_id','phone_number','address','district','district_code']

In [85]:
act_cols=[i+'_act' for i in cols]

In [86]:
# Score the model v1 predictions: merge them with the ground truth, evaluate
# every field in `cols`, and show the per-field summary table.
df_v1=pd.read_csv('test_predict_model_v1.csv',quotechar='"') # after fix postpro
df_combined_v1=combined_act_pred(df_actual,df_v1)
result_v1= eval_all(df_combined_v1)
result_v1['acc_fields']

  df_combined[act_cols]=df_combined[act_cols].applymap(lambda x: str(x).upper())


Mean CER = 73.67%, Mean WER = 100.00%, Mean Acc = 0.00%
Mean CER = 12.33%, Mean WER = 83.33%, Mean Acc = 16.67%
Mean CER = 5.00%, Mean WER = 33.33%, Mean Acc = 66.67%
Mean CER = 73.33%, Mean WER = 97.67%, Mean Acc = 0.00%
Mean CER = 100.00%, Mean WER = 100.00%, Mean Acc = 0.00%
Mean CER = 100.00%, Mean WER = 100.00%, Mean Acc = 0.00%


Unnamed: 0,field,mean_cer_ai,mean_wer_ai,mean_acc_ai
0,name,73.67,100.0,0.0
1,member_id,12.33,83.33,16.67
2,phone_number,5.0,33.33,66.67
3,address,73.33,97.67,0.0
4,district,100.0,100.0,0.0
5,district_code,100.0,100.0,0.0


In [87]:
# Score the model v2 predictions against the same ground truth: merge, evaluate
# every field in `cols`, and show the per-field summary table.
df_v2=pd.read_csv('test_predict_model_v2.csv',quotechar='"') # after fix postpro
df_combined_v2=combined_act_pred(df_actual,df_v2)
result_v2= eval_all(df_combined_v2)
result_v2['acc_fields']

  df_combined[act_cols]=df_combined[act_cols].applymap(lambda x: str(x).upper())


Mean CER = 0.00%, Mean WER = 0.00%, Mean Acc = 100.00%
Mean CER = 0.00%, Mean WER = 0.00%, Mean Acc = 100.00%
Mean CER = 0.00%, Mean WER = 0.00%, Mean Acc = 100.00%
Mean CER = 4.00%, Mean WER = 13.50%, Mean Acc = 50.00%
Mean CER = 0.00%, Mean WER = 0.00%, Mean Acc = 100.00%
Mean CER = 0.00%, Mean WER = 0.00%, Mean Acc = 100.00%


Unnamed: 0,field,mean_cer_ai,mean_wer_ai,mean_acc_ai
0,name,0.0,0.0,100.0
1,member_id,0.0,0.0,100.0
2,phone_number,0.0,0.0,100.0
3,address,4.0,13.5,50.0
4,district,0.0,0.0,100.0
5,district_code,0.0,0.0,100.0


In [88]:
df_combined_v1

Unnamed: 0,filename,name_act,member_id_act,phone_number_act,address_act,district_act,district_code_act,name,member_id,phone_number,address,district,district_code,cer,wer,match_acc
0,1.png,LILY JOHNSON,123-789-4560,+123-999-7890,"123 IMAGINARY AVENUE, DREAMLAND, DL 00000",DREAMLAND,DL,HE Ms,123-789-4560,+123-999-7890,AE A e Mh000,,,1.0,1.0,0.0
1,10.png,MICHAEL TAYLOR,123-876-5430,+123-333-4444,"808 WILLOW WAY, RIVERSIDE, RS 77777",RIVERSIDE,RS,"Heoae, Taoc",123-76543-0,+123-333444,9 Mo MADF Me M AE,,,1.0,1.0,0.0
2,12.png,BENJAMIN WILLIAM,123-234-5681,+123-111-3322,"101 CEDAR LANE, HILLTOP, HT 44444",HILLTOP,HT,Beenjain Wllawn,123-234-5661,+123-111-3322,AW MAE,,,1.0,1.0,0.0
3,14.png,JACOB MILLER,123-024-8190,+123-001-1224,"321 OAK STREET, RIVERDALE, RD 11111",RIVERDALE,RD,,123-024-190,+123-001-1224,3210a 5e Bedae 011111,,,1.0,1.0,0.0
4,6.png,MIA WILSON,123-876-5430,+123-333-4444,"808 RAINBOW LANE, UNICORN VALLEY, UV, 66666",UNICORN VALLEY,UV,,123-765490,+123-333444,A A W M 66666,,,1.0,1.0,0.0
5,7.png,MATTHEW GARCIA,123-345-6782,+123-999-0010,"999 OAKWOOD DRIVE, HILLCREST, HC, 00000",HILLCREST,HC,MEDae Lasla,123-345-6752,+123-999-0010,SAA Me Me D00000,,,1.0,1.0,0.0


In [89]:
# Load the ground truth for the rotated test set. This rebinds `df_actual`,
# replacing the frame that was loaded from 'test_actual.csv'.
df_actual=pd.read_csv('test_actual_rotated.csv',quotechar='"')
df_actual.head()

Unnamed: 0,filename,name,member_id,phone,address,district,district_code
0,1_90_rotated.png,Lily Johnson,123-789-4560,+123-999-7890,"123 Imaginary Avenue, Dreamland, DL 00000",Dreamland,DL
1,10_90_rotated.png,Michael Taylor,123-876-5430,+123-333-4444,"808 Willow Way, Riverside, RS 77777",Riverside,RS
2,12_270_rotated.png,Benjamin William,123-234-5681,+123-111-3322,"101 Cedar Lane, Hilltop, HT 44444",Hilltop,HT
3,14_180_rotated.png,Jacob Miller,123-024-8190,+123-001-1224,"321 Oak Street, Riverdale, RD 11111",Riverdale,RD
4,6_180_rotated.png,Mia Wilson,123-876-5430,+123-333-4444,"808 Rainbow Lane, Unicorn Valley, UV, 66666",Unicorn Valley,UV


In [90]:
cols=['name','member_id','phone_number','address','district','district_code']
act_cols=[i+'_act' for i in cols]

In [91]:
df_actual.columns=['filename']+act_cols

In [92]:
# Score the v2 predictions on the rotated test set: merge them with the rotated
# ground truth, evaluate every field in `cols`, and show the per-field summary table.
df_rotated=pd.read_csv('test_predict_model_rotated_v2.csv',quotechar='"') # after fix postpro
df_combined_rotated=combined_act_pred(df_actual,df_rotated)
result_rotated= eval_all(df_combined_rotated)
result_rotated['acc_fields']

  df_combined[act_cols]=df_combined[act_cols].applymap(lambda x: str(x).upper())


Mean CER = 33.33%, Mean WER = 33.33%, Mean Acc = 66.67%
Mean CER = 33.33%, Mean WER = 33.33%, Mean Acc = 66.67%
Mean CER = 33.33%, Mean WER = 33.33%, Mean Acc = 66.67%
Mean CER = 36.17%, Mean WER = 44.00%, Mean Acc = 16.67%
Mean CER = 33.33%, Mean WER = 33.33%, Mean Acc = 66.67%
Mean CER = 33.33%, Mean WER = 33.33%, Mean Acc = 66.67%


Unnamed: 0,field,mean_cer_ai,mean_wer_ai,mean_acc_ai
0,name,33.33,33.33,66.67
1,member_id,33.33,33.33,66.67
2,phone_number,33.33,33.33,66.67
3,address,36.17,44.0,16.67
4,district,33.33,33.33,66.67
5,district_code,33.33,33.33,66.67


In [95]:
df_combined_rotated.loc[df_combined_rotated['phone_number_act']!=df_combined_rotated['phone_number']]

Unnamed: 0,filename,name_act,member_id_act,phone_number_act,address_act,district_act,district_code_act,name,member_id,phone_number,address,district,district_code,cer,wer,match_acc
2,12_270_rotated.png,BENJAMIN WILLIAM,123-234-5681,+123-111-3322,"101 CEDAR LANE, HILLTOP, HT 44444",HILLTOP,HT,,,,,,,1.0,1.0,0.0
3,14_180_rotated.png,JACOB MILLER,123-024-8190,+123-001-1224,"321 OAK STREET, RIVERDALE, RD 11111",RIVERDALE,RD,,,,,,,1.0,1.0,0.0


In [94]:
df_combined_rotated

Unnamed: 0,filename,name_act,member_id_act,phone_number_act,address_act,district_act,district_code_act,name,member_id,phone_number,address,district,district_code,cer,wer,match_acc
0,1_90_rotated.png,LILY JOHNSON,123-789-4560,+123-999-7890,"123 IMAGINARY AVENUE, DREAMLAND, DL 00000",DREAMLAND,DL,Lily Johnson,123-789-4560,+123-999-7890,"123 Imaginary Avenue, Dreamland, DL OO000",DreamLand,DL,0.0,0.0,1.0
1,10_90_rotated.png,MICHAEL TAYLOR,123-876-5430,+123-333-4444,"808 WILLOW WAY, RIVERSIDE, RS 77777",RIVERSIDE,RS,Michael Taylor,123-876-5430,+123-333-4444,"808 Willow Way, Riverside, RS 77777",Riverside,RS,0.0,0.0,1.0
2,12_270_rotated.png,BENJAMIN WILLIAM,123-234-5681,+123-111-3322,"101 CEDAR LANE, HILLTOP, HT 44444",HILLTOP,HT,,,,,,,1.0,1.0,0.0
3,14_180_rotated.png,JACOB MILLER,123-024-8190,+123-001-1224,"321 OAK STREET, RIVERDALE, RD 11111",RIVERDALE,RD,,,,,,,1.0,1.0,0.0
4,6_180_rotated.png,MIA WILSON,123-876-5430,+123-333-4444,"808 RAINBOW LANE, UNICORN VALLEY, UV, 66666",UNICORN VALLEY,UV,Mia Wilson,123-876-5430,+123-333-4444,"808 Rainbow Lane, Unicorn Valley, UV 66666",Unicorn Valley,UV,0.0,0.0,1.0
5,7_90_rotated.png,MATTHEW GARCIA,123-345-6782,+123-999-0010,"999 OAKWOOD DRIVE, HILLCREST, HC, 00000",HILLCREST,HC,Matthew Garcia,123-345-6782,+123-999-0010,"999 Oakwood Drive, Hillcrest, HC Ooo00",Hillcrest,HC,0.0,0.0,1.0
