In [None]:
'''
Implementation of Error analysis for the best Text Recognition model:
- Author: Nguyen Duc Duy Anh
- GitHub: https://github.com/duyanh1909
'''
import numpy as np
import pandas as pd

In [None]:
# Load the validation predictions; missing cells become empty strings so that
# blank predictions can be matched with a plain string comparison.
df = pd.read_csv('TransformerOCR/best_validate.csv').fillna('')
df.columns = ['path', 'actual', 'predict']

# Rows for which the model produced no text at all.
is_blank_prediction = df['predict'] == ''
df.loc[is_blank_prediction, ['path', 'predict']]

Unnamed: 0,path,predict
7399,DVSKTT-3 Ban ky toan thu/DVSKTT_ban_toan_X_60a...,
7544,DVSKTT-3 Ban ky toan thu/DVSKTT_ban_toan_II_36...,


In [None]:
def get_error_chars(label, pred_label):
    """Return the characters at which `label` and `pred_label` disagree.

    Every '[UNK]' token in `label` is first collapsed to a single '?' so it
    occupies one position; `pred_label` is left untouched. If the two strings
    are then equal, an empty list is returned.

    Otherwise `label` is scanned left to right while growing a segment that
    starts just after the last recorded error. Whenever the segment can no
    longer be found inside the part of `pred_label` that begins at the last
    error position, one character is recorded:

    * the character of `pred_label` at that index, when the index exists in
      `pred_label` and that character is not simply the next character of
      `label`;
    * the character of `label` at that index otherwise (reported as '[UNK]'
      when it is '?', so a literal '?' in `label` is also reported that way).

    Args:
        label: reference string.
        pred_label: string compared against the reference.

    Returns:
        list of single characters (or '[UNK]'), in scan order.
    """
    label = label.replace('[UNK]', '?')
    if label == pred_label:
        return []

    errors = []
    n_label, n_pred = len(label), len(pred_label)
    segment_start = 0   # first index of the label segment currently being matched
    search_from = 0     # index in pred_label from which the segment is searched

    # The segment start never overtakes the scan position, so a plain forward
    # pass over the label indices is sufficient.
    for pos in range(n_label):
        if label[segment_start:pos + 1] in pred_label[search_from:]:
            continue

        inside_pred = pos < n_pred
        # pred_label already shows label's *next* character at this index.
        next_char_shifted_in = (
            inside_pred
            and pos + 1 < n_label
            and pred_label[pos] == label[pos + 1]
        )

        if inside_pred and not next_char_shifted_in:
            errors.append(pred_label[pos])
        else:
            errors.append('[UNK]' if label[pos] == '?' else label[pos])

        # Restart matching right after the position that just failed.
        search_from = pos
        segment_start = pos + 1

    return errors

In [None]:
def _diff_columns(frame, reference_col, compared_col):
    """Run get_error_chars row by row with `reference_col` as the label side."""
    return frame.apply(
        lambda record: get_error_chars(record[reference_col], record[compared_col]),
        axis=1,
    )


# error_char: differences found with the ground truth as reference.
# error_actual: the same comparison with the two roles swapped.
df['error_char'] = _diff_columns(df, 'actual', 'predict')
df['error_actual'] = _diff_columns(df, 'predict', 'actual')
# Number of entries in error_char for each line.
df['count_error_char'] = df['error_char'].map(len)
df

Unnamed: 0,path,actual,predict,error_char,error_actual,count_error_char
0,DVSKTT-4 Ban ky thuc luc/DVSKTT_ban_thuc_XII_7...,使通好執事迷而不反我是以有徃年之師帝遣,使通好執事迷而不反義是以有徃年之師帝遣,[義],[我],1
1,Tale of Kieu 1866/page065b_5.jpg,池有大蛇入見二十日上以砲旗皷制之賜月,池有大蛇入見二十日上以砲旗皷制之四月,[四],[賜],1
2,DVSKTT-5 Ban ky tuc bien/DVSKTT_ban_tuc_XVIII_...,非傾人宗祧若是其幾無所不至以致大明之疑,非傾人宗桃若是其幾無所不至以致大明之疑,[桃],[祧],1
3,Tale of Kieu 1872/page75a_9.jpg,十一月戊辰朔日有食以裴時亨密奏故不之,十一月戊辰朔日有食以裴時亨辱奏故不之,[辱],[密],1
4,DVSKTT-4 Ban ky thuc luc/DVSKTT_ban_thuc_XIII_...,使人追之不及十三日卯時克終自元軍回賊,使人追之不及十三日卯時克終自元軍回賊,[],[],0
...,...,...,...,...,...,...
7547,Tale of Kieu 1866/page014b_10.jpg,折毀,折毀,[],[],0
7548,DVSKTT-3 Ban ky toan thu/DVSKTT_ban_toan_II_27...,劉鋹,對鄕,"[對, 鄕]","[劉, 鋹]",2
7549,Luc Van Tien/nlvnpf-0059-053_19.jpg,𨤮𣌉岌淨𩂟𣾺,𨤮𣌉馭淨常𣾺,"[馭, 常]","[岌, 𩂟]",2
7550,DVSKTT-3 Ban ky toan thu/DVSKTT_ban_toan_X_66a...,責,賁,[賁],[責],1


In [None]:
# Training-set vocabulary with a per-character frequency column.
df_train_vocab_once = pd.read_csv('vocab_train_1_2_3.csv')

# Characters whose recorded training frequency is exactly 1.
# NOTE(review): the recorded summary further down lists frequencies 1, 2 and 3,
# which this `== 1` filter cannot produce - confirm whether 1-3 was intended.
appears_once = df_train_vocab_once['frequence'] == 1
once = df_train_vocab_once.loc[appears_once, 'word'].values

In [None]:
# Collect the rare training-set characters that were still predicted correctly:
# a character qualifies when it occurs in the ground-truth line, belongs to
# `once`, and is not listed among that line's error characters.
#
# Fixes over the previous version of this cell:
# * the ground-truth text is now unpacked as `actual` (it was read but never
#   defined, so the cell failed on a fresh kernel);
# * results go into `list_infrequence`, the name this cell displays and the
#   following cells consume;
# * boolean `and` replaces bitwise `&`, and a set replaces the per-character
#   scan of the `once` array.
rare_chars = set(once)
list_infrequence = []
for actual, error_char, error_actual in zip(df['actual'], df['error_char'], df['error_actual']):
    for char in actual:
        if char in rare_chars and char not in error_char and char not in error_actual:
            list_infrequence.append(char)

# Previous name of the list, kept as an alias in case anything still refers to it.
list_frequence_1_2_3 = list_infrequence
list_infrequence[1]

'窟'

In [None]:
# One row per correctly predicted rare character, together with its
# training-set frequency.
#
# The frequency is looked up through a word -> frequency dict instead of
# filtering the vocabulary frame once per character: this removes the repeated
# full scans and avoids calling int() on a NumPy array, which NumPy deprecates
# for arrays with ndim > 0.
frequence_by_word = dict(
    zip(df_train_vocab_once['word'], df_train_vocab_once['frequence'])
)
df_infrequence = pd.DataFrame(data=list_infrequence, columns=['word'])
df_infrequence['frequence'] = [int(frequence_by_word[word]) for word in list_infrequence]

In [None]:
# Number of characters that are still predicted correctly even though they
# rarely appear in the train set, grouped by training frequency (1, 2 or 3).
conditions = [df_infrequence['frequence'] == freq for freq in (1, 2, 3)]
values = ['1', '2', '3']

# np.select's implicit default is the integer 0, which recent NumPy releases
# cannot promote to a common dtype with string choices (TypeError). An explicit
# string default keeps the cell runnable; '0' labels any frequency outside 1-3.
# Note that the 'word' column is overwritten with the frequency label here, so
# the 'frequence' column of the grouped result holds the per-label row count.
df_infrequence['word'] = np.select(conditions, values, default='0')
df_infrequence.groupby(['word'], as_index=False).count().sort_values(['word'], ascending=True)[['word', 'frequence']]

Unnamed: 0,word,frequence
0,1,32
1,2,70
2,3,85


In [None]:
# Distribution of the number of wrongly predicted characters per line of the
# validation set; lines with five or more errors share the '>=5' bucket.
conditions = [df['count_error_char'] == count for count in range(5)]
conditions.append(df['count_error_char'] >= 5)

values = ['0', '1', '2', '3', '4', '>=5']

# np.select's implicit default is the integer 0, which recent NumPy releases
# cannot promote to a common dtype with string choices (TypeError). The
# conditions above already cover every non-negative count, so the explicit
# string default only serves to keep the dtypes compatible.
df['error'] = np.select(conditions, values, default='0')
# After count(), 'count_error_char' holds the number of lines in each bucket.
df.groupby(['error'], as_index=False).count().sort_values(['error'], ascending=True)[['error', 'count_error_char']]

Unnamed: 0,error,count_error_char
0,0,2069
1,1,2073
2,2,1432
3,3,876
4,4,514
5,>=5,588
