In [None]:
'''
Implementation of NomNaOCR patches statistics:
- Author: Nguyen Duc Duy Anh
- GitHub: https://github.com/duyanh1909
'''
import glob
import numpy as np
import pandas as pd
from collections import Counter

# Extract the patches archive into /tmp/ using the shell (-q keeps unzip quiet).
!unzip -q NomNaOCR/Patches.zip -d /tmp/
# Collect every .txt file directly under /tmp/Patches/.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# position of a given file in this list is not guaranteed.
# NOTE(review): the saved output of this cell lists paths under
# /tmp/stistic_data/Patches/, which does not match this pattern — confirm the
# extraction directory before re-running.
PATH_DATA = glob.glob('/tmp/Patches/*.txt')
PATH_DATA

['/tmp/stistic_data/Patches/Validate.txt',
 '/tmp/stistic_data/Patches/Train.txt']

## General statistics

In [None]:
def read_label_patches(path):
    '''
    Read the label column of a tab-separated annotation file.

    Each non-empty line is split on tab characters and its second field is
    returned with the trailing newline removed. Completely empty lines (for
    example a blank line at the end of the file) are skipped instead of
    raising an IndexError.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A list with one label string per non-empty line, in file order.

    Raises:
        IndexError: If a non-empty line contains no tab character.
    '''
    labels = []
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than materialising it with readlines().
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            labels.append(line.split('\t')[1])
    return labels

In [None]:
# Select each label file by name rather than by position: glob.glob() returns
# paths in arbitrary order, so PATH_DATA[0] / PATH_DATA[1] are not guaranteed
# to be the validation / training file respectively.
label_files = {p.rsplit('/', 1)[-1]: p for p in PATH_DATA}
data_train = read_label_patches(label_files['Train.txt'])
data_val = read_label_patches(label_files['Validate.txt'])
# Full dataset = training labels followed by validation labels.
dataset = data_train + data_val
len(dataset) # 38318 (not displayed: only the last expression of a cell is shown)
dataset[:5]

['不得棄本遂末并托以販賣技術游足游手其有',
 '㐌衝身世群算浽芇',
 '象七隻來献營門王大喜九月捕得吳廷峩',
 '𢚸貞払𡏡劳刀\U000f086a蜍',
 '十一月除税使鋪正等職令勑旨\U000f0c65今各税務']

In [None]:
# print_intersection is defined in the project-local IHRNomDB_Rs module (not shown here).
# NOTE(review): judging by the saved output it prints a character-overlap
# percentage for the train and the validation split — confirm the expected
# argument order (validation first, training second) against its definition.
from IHRNomDB_Rs import print_intersection
print_intersection(data_val, data_train)

Characters intersection train 93.2405165456013
Characters intersection val 64.41315862838026


In [None]:
# Character-level vocabulary of the training labels, most frequent first.
# The 'word' column holds a single character; 'frequence' is its occurrence count.
df_train_vocab = pd.DataFrame(
    Counter(''.join(data_train)).most_common(),
    columns=['word', 'frequence'],
)
# Despite the "_once" suffix, this keeps characters seen 1, 2 or 3 times.
df_train_vocab_once = df_train_vocab.loc[
    lambda vocab: vocab['frequence'].isin([1, 2, 3])
]

In [None]:
# Character-level vocabulary of the validation labels, most frequent first.
df_val_vocab = pd.DataFrame(
    Counter(''.join(data_val)).most_common(),
    columns=['word', 'frequence'],
)
# Characters taken from df_train_vocab_once (the low-frequency training characters).
word_infreq_train = df_train_vocab_once['word'].values
# Validation vocabulary rows (with their validation counts) for those characters.
df_val_infrequence_in_val = df_val_vocab.loc[
    df_val_vocab['word'].isin(word_infreq_train)
]
df_val_infrequence_in_val

Unnamed: 0,word,frequence
1824,󰠡,7
2092,呦,6
2140,𨒣,5
2200,脧,5
2233,鰥,5
...,...,...
4950,楞,1
4951,𬁑,1
4953,錚,1
4954,旧,1


# Number of characters in the dataset

In [None]:
# Character-level vocabulary of the whole dataset (train + validation),
# one row per distinct character, most frequent first.
df_vocab = pd.DataFrame(
    Counter(''.join(dataset)).most_common(),
    columns=['word', 'frequence'],
)
# Number of distinct characters.
df_vocab.shape[0]

7509

In [None]:
# Bucket every character by how often it occurs in the whole dataset.
# The conditions are mutually exclusive and cover every count >= 1.
conditions = [
    (df_vocab['frequence'] == 1),
    (df_vocab['frequence'] >= 2) & (df_vocab['frequence'] <= 5),
    (df_vocab['frequence'] >= 6) & (df_vocab['frequence'] <= 10),
    (df_vocab['frequence'] >= 11) & (df_vocab['frequence'] <= 20),
    (df_vocab['frequence'] >= 21) & (df_vocab['frequence'] <= 50),
    (df_vocab['frequence'] >= 51) & (df_vocab['frequence'] <= 100),
    (df_vocab['frequence'] > 100),
]
# An explicit string default is required: np.select's implicit default is the
# integer 0, and recent NumPy releases refuse to mix it with string choices.
df_vocab['range_word'] = np.select(
    conditions,
    ['1', '2-5', '6-10', '11-20', '21-50', '51-100', '100+'],
    default='',
)
# Sum the occurrences per bucket, then add the bucket totals together; the
# result is the total number of characters in the dataset.
# The column is selected explicitly because the first positional argument of
# GroupBy.sum() is `numeric_only`, not a list of columns.
sum(
    df_vocab.groupby('range_word', as_index=False)['frequence']
        .sum()
        .sort_values('frequence', ascending=True)['frequence']
)

459547

# Number of patches by length (characters per patch)

In [None]:
# One row per label string, plus its length in characters
# (len() of a Python str counts code points).
df_sentence = pd.DataFrame(dataset, columns=['sentence'])
df_sentence['number_of_word'] = [len(sentence) for sentence in dataset]
df_sentence.head()

Unnamed: 0,sentence,number_of_word
0,不得棄本遂末并托以販賣技術游足游手其有,19
1,㐌衝身世群算浽芇,8
2,象七隻來献營門王大喜九月捕得吳廷峩,17
3,𢚸貞払𡏡劳刀󰡪蜍,8
4,十一月除税使鋪正等職令勑旨󰱥今各税務,18


In [None]:
# One condition per exact length 1..18, plus a catch-all for lengths >= 19.
conditions = [(df_sentence['number_of_word'] == num) for num in range(1, 19)]
conditions.append((df_sentence['number_of_word'] >= 19))

# Bucket ids 1..19 line up with the conditions above; a row matching none of
# them (length 0) receives np.select's default value 0.
df_sentence['count_sentence'] = np.select(conditions, list(range(1, 20)))
# A method chain split over several lines must be wrapped in parentheses;
# without them the indented `.groupby` line is a syntax error.
(
    df_sentence
    .groupby(['count_sentence'], as_index=False)['number_of_word']
    .count()
    .sort_values(['count_sentence'], ascending=True)
)

Unnamed: 0,count_sentence,number_of_word
0,1,631
1,2,1318
2,3,1371
3,4,1232
4,5,727
5,6,6023
6,7,406
7,8,5824
8,9,396
9,10,432


In [None]:
# Same length histogram as before but with a larger cap: one condition per
# exact length 1..23, plus a catch-all for lengths >= 24.
conditions = [(df_sentence['number_of_word'] == num) for num in range(1, 24)]
conditions.append((df_sentence['number_of_word'] >= 24))

# Bucket ids 1..24 line up with the conditions above; a row matching none of
# them (length 0) receives np.select's default value 0.
# This overwrites the 'count_sentence' column if it already exists.
df_sentence['count_sentence'] = np.select(conditions, list(range(1, 25)))
# A method chain split over several lines must be wrapped in parentheses;
# without them the indented `.groupby` line is a syntax error.
(
    df_sentence
    .groupby(['count_sentence'], as_index=False)['number_of_word']
    .count()
    .sort_values(['count_sentence'], ascending=True)
)

Unnamed: 0,count_sentence,number_of_word
0,1,631
1,2,1318
2,3,1371
3,4,1232
4,5,727
5,6,6023
6,7,406
7,8,5824
8,9,396
9,10,432
