In [None]:
import numpy as np
import pandas as pd

#--------------------------------------
#
#    Read manually preprocessed data
#
#--------------------------------------

def saveDataFrame(path, df):
    """Write ``df`` to ``path`` as a ';'-separated CSV (utf-8-sig encoding, no index column).

    The target path is echoed to stdout before writing.
    """
    print("Saving CSV:", path)
    csv_options = {'encoding': 'utf-8-sig', 'index': False, 'sep': ";"}
    df.to_csv(path, **csv_options)

def readDataFrame(path):
    """Read a ';'-separated, utf-8-sig encoded CSV from ``path`` and return it as a DataFrame.

    The source path is echoed to stdout before reading.
    """
    print("Reading CSV:", path)
    csv_options = {'encoding': 'utf-8-sig', 'sep': ";"}
    frame = pd.read_csv(path, **csv_options)
    return frame

# Load the manually preprocessed table; every later cell reads its columns
# (the OCR text plus the labelled field values) from `raw_data`.
raw_data = readDataFrame(r'../input/ekr-docs/ekr_135_full_manually.csv')


# Sanity check: summary statistics of the table, then the first 500 characters
# of the OCR text stored under index label 0.
print(raw_data.describe())
print('---------------------------------------')
print(raw_data['ocr_text'][0][:500])

# Train

In [None]:
#--------------------------------------
#
#    Convert sequences to NER format: [(tag, token), (tag, token), ...]
#
#--------------------------------------

# Tag given to every piece of text that does not belong to a labelled field.
O_tag = 'O'

class SequenceTagger():
    """Convert a text and its known field values into an ordered list of tagged segments."""

    def clean(self, text):
        """Lowercase ``text`` (converted with ``str``) and collapse whitespace runs to one space."""
        return ' '.join(str(text).lower().split())

    def tag_sequence(self, text, tag_values, unk_value='-'):
        """Split ``text`` into ``(tag, segment)`` pairs.

        'text ...', {'tag': 'value', ...}   =>   [('O', 'some'), ('xtag', 'text'), ...]

        - Text and values are normalized with ``clean`` before matching.
        - A value must occur in the text exactly (first occurrence is used).
          TODO: use Levenshtein distance instead of exact match.
        - Everything between the matched values is emitted with the ``O_tag``.
        - The segments, concatenated in order, reproduce the cleaned text.

        Parameters:
            text: raw document text.
            tag_values: mapping of tag name -> expected value.
            unk_value: placeholder marking "no value"; such tags are skipped.

        Returns:
            list of ``(tag, segment)`` tuples with non-empty segments. A text in
            which no tag is found is returned as a single ``O_tag`` segment.
        """
        text = self.clean(text)

        # character spans (tag, start, end) of every tag value found in the text
        tag_indexes = []
        for tag, value in tag_values.items():
            # Missing cells arrive as None/NaN; str(NaN) == 'nan' would otherwise be
            # searched for and could match inside ordinary words.
            is_missing = value is None or (isinstance(value, float) and value != value)
            if is_missing or value == unk_value:
                continue
            value = self.clean(value)
            if not value:
                continue
            start = text.find(value)
            if start != -1:
                tag_indexes.append((tag, start, start+len(value)))
            else:
                print('WARNING: no tag "'+tag+'" found with value "'+value+'"')

        # sort tags by appearance in text
        tag_indexes.sort(key=lambda val: val[1])

        tag_value_results = []
        # helper: add tag to results if its value is not empty
        def append(tag, value):
            if value:
                tag_value_results.append((tag, value))

        # walk the text left to right; `cursor` is the first character not yet emitted
        cursor = 0
        for tag, start, end in tag_indexes:
            if start < cursor:
                # the span overlaps an already emitted tag; emitting it too would
                # duplicate text in the output sequence
                print('WARNING: tag "'+tag+'" overlaps a previous tag, skipped')
                continue
            append(O_tag, text[cursor : start])
            append(tag, text[start : end])
            cursor = end

        # trailing O text (or the whole text when no tag was found)
        append(O_tag, text[cursor:])

        return tag_value_results


    def tag_df(self, df, text_col, cols2tags):
        """Tag every row of ``df``.

        ``text_col`` names the text column and ``cols2tags`` maps value-column
        names to tag names. Returns the ``.values`` of the row-wise ``apply``,
        one tagged sequence per row.
        """
        def apply_seq_tagging(df_row):
            tag_values = { tag: df_row[col] for col, tag in cols2tags.items() }
            return self.tag_sequence(df_row[text_col], tag_values)

        return df.apply(apply_seq_tagging, axis=1).values

    def tag(self, docs):
        """Tag a list of ``{'text': ..., 'tags': {tag: value}}`` dicts; returns one sequence per doc."""
        def apply_seq_tagging(doc):
            tag_values = doc['tags']
            return self.tag_sequence(doc['text'], tag_values)

        return [apply_seq_tagging(doc) for doc in docs]
    

# Maps the (Hungarian) value columns of the input table to the short tag names
# used as NER labels. Keys must match the CSV headers exactly.
cols2tags = {
    'Ajánlatkérő adószáma': 'A_ADO',             # contracting authority: tax number
    'Vezető ajánlattevő adószáma': 'V_ADO',      # lead tenderer: tax number
    'Ajánlatkérő cím': 'A_CIM',                  # contracting authority: address
    'Vezető ajánlattevő cím': 'V_CIM',           # lead tenderer: address
    'Ajánlatkérő megnevezése': 'A_NEV',          # contracting authority: name
    'Vezető ajánlattevő megnevezése': 'V_NEV',   # lead tenderer: name
    'Szerződött bruttó ellenérték (összeg/keretösszeg)': 'BR_AR',  # contracted gross value
    'Szerződött nettó ellenérték (összeg/keretösszeg)': 'N_AR'     # contracted net value
}
    
# One tagged sequence [(tag, text), ...] per row of raw_data.
sequence_tagger = SequenceTagger()
tagged_docs = sequence_tagger.tag_df(raw_data, 'ocr_text', cols2tags)


print(tagged_docs[0][0:5]) # first doc, first 5 (tag, text) segments

In [None]:
import re

class PreservingTokenizer():
    """Split tagged text segments into BIO-tagged tokens and join such tokens back."""

    def tokenize_doc(self, doc):
        """Tokenize the text values of a doc while preserving position information.

        [(tag, value), ...] => [(token, token_index, tag), ...]

        - Values are split on whitespace into words; every non-word character
          (regex ``[^\\w]``) is additionally split off as a token of its own.
        - ``token_index`` is the index of the original whitespace-separated
          word, so all pieces of one word share the same index.
        - ``O_tag`` segments keep their tag; for any other tag the first token
          gets ``'B-'+tag`` and the remaining ones ``'I-'+tag``.
        - Segments without any token (empty / whitespace only) are skipped.
        """
        tokenized_doc = []
        word_counter = 0
        for tag, value in doc:
            words = value.split()
            # isolate every non-word character so it becomes a standalone token,
            # remembering which word each piece came from
            subwords = [(subword, word_counter+wi) for wi, word in enumerate(words)
                        for subword in re.sub(r'([^\w])', r' \1 ', word).split()]

            if not subwords:
                # nothing to emit; also avoids indexing subwords[0] below
                continue

            if tag == O_tag:
                for subword, wi in subwords:
                    tokenized_doc.append((subword, wi, tag))
            else:
                subword, wi = subwords[0]
                tokenized_doc.append((subword, wi, 'B-'+tag))
                for subword, wi in subwords[1:]:
                    tokenized_doc.append((subword, wi, 'I-'+tag))

            word_counter += len(words)

        return tokenized_doc


    def tokenize_docs(self, docs):
        """Apply ``tokenize_doc`` to every doc."""
        return [self.tokenize_doc(doc) for doc in docs]

    def detokenize_doc(self, doc, max_error_distance=2):  # todo: handle removed token
        """Join tokens back into tagged text segments.

        [(token, token_index, tag), ...] => [(tag, value), ...]

        - Tokens with the same ``token_index`` are concatenated without a
          separator, other tokens are joined with a single space.
        - The ``B-``/``I-`` prefix is dropped from the tags.
        - Prediction gaps inside a tag are repaired, e.g. ``B I O I => B I I I``:
          a token whose tag differs from the running non-O tag is still merged
          into it when that tag shows up again within the look-ahead slice
          ``doc[i : i+max_error_distance]`` (the slice starts at the current
          token, so the default of 2 looks one token ahead).
        """
        def base_tag(tag):
            # strip the 2-character 'B-' / 'I-' prefix; the O tag has none
            return tag if tag == O_tag else tag[2:]

        tag_values = []
        last_index = -1
        for i, (subword, wi, tag) in enumerate(doc):
            tag = base_tag(tag)

            if not tag_values:
                tag_values.append((tag, subword))
            else:
                last_tag, last_value = tag_values[-1]
                sep = '' if last_index == wi else ' '
                # The look-ahead tags still carry their BIO prefix, so they must be
                # stripped before comparing with the prefix-less last_tag.
                resumes = last_tag != O_tag and any(
                    base_tag(next_tag) == last_tag
                    for _, _, next_tag in doc[i:i+max_error_distance]
                )
                if last_tag == tag or resumes:
                    tag_values[-1] = (last_tag, last_value + sep + subword)
                else:
                    tag_values.append((tag, subword))

            last_index = wi

        return tag_values

    def detokenize_docs(self, docs):
        """Apply ``detokenize_doc`` (default look-ahead) to every doc."""
        return [self.detokenize_doc(doc) for doc in docs]



# Token-level view of the tagged docs: [(token, word_index, BIO tag), ...] per doc.
preserving_tokenizer = PreservingTokenizer()
tokenized_docs = preserving_tokenizer.tokenize_docs(tagged_docs)


# Compare the three representations of the first doc: tagged segments,
# tokens, and the segments rebuilt from the tokens.
print(tagged_docs[0][0:5]) # first doc first 5 tagged segments
print(tokenized_docs[0][0:15]) # first doc first 15 tokens
print(preserving_tokenizer.detokenize_docs(tokenized_docs)[0][0:5])

In [None]:
#--------------------------------------
#
#    Normalize data
#
#--------------------------------------

# Todo: use unk value? E.g. use only words contained by min 5% of every doc 

import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('stopwords')


# Strip accents from a Unicode string, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with every combining mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)


class FeaturePreprocessor():
    """Build per-token feature dictionaries (with neighbouring-token context) from tokenized docs."""

    def __init__(self, lang = 'hungarian'):
        # Words are accent-stripped by normalizeWord before the stop-word lookup,
        # so the stop words are folded the same way; otherwise accented stop
        # words could never match. A set keeps the per-token lookup O(1).
        self.stopwords = {unicodeToAscii(str(word).casefold()) for word in stopwords.words(lang)}
        self.stemmer = SnowballStemmer(lang)

    def normalizeWord(self, word):
        """Return ``(lowered, stemmed)``: the casefolded word and its stem, both accent-stripped."""
        lowered = str(word).casefold()
        stemmed = self.stemmer.stem(lowered)

        lowered = unicodeToAscii(lowered)
        stemmed = unicodeToAscii(stemmed)

        return lowered, stemmed

    def word_features(self, doc, index, shift=0): # todo reduce run time
        """Create the feature dictionary for the token at ``doc[index+shift]``.

        Feature names are prefixed with the shift (``'-1:'``, ``'+2:'``; no
        prefix for shift 0) so features of neighbouring tokens can be merged
        into one dictionary. ``BOS`` / ``EOS`` mark the first / last token of
        the doc; a one-token doc gets both.
        """
        index = index + shift
        word = doc[index][0]
        lowered, stemmed = self.normalizeWord(word)

        prefix = f'{shift}:' if shift < 0 else (f'+{shift}:' if shift > 0 else '')
        features = {
            # todo: isupper, istitle
            'bias': 1.0,
            f'{prefix}stem': stemmed,
            f'{prefix}[:3]': lowered[:3],
            f'{prefix}[:2]': lowered[:2],
            f'{prefix}[-3:]': lowered[-3:],
            f'{prefix}[-2:]': lowered[-2:],
            f'{prefix}isdigit': lowered.isdigit(),
            f'{prefix}isalpha': lowered.isalpha(),
            f'{prefix}isalnum': lowered.isalnum(),
            f'{prefix}isstop': lowered in self.stopwords
        }
        if index == 0:
            features[f'{prefix}BOS'] = True
        # independent check: the only token of a doc is both first and last
        if index == len(doc)-1:
            features[f'{prefix}EOS'] = True

        return features

    def n_gram(self, doc, index, n_step):
        """Create n-gram like features for the token at ``doc[index]``.

        Merges the features of the tokens at shifts ``-n_step .. +n_step``
        that exist in the doc (n = 2 * n_step + 1) and returns
        ``(features, doc[index])``.
        """
        features = {}
        for shift in range(-n_step, n_step+1):
            if index+shift >= 0 and index+shift < len(doc):
                features.update(self.word_features(doc, index, shift))

        return (features, doc[index])


    def transform(self, docs, n_step=2):
        """Return, for every doc, the list of ``n_gram`` results of all its tokens."""
        return [[self.n_gram(doc, index, n_step) for index in range(len(doc))] for doc in docs]
    


# Per doc: [(feature dict, (token, word_index, tag)), ...]
feature_preprocessor = FeaturePreprocessor()
prepared_docs = feature_preprocessor.transform(tokenized_docs)


# Plain indexing is used for the previews: the (dict, tuple) pairs are ragged,
# and recent NumPy versions refuse to build an array from ragged sequences.
print(' '.join(token for token, _, _ in tokenized_docs[0][:100]))
print('---------------------------------------')
print(prepared_docs[0][2]) # first doc, third token: (features, token)
print('---------------------------------------')
print(' '.join(tag for _, _, tag in tokenized_docs[0][:5])) # first doc first 5 tags

In [None]:
#--------------------------------------
#
#    Split doc into equal-length windows
#
#--------------------------------------


# TODO: use the longest tag value for calc the window size?
class SequenceSplitter():
    """Cut token sequences into overlapping fixed-size windows and stitch windows back together."""

    def split_length_evenly(self, total_len, max_sublength):
        """Split ``total_len`` into the fewest parts of at most ``max_sublength``, as evenly as possible.

        Returns an integer array of part lengths whose sum is ``total_len`` and
        whose values differ by at most 1 (empty for ``total_len <= 0``).
        Raises ValueError when ``max_sublength`` is not positive.
        """
        if max_sublength <= 0:
            raise ValueError('max_sublength must be positive')
        if total_len <= 0:
            return np.array([], dtype=int)

        # calculate split lengths
        quotient, remainder = divmod(total_len, max_sublength)
        if remainder == 0:
            # explicit dtype: an empty array would otherwise default to float
            return np.array([max_sublength] * quotient, dtype=int)
        lens = np.array([max_sublength] * quotient + [remainder], dtype=int)

        # move one unit at a time from the longest to the shortest part until
        # the difference between min and max is below 2
        min_len, max_len = lens.min(), lens.max()
        while max_len - min_len >= 2:
            lens[lens.argmin()] = min_len + 1
            lens[lens.argmax()] = max_len - 1
            min_len, max_len = lens.min(), lens.max()

        return lens

    def window_indexes(self, total_size, window_size, min_overlap_size):
        """Get the starting indexes of the windows covering ``total_size`` tokens.

        window = section + overlapping; total = section1 + section2 + ... + last_section.
        The last section has the full window size (nothing follows it to
        overlap with); the others are at most ``window_size - min_overlap_size``
        long, so consecutive windows share at least ``min_overlap_size`` tokens.
        A sequence that fits into one window gets the single start index 0.

        Raises ValueError unless ``0 <= min_overlap_size < window_size``.
        """
        if window_size <= 0:
            raise ValueError('window_size must be positive')
        if not 0 <= min_overlap_size < window_size:
            raise ValueError('min_overlap_size must be in [0, window_size)')
        if total_size <= window_size:
            return np.array([0], dtype=int)

        last_section_len = window_size
        sections_total_len = total_size - last_section_len
        max_section_len = window_size - min_overlap_size

        # calculate sections' various lengths
        section_lens = self.split_length_evenly(sections_total_len, max_section_len)
        section_lens = np.append(section_lens, last_section_len)

        # a window starts where its section starts: cumulative sum of the previous sections
        return section_lens.cumsum() - section_lens

    def n_paragraph(self, doc, window_size, min_overlap_size):
        """Split one doc into overlapping windows (slices of ``doc``) of at most ``window_size`` items."""
        starts = self.window_indexes(len(doc), window_size, min_overlap_size)
        return [doc[int(start) : int(start)+window_size] for start in starts]

    def split_docs(self, docs, window_size=150, min_overlap_size=50):
        """Split every doc into overlapping windows; returns a flat list of ``(doc_index, window)``."""
        splitted_docs = []
        for doc_i, doc in enumerate(docs):
            for window in self.n_paragraph(doc, window_size, min_overlap_size):
                splitted_docs.append((doc_i, window))
        return splitted_docs

    def _overlap_len(self, prev_window, window):
        """Return how many leading tokens of ``window`` repeat the tail of ``prev_window``.

        Tokens are compared by ``(subword, word_index)`` only, because the tags
        of the two windows are independent predictions. The longest matching
        suffix/prefix is used; 0 means no overlap was found.
        NOTE(review): a run of identical subwords of one word spanning the whole
        overlap would be ambiguous here.
        """
        prev_keys = [(item[0], item[1]) for item in prev_window]
        keys = [(item[0], item[1]) for item in window]
        for size in range(min(len(prev_keys), len(keys)), 0, -1):
            if prev_keys[-size:] == keys[:size]:
                return size
        return 0

    def concat_doc(self, doc, window_size=150, min_overlap_size=50): # doc: [[(subword, wi, tag), ...], ...]
        """Concat the overlapping windows of one doc into a single token list.

        Every token is emitted exactly once. In an overlap the token of the
        earlier window is used up to ``window_size - min_overlap_size`` tokens
        and beyond that for as long as its tag is not ``O_tag`` (so a tag
        crossing the boundary is taken from one window as a whole); the rest
        comes from the next window.
        """
        section_size = window_size - min_overlap_size

        # position of every window's first token within the concatenated doc
        offsets = [0]
        for prev_window, window in zip(doc, doc[1:]):
            overlap = self._overlap_len(prev_window, window)
            offsets.append(offsets[-1] + len(prev_window) - overlap)

        concated_doc = []
        next_pos = 0  # position of the next token still to be emitted
        last_window_i = len(doc) - 1
        for window_i, window in enumerate(doc):
            for token_i, item in enumerate(window):
                pos = offsets[window_i] + token_i
                # if a previous window already added this token
                if pos < next_pos:
                    continue
                tag = item[2]
                # past the section and outside any tag: hand over to the next
                # window, but only if that window really covers this position
                if (token_i > section_size and tag == O_tag
                        and window_i != last_window_i and pos >= offsets[window_i+1]):
                    break
                concated_doc.append(item)
                next_pos = pos + 1

        return concated_doc

    def concat_docs(self, docs, window_size=150, min_overlap_size=50):
        """Concat ``(doc_index, window)`` pairs back into one token list per doc.

        Consecutive pairs with the same ``doc_index`` form one doc. Empty
        input yields an empty list.
        """
        grouped = []
        last_doc_i = None
        for doc_i, window in docs:
            if grouped and doc_i == last_doc_i:
                grouped[-1].append(window)
            else:
                grouped.append([window])
                last_doc_i = doc_i

        return [self.concat_doc(doc, window_size, min_overlap_size) for doc in grouped]
                
                   
    
# Windowing parameters used for training; split_docs returns a flat list of
# (doc_index, window) pairs, each window holding (features, token) items.
sequence_splitter = SequenceSplitter()
window_size = 150
overlap_size = 50
splitted_docs = sequence_splitter.split_docs(prepared_docs, window_size, overlap_size)


print(splitted_docs[0][1][1]) # first (doc_index, window) pair -> its window -> second (features, token) item

In [None]:
# One feature-dict sequence (X) and one BIO-label sequence (y) per window.
X = [[item[0] for item in window] for _, window in splitted_docs]
y = [[item[1][2] for item in window] for _, window in splitted_docs]

# Windows can differ in length, so the sizes are reported directly instead of
# through np.array(...).shape, which fails on ragged nested lists in recent NumPy.
print(len(X), 'windows,', sum(len(window) for window in X), 'tokens')
print(len(y), 'label sequences,', sum(len(window) for window in y), 'labels')

In [None]:
#!pip install sklearn_crfsuite

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
#--------------------------------------
#
#    Train model
#
#--------------------------------------

import sklearn
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Base estimator; the hyper-parameters below are filled in by the search.
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True
)

# Search space: 2 algorithms x 2 iteration limits = 4 combinations.
# The commented-out entries are parameters that were tried or considered.
params_space = {
    'algorithm': ['l2sgd', 'lbfgs'], # lbfgs, l2sgd
    #'c2': [0.9],
    #'c1': [0.2, 0.7], # lbfgs only
    'max_iterations': [100, 200], #[700, 1000], 
    #'num_memories': [6, 10], # lbfgs only
}

# Weighted F1 over every label present in y (this includes the 'O' label).
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=np.unique(flatten(y)))

# NOTE(review): n_iter=10 is larger than the 4 combinations in params_space, so
# the search presumably degenerates to a full grid — confirm against sklearn.
# NOTE(review): the CV folds are made of windows, and windows of one document
# overlap, so folds may share tokens — scores are likely optimistic.
# NOTE(review): no random_state is set; results may vary between runs.
rs = RandomizedSearchCV(
    crf, params_space,
    cv=4,
    verbose=1,
    n_jobs=1,
    n_iter=10,
    scoring=f1_scorer
)

# From here on `crf` is the best estimator found by the search, not the base one.
rs.fit(X, y)
crf = rs.best_estimator_

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
#--------------------------------------
#
#    Evaluate and save model
#
#--------------------------------------

labels = list(crf.classes_)

# NOTE(review): predictions are made on X, the same windows the model was
# trained on, so this report measures fit, not generalization.
y_pred = crf.predict(X)

print(classification_report(flatten(y), flatten(y_pred), labels=labels))

# Overall weighted F1 without the 'O' label, shown as the cell's output.
# list.remove raises ValueError if the model has no 'O' class.
labels.remove('O')
metrics.flat_f1_score(y, y_pred, average='weighted', labels=labels)

In [None]:
from joblib import dump, load
# Persist the selected CRF in the working directory; the inference cells and
# init() load it back from this file name.
dump(crf, 'model.joblib')

# Inference

In [None]:
# Inference has no labels: every text is wrapped as one all-'O' segment so the
# training tokenizer can be reused unchanged.
# NOTE(review): the texts are the same raw_data rows used for training.
tagged_test = [[(O_tag, text)] for text in raw_data['ocr_text'].values]

tokenized_test = preserving_tokenizer.tokenize_docs(tagged_test)

print(tokenized_test[0][:15])

In [None]:
# Same feature extraction as in training (default n_step).
prepared_test = feature_preprocessor.transform(tokenized_test)

print(prepared_test[0][2])  # first doc, third token: (features, token)

In [None]:
# Uses split_docs' default window_size=150 / min_overlap_size=50, which equal
# the window_size / overlap_size values used for training.
splitted_test = sequence_splitter.split_docs(prepared_test)

print(splitted_test[0][1][1])  # first (doc_index, window) pair -> window -> second item

In [None]:
# Feature-dict sequences of the inference windows.
X_test = [[item[0] for item in window] for _, window in splitted_test]

# Report the size of X_test itself (not the training X); len() also works when
# windows differ in length, where np.array(...).shape would fail on recent NumPy.
print(len(X_test), 'windows,', sum(len(window) for window in X_test), 'tokens')

In [None]:
# Read the model back from the file written by dump() in the training section.
# joblib.load unpickles the file, so only load files from a trusted source.
loaded_crf = load('model.joblib')

In [None]:
# One predicted label sequence per window of X_test.
test_pred = loaded_crf.predict(X_test)

print(test_pred[0])

In [None]:
def merge_pred_with_docs(pred, docs):
    """Attach the predicted tags to the tokens of every window.

    Parameters:
        pred: one sequence of predicted tags per window.
        docs: ``(doc_index, window)`` pairs, each window being a sequence of
              ``(features, token)`` items where ``token[0]`` is the subword
              and ``token[1]`` its word index.

    Returns:
        ``[(doc_index, [(subword, word_index, predicted_tag), ...]), ...]``
        covering every token of every window.

    Raises:
        ValueError: if the number of windows or the length of any window does
        not match the predictions.
    """
    if len(pred) != len(docs):
        raise ValueError('pred and docs must contain the same number of windows')

    merged = []
    for (doc_i, window), window_pred in zip(docs, pred):
        # iterate the window itself; its length, not that of the
        # (doc_index, window) pair, decides how many tokens are merged
        if len(window) != len(window_pred):
            raise ValueError('a window and its prediction differ in length')
        tokens = [(item[1][0], item[1][1], tag) for item, tag in zip(window, window_pred)]
        merged.append((doc_i, tokens))
    return merged

# Windows of (subword, word_index, predicted tag), still keyed by doc index.
merged_pred = merge_pred_with_docs(test_pred, splitted_test)

print(merged_pred[0])

In [None]:
# Stitch the windows of each doc back together; concat_docs' default
# window_size / min_overlap_size (150 / 50) are the values used when splitting.
concat_pred = sequence_splitter.concat_docs(merged_pred)

print(concat_pred[0][:15])

In [None]:
# Join the predicted tokens into (tag, text) segments per doc.
detokenized_pred = preserving_tokenizer.detokenize_docs(concat_pred)

print(detokenized_pred[0][:5])

In [None]:
def extract_results(pred):
    """Reduce each doc's ``(tag, value)`` segments to one value per tag.

    ``O_tag`` segments are ignored. When a tag occurs several times, the
    longest value is kept (the earliest one on equal length). Returns one
    ``{tag: value}`` dict per doc, in input order.
    """
    def longest_per_tag(doc):
        best = {}
        for tag, value in doc:
            if tag == O_tag:
                continue
            if tag not in best or len(best[tag]) < len(value):
                best[tag] = value
        return best

    return [longest_per_tag(doc) for doc in pred]

  
# Final output: one {tag: value} dict per input document.
results = extract_results(detokenized_pred)
    
print(results[0])

In [None]:
def init():
    """Load and return the model stored in 'model.joblib' (relative to the working directory)."""
    return load('model.joblib')

In [None]:
def predict(texts, model):
    """Extract the tagged field values from raw texts with a trained model.

    Runs the inference pipeline of this notebook: wrap each text as one
    'O' segment -> tokenize -> build features -> split into windows ->
    ``model.predict`` -> merge, concatenate and detokenize the predictions
    -> keep one value per tag.

    Parameters:
        texts: iterable of raw document texts.
        model: object with a ``predict(X)`` method returning one tag
               sequence per window (e.g. the result of ``init()``).

    Returns:
        list with one ``{tag: value}`` dict per text; empty for no texts.

    Relies on the module-level ``preserving_tokenizer``,
    ``feature_preprocessor`` and ``sequence_splitter`` instances.
    """
    texts = list(texts)
    if not texts:
        # nothing to tag; avoids running the pipeline (and the model) on empty input
        return []

    tagged = [[(O_tag, text)] for text in texts]
    tokenized = preserving_tokenizer.tokenize_docs(tagged)
    # reuse the shared preprocessor instead of rebuilding the stemmer and the
    # stop-word list on every call
    prepared = feature_preprocessor.transform(tokenized)
    windows = sequence_splitter.split_docs(prepared)
    window_features = [[item[0] for item in window] for _, window in windows]

    window_pred = model.predict(window_features)

    merged = merge_pred_with_docs(window_pred, windows)
    concatenated = sequence_splitter.concat_docs(merged)
    detokenized = preserving_tokenizer.detokenize_docs(concatenated)
    return extract_results(detokenized)