In [None]:
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.insert(1, '..')
from classes import DataLoader, MyIterator, Chunker, WordEmbedding, Evaluator, MyBertEmbedding
from classes.utils import load_presidio
from tqdm.auto import tqdm, trange
import pandas as pd
import numpy as np
from collections import OrderedDict

## Load data

In [None]:
# Input and output folders handed to DataLoader; both currently point at the
# same wiki data directory (relative to the notebook's working directory).
in_dir = '../data/wiki/'
out_dir = '../data/wiki/'
# `loader` is reused by later cells (get_soups, get_chunk_lbl, Evaluator).
loader = DataLoader(in_dir, out_dir)
# Load the processed XML files.
loader.load(load_xml=True)

In [None]:
# Fetch the loader's "soups" (presumably one parsed XML tree per document —
# TODO confirm against DataLoader). Later cells pass `soups` to load_presidio
# and to the evaluator.
soups = loader.get_soups()

## Load Presidio annotations

In [None]:
# Pass the soups through load_presidio and rebind the same name to the result.
# NOTE(review): because `soups` is overwritten, re-running this cell feeds the
# already-processed value back into load_presidio — confirm that is harmless.
soups = load_presidio(soups)

In [None]:
# Build the chunked, labelled documents with the 'stanford' tokenizer.
# `documents` is notebook-global state: export_table reads it directly (it is
# not a parameter there), and it is passed to every evaluator call below.
documents = loader.get_chunk_lbl(chunking=True, tokenizer='stanford')

In [None]:
# documents['actor_10429.xml']

## Load our pre-trained model

In [None]:
# word embedding parameters
# Word-embedding parameters forwarded to WordEmbedding in the next code cell.
# NOTE(review): the names look like gensim Word2Vec hyper-parameters
# (sg=1 would mean skip-gram) — confirm against WordEmbedding.
vec_size = 300
window = 10
sg = 1
min_count = 1
# `epochs` is not referenced anywhere else in the visible notebook.
epochs = 10
# Passed as WordEmbedding(t=...); presumably selects the embedding type — TODO confirm.
t = 'word2vec'

In [None]:
%%time
# Build the embedding model wrapper from the parameters defined above.
embeddingModel = WordEmbedding(sg, vec_size, window, min_count, t=t)
# Load the model with no explicit path (the commented-out cells below show
# load(model_path, model_type=...) as the alternative form).
embeddingModel.load()

## Or load wiki fasttext

In [None]:
%%time
# model_path = "fasttext_models/wiki.simple.bin"
# model_path = "fasttext_models/cc.en.300.bin"
# model_path = "fasttext_models/wiki.en.bin"

# embeddingModel.load(model_path, model_type='fasttext')

## Or load the Google News vectors with gensim

In [None]:
# model_path = 'word2vec_models/GoogleNews-vectors-negative300.bin.gz'
# embeddingModel.load(model_path, model_type='word2vec')

## Or load a pre-trained BERT model

In [None]:
# %%time
# embeddingModel.model = MyBertEmbedding(size='base')

In [None]:
%%time
# Evaluator bound to the data loader and the embedding model loaded above;
# every export/evaluation cell below goes through this instance.
evaluator1 = Evaluator(loader, embeddingModel)

## Evaluate the documents

In [None]:
%%time
# Run the evaluator's export at threshold 0.25; the resulting soups are what
# the export_table calls below score (also at t=0.25).
gen_soups = evaluator1.export(documents, soups, threshold=0.25)

In [None]:

def export_table(evaluator, soups,
                 rows=['nertext3', 'nertext4', 'nertext7', 'presidio', 'word2vec'],
                 alies=['NER 3', 'NER 4', 'NER 7', 'Presidio', 'Our method'],
                 t=0.15, tp=0, table=None, df=None):
    """Score each tagger and render the results as LaTeX table rows.

    For every tag in ``rows`` the evaluator is queried for a
    (precision, recall, F1) triple, which is scaled to percentages and
    appended to ``table`` as one LaTeX row. The ``'word2vec'`` row is
    typeset in bold.

    Reads the notebook-level global ``documents``; it is not a parameter.

    Parameters
    ----------
    evaluator : object
        Must provide ``evaluate_all``, ``coefficient_of_variation`` and
        ``evaluate``; which one is called depends on ``tp``.
    soups : object
        Forwarded unchanged to the evaluator.
    rows : list of str
        Tag names passed to the evaluator as ``tag=``.
    alies : list of str
        Display label for each entry of ``rows`` (same order, at least as long).
    t : float
        Threshold forwarded to the evaluator.
    tp : int
        0 -> ``evaluate_all``; 1 -> ``coefficient_of_variation``;
        anything else -> ``evaluate`` on the fixed document
        ``'actor_19602.xml'``.
    table : str or None
        LaTeX header the rows are appended to; a default
        Precision/Recall/F1 header is used when None.
    df : object or None
        Used only as a flag: when not None, a summary DataFrame is built and
        shown with ``display``. The value itself is never read.

    Returns
    -------
    str
        The LaTeX table text (header plus one row per tag).
    """
    metric_names = ["Precision", "Recall", "F1"]
    if table is None:
        table = "  & Precision & Recall & F1 \\\\ \\hline"

    plain_row = "\n%s & %.2f\\%%  & %.2f\\%%  & %.2f\\%% \\\\ \\hline "
    bold_row = "\n%s & \\textbf{%.2f\\%%}  & \\textbf{%.2f\\%%}  & \\textbf{%.2f\\%%} \\\\ \\hline "

    data = {}
    for i, tag in enumerate(rows):
        label = alies[i]
        if tp == 0:
            p, r, f1 = evaluator.evaluate_all(documents, soups, threshold=t, tag=tag, silent=True)
        elif tp == 1:
            p, r, f1 = evaluator.coefficient_of_variation(documents, soups, threshold=t, tag=tag)
        else:
            actor_id = 'actor_19602.xml'
            # Use the evaluator that was passed in, not the notebook global.
            p, r, f1 = evaluator.evaluate(actor_id, documents, soups, threshold=t, tag=tag)
        # Scores are reported as percentages.
        p, r, f1 = p * 100, r * 100, f1 * 100
        data[label] = dict(zip(metric_names, (p, r, f1)))
        # Our own method's row is highlighted in bold.
        row_format = bold_row if tag == 'word2vec' else plain_row
        table += row_format % (label, p, r, f1)

    # Only build and show the summary frame when the caller asked for one.
    if df is not None:
        summary = pd.DataFrame(data).transpose()[metric_names]
        if tp != 0:
            # Non-default modes label their columns with a " CP" suffix.
            summary = summary.rename(columns={name: name + " CP" for name in metric_names})
        display(summary)
    return table

In [None]:
# LaTeX table of the scores from evaluate_all (tp=0) at threshold 0.25.
# `df` only acts as a "show the summary DataFrame" flag inside export_table;
# the empty frame passed here is never read.
lines = export_table(evaluator1, gen_soups, t=0.25, tp=0, df=pd.DataFrame())
# print(lines)

In [None]:
# LaTeX table of the coefficient_of_variation scores (tp=1) at threshold 0.25,
# with a matching "CP" header. The backslash before "hline" is escaped so the
# literal no longer relies on an invalid "\h" escape sequence; the resulting
# string is unchanged.
# `df` is only a "show the summary DataFrame" flag, so pass a plain boolean.
lines = export_table(evaluator1, gen_soups, t=0.25, tp=1,
                     table="  & Precision CP & Recall CP & F1 CP \\\\ \\hline", df=True)
# print(lines)

## Export the anonymized documents

In [None]:
# Export the anonymized documents from the original (Presidio-annotated) soups.
# NOTE(review): this uses threshold=0.15, whereas the evaluation cells above
# export and score at 0.25 — confirm the difference is intentional.
new_soups = evaluator1.export(documents, soups, threshold=0.15)