In [None]:
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.insert(1, '..')
from classes import DataLoader, MyIterator, Chunker, WordEmbedding, Evaluator, MyBertEmbedding
from classes.utils import load_presidio, google_search, results, save_hits, IC, Utility_preservation
from tqdm.auto import tqdm, trange
import numpy as np
import pandas as pd
import ast
import math

## Load data

In [None]:
# The wiki corpus is read from and written back to the same directory.
in_dir = out_dir = '../data/wiki/'
loader = DataLoader(in_dir, out_dir)
# Read the already-processed XML files into the loader.
loader.load(load_xml=True)

In [None]:
# Fetch the documents held by the loader; `soups` feeds the Presidio step
# and the final `export_generalized` call further down.
soups = loader.get_soups()

## Load Presidio annotations

In [None]:
# Pass the documents through `load_presidio` and keep the result under the same name.
# NOTE(review): because `soups` is rebound to its own transformation, re-running this
# cell feeds already-processed documents back in — confirm `load_presidio` is safe
# to apply twice, or re-run the `loader.get_soups()` cell first.
soups = load_presidio(soups)

In [None]:
# loader = DataLoader('./generalized/', out_dir)
# loader.load(load_xml=None)

In [None]:
# Chunked, labelled documents; consumed later by `export_generalized`.
documents = loader.get_chunk_lbl(
    chunking=True,
    refresh=True,
    tokenizer='spacy',
    testTokenizer=True,
    originalChunk=True,
)

In [None]:
# Text lines of the original abstracts, keyed per document by the loader.
lines = loader.get_lines(
    abs_tag='originalabstract',
    chunk=True,
    tokenizer='stanford',
    testTokenizer=True,
    originalChunk=True,
)
# Only the values are needed as training input for the embedding model.
docs = [*lines.values()]

## Load and fine-tune our pre-trained model on the new articles that belong to the generalized entities

In [None]:
# Word-embedding hyperparameters. All of them are forwarded to `WordEmbedding(...)`
# or `embeddingModel.fit(...)` in the training cell below.
# NOTE(review): the per-value meanings below follow the usual word2vec naming and
# are assumptions — confirm against the WordEmbedding class.
# presumably the embedding dimensionality
vec_size = 300
# presumably the context window size
window = 10
# presumably 1 selects skip-gram (gensim convention) — TODO confirm
sg = 1
# presumably the minimum term frequency for a word to be kept
min_count = 1
# number of epochs handed to `embeddingModel.fit`
epochs = 10
# embedding type selector, passed as `t=`
t = 'word2vec'

In [None]:
%%time
# Build the embedding model from the hyperparameters defined in the previous cell.
embeddingModel = WordEmbedding(sg, vec_size, window, min_count, workers=10, t=t)
# Second corpus: the new articles, read from and written back to the same directory.
# NOTE(review): this rebinds `in_dir`, which the data-loading cell set to the wiki
# directory; any cell executed afterwards sees the NewArticles path.
in_dir =  '../data/NewArticles/'
loader1 = DataLoader(in_dir, in_dir)
loader1.load(load_xml=None, generate=True)
# NOTE(review): no `tokenizer=` is passed here, while the wiki `lines` were fetched
# with tokenizer='stanford' — confirm both corpora are tokenized the same way.
lines1 = loader1.get_lines(abs_tag='originalabstract', chunk=True, refresh=True, originalChunk=True)
docs1 = list(lines1.values())

# Train on the concatenation of both corpora (wiki `docs` + new-article `docs1`).
embeddingModel.fit(MyIterator(docs + docs1), epochs)
# Alternative to fitting: load a previously saved model instead.
# embeddingModel.load()

In [None]:
%%time
# Combine the wiki loader and the fitted embedding model in an Evaluator.
evaluator1 = Evaluator(loader, embeddingModel)

In [None]:
%%time
# Produce the generalized documents. `gen_soups` is read as a module-level global
# by `get_all` in the utility section below, so this cell must run before it.
gen_soups = evaluator1.export_generalized(documents, soups)

In [None]:
# NOTE(review): `results` is imported from classes.utils; what the 'fadi & hassan'
# argument selects and what the call outputs is not visible here — confirm.
results('fadi & hassan')

## Calculate the utility of the anonymized text

In [None]:
def get_all(tag, tp='g'):
    """Aggregate `Utility_preservation` over every document in `gen_soups`.

    Relies on the notebook globals `gen_soups` and `loader`, so the cells that
    create them must have been executed first.

    Parameters
    ----------
    tag : str
        Annotation tag to evaluate; also used as the progress-bar label.
    tp : str, default 'g'
        Forwarded as `evaluate=` to `Utility_preservation`. `export_table`
        calls this with 'r' for its "Suppression" column and 'g' for its
        "Generalization" column.

    Returns
    -------
    tuple
        `(np.average(first values), sum(second values))` of the pairs
        returned by `Utility_preservation`, one pair per document.
    """
    outcomes = [
        Utility_preservation(gen_soups[doc_id], tag=tag, evaluate=tp, loader=loader)
        for doc_id in tqdm(gen_soups, desc=tag)
    ]
    # Split the per-document pairs into their two components.
    ic_values = [ic for ic, _ in outcomes]
    masked_counts = [count for _, count in outcomes]
    return np.average(ic_values), sum(masked_counts)

def export_table(evaluator, soups,
                 rows=('nertext3', 'nertext4', 'nertext7', 'presidio', 'word2vec_gen'),
                 alies=('NER 3', 'NER 4', 'NER 7', 'Presidio', 'Our method'), table=None, df=None):
    """Build a LaTeX comparison table of utility scores, one row per method.

    For every tag in `rows`, `get_all` is called twice (`tp='r'` and `tp='g'`)
    and the results are appended to the LaTeX source as one table row.

    Parameters
    ----------
    evaluator : object
        Not used by this function; kept so existing calls keep working.
    soups : sized collection
        Only its length is used, as the divisor for the masked-term total.
    rows : sequence of str
        Annotation tags passed to `get_all`.
    alies : sequence of str
        Display label for each entry of `rows` (same order, same length).
    table : str, optional
        LaTeX source to append to; a header row is created when omitted.
    df : object, optional
        Acts as a flag only: when it is not None, a summary DataFrame is built
        from the collected values and displayed. Its contents are ignored.

    Returns
    -------
    str
        The LaTeX table source.
    """
    columns = ['Suppression', 'Generalization', 'Avg. masked terms']
    n_docs = len(soups)
    data = {}
    if table is None:
        # `\\hline` instead of `\hline`: same runtime text, but no invalid
        # escape sequence (a SyntaxWarning on Python 3.12+).
        table = "  & Suppression & Generalization & Avg. masked terms \\\\ \\hline"
    for i in trange(len(rows)):
        s, masked_total = get_all(tag=rows[i], tp='r')
        # The count returned by the generalization pass is not used.
        g, _ = get_all(tag=rows[i], tp='g')
        # Guard against an empty collection instead of raising ZeroDivisionError.
        c = masked_total / n_docs if n_docs else float('nan')
        data[alies[i]] = dict(zip(columns, [s, g, c]))
        table += "\n%s & %.2f\\%%  & %.2f\\%% & %.2f\\%% \\\\ \\hline " % (alies[i], s, g, c)

    summary = None
    if df is not None:
        summary = pd.DataFrame(data).transpose()[columns]
    save_hits()
    # Only render something when a summary was requested; displaying None is noise.
    if summary is not None:
        display(summary)
    return table

In [None]:
# Store the LaTeX source under its own name so the `lines` dict created in the
# data-loading cell is not overwritten by a string.
latex_table = export_table(evaluator1, gen_soups, df=pd.DataFrame())
# print(latex_table)