In [None]:
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.insert(1, '..')
from classes import DataLoader, MyIterator, Chunker, WordEmbedding, Evaluator, MyBertEmbedding
from classes.utils import load_presidio, load_original_article_from_wikipedia, pre_processing, add_annotaion_tag, nlp1
from tqdm.auto import tqdm, trange
import pandas as pd
import numpy as np
from collections import OrderedDict
from sklearn.model_selection import train_test_split
import spacy

## Set some environment variables to make tensorflow behavior deterministic

In [None]:
# tensorflow, os and random are imported in this cell; np comes from the
# notebook's opening import cell.
import tensorflow as tf
import os
import random
# Single seed shared by every random number generator seeded below.
SEED = 2020
# Environment flag set as part of making TensorFlow behave deterministically.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably does not change hashing in this kernel -- confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed the Python, NumPy and TensorFlow generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

## Load data

In [None]:
# Input and output directories handed to DataLoader; both point at the same folder.
in_dir = '../data/wiki/'
out_dir = '../data/wiki/'
loader = DataLoader(in_dir, out_dir)
# load the processed xml files
loader.load(load_xml=True)

In [None]:
# Fetch the soups from the loader and wrap them in an OrderedDict
# (presumably keyed by article/file name -- confirm in DataLoader).
soups = loader.get_soups()
soups = OrderedDict(soups)

## Load Presidio annotation

In [None]:
# Rebind soups to the result of load_presidio; presumably it attaches the
# Presidio annotations to every soup -- confirm in classes.utils.
soups = load_presidio(soups)

In [None]:
# Chunked, labelled documents produced with the 'spacy' tokenizer; they are
# passed to Evaluator.export_generalized further down.
documents = loader.get_chunk_lbl(chunking=True, refresh=True, tokenizer='spacy', testTokenizer=True, originalChunk=True)

In [None]:
# Lines read from the 'originalabstract' tag. Note that this call uses the
# 'stanford' tokenizer while the documents cell uses 'spacy'.
lines = loader.get_lines(abs_tag='originalabstract', chunk=True, tokenizer='stanford', testTokenizer=True, originalChunk=True)
# Only the values are kept; they form part of the embedding training corpus.
docs = list(lines.values())

## Load and fine-tune our pre-trained model on the new articles that belong to the generalized entities

In [None]:
# word embedding parameters (forwarded to WordEmbedding and its fit call)
# Embedding dimensionality.
vec_size = 300
# Context window size.
window = 10
# presumably the gensim-style skip-gram switch -- confirm in WordEmbedding
sg = 1
# presumably the minimum word frequency to keep -- confirm in WordEmbedding
min_count = 1
# Number of epochs passed to embeddingModel.fit.
epochs = 10
# Embedding type, passed to WordEmbedding as t=.
t = 'fasttext'

In [None]:
%%time
# build the embedding Model
embeddingModel = WordEmbedding(sg, vec_size, window, min_count, workers=10, t=t)
# Load a second corpus from the NewArticles folder.
# NOTE: in_dir is rebound here, so it no longer points at '../data/wiki/'
# once this cell has run.
in_dir =  '../data/NewArticles/'
loader1 = DataLoader(in_dir, in_dir)
loader1.load(load_xml=None, generate=True)
lines1 = loader1.get_lines(abs_tag='originalabstract', chunk=True, refresh=True, originalChunk=True)
docs1 = list(lines1.values())

# train the embedding Model on the wiki lines plus the new-article lines
embeddingModel.fit(MyIterator(docs + docs1), epochs)
# load the model
# embeddingModel.load()

## Or load wiki fasttext

In [None]:
# # model_path = "fasttext_models/wiki.simple.bin"
# model_path = "fasttext_models/wiki.en.bin"

# embeddingModel.load(model_path, model_type='fasttext')

# Download the original articles from Wikipedia

In [None]:
# Build pages_df from the soups; later cells read its 'text' and 'label' columns.
# (Presumably one row per sentence of the original article -- confirm in classes.utils.)
pages_df = load_original_article_from_wikipedia(soups)

In [None]:
# print number of sentences per article
# len(pages_df[pages_df['label'] != 'OTHER']['key'].value_counts()), pages_df[pages_df['label'] != 'OTHER']['key'].value_counts()

## Train/validation split

In [None]:
# 80/20 split stratified on 'label'.
# NOTE(review): random_state is the literal 42 rather than the notebook's
# SEED (2020) -- confirm this is intended.
train_df, dev_df = train_test_split(pages_df, stratify=pages_df['label'], test_size=0.2, random_state=42)

In [None]:
# Keep only training rows whose text has at least two space-separated tokens.
# The dev split is left unfiltered.
train_df = train_df[train_df['text'].apply(lambda x: len(x.split(' ')) >= 2)]

## Pre-processing the data

In [None]:
# pre_processing yields the texts and labels of both splits plus unique_labels.
# NOTE(review): train_labels / dev_labels are not read by the later cells shown
# in this notebook; y_train / y_dev are derived from train_df / dev_df instead.
# Confirm that the texts keep the row order of those frames.
train_texts, train_labels, dev_texts, dev_labels, unique_labels = pre_processing(train_df, dev_df)

## Build bert classifier

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer, BertConfig, DistilBertConfig, \
                         TFBertModel, BertTokenizerFast, DistilBertTokenizer, BatchEncoding, \
                         TFBertForSequenceClassification, TFDistilBertModel
from tokenizers import Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [None]:
# Checkpoint to fine-tune. Switch to the commented name to try DistilBERT.
MODEL_NAME = 'bert-base-cased'
# MODEL_NAME = 'distilbert-base-cased'

# Hidden size associated with the checkpoint family ("large" vs. everything else).
vector_size = 1024 if 'large' in MODEL_NAME else 768

# Choose the config / model / tokenizer classes from the checkpoint name.
# NOTE(review): the DistilBERT branch loads TFDistilBertModel, which looks like
# the bare encoder rather than a sequence classifier -- confirm before using it.
_use_distil = 'distil' in MODEL_NAME
if _use_distil:
    _config_cls, _model_cls, _tokenizer_cls = DistilBertConfig, TFDistilBertModel, DistilBertTokenizer
else:
    _config_cls, _model_cls, _tokenizer_cls = BertConfig, TFBertForSequenceClassification, BertTokenizerFast

config = _config_cls.from_pretrained(MODEL_NAME, output_hidden_states=False, num_labels=len(unique_labels))
bert_model = _model_cls.from_pretrained(MODEL_NAME, config=config)
tokenizer = _tokenizer_cls.from_pretrained(MODEL_NAME)

if not _use_distil:
    # Give the last layer of the BERT classifier a softmax activation.
    bert_model.layers[-1].activation = tf.keras.activations.softmax


In [None]:
# Compile the model
# Adam with a 5e-5 learning rate. The categorical cross-entropy loss is paired
# with the one-hot (to_categorical) targets built in the label-encoding cell.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
bert_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

## Input encoding

In [None]:
# Both splits are tokenized with the same settings: pad to the longest
# sequence, truncate at 512 tokens and return TensorFlow tensors.
_encode_kwargs = dict(padding=True, truncation=True, max_length=512, return_tensors='tf')
train_encodings = tokenizer(list(train_texts), **_encode_kwargs)
dev_encodings = tokenizer(list(dev_texts), **_encode_kwargs)

In [None]:
# Fit the encoder on every label in pages_df so that train and dev share one
# label -> index mapping.
le = LabelEncoder()
le.fit(pages_df['label'])
# Pass num_classes explicitly: without it to_categorical infers the width from
# the largest index present, so a split that lacks the highest-indexed class
# would get a narrower one-hot matrix than the other split.
# NOTE(review): the model is built with num_labels=len(unique_labels); confirm
# that it equals len(le.classes_).
y_train = to_categorical(le.transform(train_df['label']), num_classes=len(le.classes_))
y_dev = to_categorical(le.transform(dev_df['label']), num_classes=len(le.classes_))

In [None]:
# Pair the tokenizer output (converted to a plain dict) with the one-hot labels
# so that every dataset element is an (inputs, target) tuple.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
# Same construction for the dev split.
dev_dataset = tf.data.Dataset.from_tensor_slices((
    dict(dev_encodings),
    y_dev
))

## Train BERT

In [None]:
# Batch size applied to both datasets via Dataset.batch().
bsize = 18
# The datasets are already batched, so batch_size is NOT passed to fit():
# Keras documents that batch_size must not be specified for dataset inputs.
# Training stops early when val_accuracy has not improved by at least 1e-4
# for 5 consecutive epochs.
history = bert_model.fit(
    train_dataset.batch(bsize),
    epochs=20,
    validation_data=dev_dataset.batch(bsize),
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=5, min_delta=0.0001)],
)

## Load the generalized text

In [None]:
%%time
# Build the evaluator from the wiki loader and the trained embedding model.
evaluator1 = Evaluator(loader, embeddingModel)
# threshold=0.25 is passed straight to export_generalized; its meaning is
# defined there.
gen_soups = evaluator1.export_generalized(documents, soups, threshold=0.25)
# Number of generalized soups produced.
print(len(gen_soups))

In [None]:
# Add annotation tag: rebind gen_soups to the result of add_annotaion_tag
# (presumably it adds the 'annotation' tag read by the evaluation loop -- confirm).
gen_soups = add_annotaion_tag(gen_soups)

## Use BERT to predict the name of the actor that each Wikipedia article summary belongs to

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from collections import Counter, OrderedDict
def _aligned_units(soup, tag='nertext3'):
    """Parse the text under ``tag`` into aligned (original, generalized) word lists.

    The text is split on single spaces. A token containing all of ``{``, ``}``,
    ``[`` and ``]`` is read as ``{generalized[original]}``: the original part
    is split on ``_`` into words and only the first ``_``-separated word of the
    generalized part is kept. Any other token is split on ``_`` and used
    unchanged on both sides.

    Returns:
        A list with one ``(orig_words, gen_words)`` pair per space-separated
        token. ``orig_words`` always holds at least one element.
    """
    units = []
    for token in soup.find(tag).text.split(' '):
        if all(ch in token for ch in '{}[]'):
            bracket = token.index('[')
            orig_words = token[bracket + 1:token.index(']')].split('_')
            # Only the first word of the generalization is kept.
            gen_words = [token[1:bracket].split('_')[0]]
        else:
            orig_words = token.split('_')
            gen_words = token.split('_')
        units.append((orig_words, gen_words))
    return units


def my_split2(soup, tag='nertext3'):
    """Return the original and the generalized word lists of ``soup``'s ``tag`` text.

    Args:
        soup: object whose ``find(tag)`` result exposes the text as ``.text``.
        tag: name of the element to read.

    Returns:
        ``(orig_text, gen_text)``: two flat lists of words. They differ in
        length whenever an annotated entity spans several original words,
        because each entity contributes a single generalized word.
    """
    orig_text, gen_text = [], []
    for orig_words, gen_words in _aligned_units(soup, tag=tag):
        orig_text.extend(orig_words)
        gen_text.extend(gen_words)
    return orig_text, gen_text


def predict(soup, tag='nertext3'):
    """Classify every sentence of the original and of the generalized text.

    Sentence boundaries are found on the original text with ``nlp1`` and then
    transferred to the generalized text. The transfer walks the aligned units
    instead of reusing the original word counts as offsets into the (shorter)
    generalized word list, so a multi-word entity no longer shifts every
    following generalized sentence. A unit belongs to the sentence in which
    its first original word falls.

    Args:
        soup: object whose ``find(tag)`` result exposes the text as ``.text``.
        tag: name of the element to read.

    Returns:
        ``(orig_labels, gen_labels)``: two lists with one decoded label per
        sentence. Both are empty when no sentence is found.
    """
    units = _aligned_units(soup, tag=tag)
    orig_text = [word for orig_words, _ in units for word in orig_words]
    orig_sents = [str(sent) for sent in nlp1(' '.join(orig_text)).sents]
    if not orig_sents:
        # Nothing to classify; avoid calling the tokenizer with an empty batch.
        return [], []

    gen_sents = []
    unit_idx = 0    # next unit to place in a sentence
    unit_start = 0  # index in orig_text of that unit's first word
    sent_end = 0    # index in orig_text one past the current sentence
    for sent in orig_sents:
        sent_end += sent.count(' ') + 1
        words = []
        while unit_idx < len(units) and unit_start < sent_end:
            orig_words, gen_words = units[unit_idx]
            words.extend(gen_words)
            unit_start += len(orig_words)
            unit_idx += 1
        gen_sents.append(' '.join(words))

    orig_tokens = tokenizer(orig_sents, padding=True, truncation=True, max_length=512, return_tensors='tf')
    gen_tokens = tokenizer(gen_sents, padding=True, truncation=True, max_length=512, return_tensors='tf')
    # First model output holds the per-class scores; take the best class per sentence.
    orig_out = bert_model(orig_tokens)[0].numpy().argmax(axis=1)
    gen_out = bert_model(gen_tokens)[0].numpy().argmax(axis=1)
    return list(le.inverse_transform(orig_out)), list(le.inverse_transform(gen_out))
    
def evaluate(soup, tag='nertext3'):
    """Predict one label for ``soup`` by majority vote over its sentences.

    Args:
        soup: object exposing ``find('title').text`` (the true label) and
            accepted by ``predict``.
        tag: element to classify. The special value ``'original'`` reads the
            ``'nertext3'`` element but votes with the predictions made on the
            original (non-generalized) sentences.

    Returns:
        ``(label, pred)``: the title text and the most frequent sentence-level
        prediction. ``pred`` is ``'OTHER'`` when the two most frequent
        predictions are tied, or when there are no predictions at all
        (previously the latter raised ``IndexError``).
    """
    use_original = tag == 'original'
    if use_original:
        tag = 'nertext3'
    label = soup.find('title').text
    orig_labels, gen_labels = predict(soup, tag=tag)

    votes = Counter(orig_labels if use_original else gen_labels)
    top = votes.most_common(2)
    if not top:
        return label, 'OTHER'
    if len(top) == 2 and top[0][1] == top[1][1]:
        # No unique winner among the sentence-level predictions.
        return label, 'OTHER'
    return label, top[0][0]

## Evaluate BERT prediction

In [None]:
# Element names to evaluate and the display name used for each in the results.
tags = ['original', 'annotation', 'nertext3', 'nertext4', 'nertext7', 'presidio', 'word2vec', 'word2vec_gen']
aliases = ['Original summary', 'Manual annotation', 'NER 3', 'NER 4', 'NER 7', 'Presidio', 'Our method', 'Our method + gen']
# The two lists are paired position by position; fail loudly if they diverge.
assert len(tags) == len(aliases), 'tags and aliases must have the same length'
data = []
cols = ['tag', 'file', 'orig', 'pred']

for tag, alias in tqdm(zip(tags, aliases), total=len(tags)):
    for key in gen_soups:
        soup = gen_soups[key]
        # Only articles whose title is one of the classifier's labels are evaluated.
        if soup.find('title').text in unique_labels:
            orig, pred = evaluate(soup, tag=tag)
            data.append([alias, key, orig, pred])
# One row per (input variant, article): true label vs. predicted label.
df1 = pd.DataFrame(data, columns=cols)

In [None]:
# Summary level: number and share of correctly predicted articles per input variant.
columns = ['Input', 'Correct predictions', 'Correct predictions %']
data = []
_by_tag = df1.groupby('tag', sort=False)
for tag, group in tqdm(_by_tag, total=df1['tag'].nunique()):
    count = int((group['orig'] == group['pred']).sum())
    # NOTE(review): the share is a 0-1 fraction over ALL generalized soups,
    # including those skipped because their title is not in unique_labels --
    # confirm this denominator is intended.
    data.append([tag, count, count / len(gen_soups)])
df3 = pd.DataFrame(data, columns=columns)
df3