# Preprocessing

In [1]:
import pickle
import json
import os
from preprocessing import clean_gt, clean_raw, label_entity

In [None]:
split = pickle.load(open('data/train_test_eval_filenames_new.pkl', 'rb'))
split

In [39]:
#train = 94, test = 30, eval = 10

In [3]:
"""
import random
new_split = {"train":[], "test": [], "eval": []}
for key in split:
    for page in split[key]:
        mag = page.split("_")[0].split("-")[0]
        year = page.split("_")[1]
        
        if mag == "dkm" and (year == "1941" or year == "2010"):
            new_split["train"].append(page)
        if (year == "1990"):
            new_split["test"].append(page)
        if mag =="sbz" and (year == "1895" or year == "1940" or year == "1965" or year == "2010"):
            new_split["train"].append(page)

eval_set = random.sample(new_split["train"], int(len(new_split["train"])/10)) #set 10% of train aside for eval
for page in eval_set:
    new_split["train"].remove(page)
new_split["eval"] = eval_set

with open('train_test_eval_filenames_new.pkl', 'wb') as out:
    pickle.dump(new_split, out)
"""

In [4]:
data = {
    "train": [],
    "test": [],
    "eval": []
}
gt_data = []
for mag in ["dkm", "sbz"]:
    for year in os.listdir(f'data/raw/link/{mag}'):
        with open(os.path.join("data/raw/link", mag, year)) as f:
            input_linked = json.load(f)
        with open(os.path.join("data/ground-truth", mag, year)) as f:
            gt = json.load(f)
        gt = clean_gt(gt)
        gt_data += gt
        input_linked = clean_raw(input_linked)

        #due to non-determinism in the flair NER:
        all_refs_gt = [g["page"]+g["coord"] for g in gt] 
        all_refs_linked = [ent["page"]+ent["coord"] for l in input_linked for ent in l]
        all_valid_refs = set(all_refs_gt).intersection(set(all_refs_linked))

        for ent_variations in input_linked:
            for key in split:
                ent_instances = []
                for ent in ent_variations:
                    if ent["page"] in split[key]:
                        if (ent["page"]+ent["coord"]) in all_valid_refs:
                            ent_instances.append({"ent": ent, "label": label_entity(ent, gt)})
                if ent_instances:
                    data[key].append(ent_instances)

In [5]:
with open("data/processed.pkl", "wb") as f:
    pickle.dump(data, f)

# Candidate Generation

In [1]:
import pickle
from tqdm.notebook import  tqdm
from candidate_generation import create_metagrid_candidates

In [2]:
# load everytime you run this as we pop keys to keep data clean..
with open("data/processed.pkl", "rb") as f:
    data = pickle.load(f)

for split in ["train", "eval", "test"]:
    ent_cand_label = []
    i = 0
    for entity_list in tqdm(data[split], smoothing=0.01):
        i += 1
        # Create candidates only for the first entry in the list as all the entity information is always the same
        # The only thing that changes are pages and page_coordinates
        candidates = create_metagrid_candidates(ent=entity_list[0]["ent"])
        # Generate the list of page_coordinates and the corresponding labels!
        coord_list = []
        label_list = []
        for ent_dict in entity_list:
            ent = ent_dict["ent"]
            coord_list.append({
                "page": ent.pop("page", ""), 
                "coords": ent.pop("coord", "")
            })
            label_list.append(ent_dict["label"])
        ent_cand_label.append({"entity": ent, "candidates": candidates, "occurences": coord_list, "labels": label_list})
        if i % 100 == 0:
            with open(f"candidates-gnd-{split}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)
                    
    with open(f"candidates-gnd-{split}.pkl", "wb") as f:
       pickle.dump(ent_cand_label, f)

  0%|          | 0/574 [00:00<?, ?it/s]

# Feature Generation

### To get the relevant fastttext model uncomment and run the following cell

In [38]:
# from gensim.models.fasttext import FastText, load_facebook_vectors
# model = load_facebook_vectors("cc.de.300.bin/cc.de.300.bin")
# model.save("./fasttext")

In [9]:
import pickle
from tqdm.notebook import  tqdm
from feature_generation import create_features

In [None]:
for split in ["train", "eval", "test"]:
    with open(f"candidates-gnd-{split}.pkl", "rb") as f:
        ent_cand_label = pickle.load(f)

    for ent_dict in tqdm(ent_cand_label):
        feature_list = []
        for candidate in ent_dict["candidates"]:
            features = create_features(ent_dict["entity"], candidate)
            feature_list.append(features)
        ent_dict["features"] = feature_list
    with open(f"data/features/{split}.pkl", "wb") as f:
            pickle.dump(ent_cand_label, file=f)

# Ranking

## Load Data

In [3]:
import pickle
from tqdm.notebook import tqdm
import numpy as np
from feature_generation import get_gnd

In [14]:
d = {"train": {}, "eval": {}}
for split in ["train", "eval"]:
    for filename in ["y", "X"]:
        with open(f"data/features/{split}/{filename}.pkl", "rb") as f:
            d[split][filename] = pickle.load(file=f)

In [None]:
d = {"train": {}, "eval": {}}
for split in ["train", "eval"]:
    with open(f"data/features/{split}.pkl", "rb") as f:
        d[split] = pickle.load(file=f)

In [None]:
def prep_training(entity_dict_list):
    X = []
    y = []
    X_ent = []
    y_ent = []
    for ent_dict in entity_dict_list:
        ent = ent_dict["entity"]
        labels = ent_dict["labels"]
        cand_gnds = [get_gnd(x) for x in ent_dict["candidates"]]
        
        for coocurence, label in zip(ent_dict["occurences"], ent_dict["labels"]):
            for features, candidate in zip(ent_dict["features"], ent_dict["candidates"]):



In [5]:
X_train = []
y_train = []
X_test = []
y_test = []
y_test_ids = []
x_test_ids = []

for X, y in zip(d["train"]["X"], d["train"]["y"]):
    # The features are always the same for a given entity...
    features = X.pop()
    for feature in features:
        X_train.append(feature[1:])
        if feature[0] == y[0]:
            y_train.append(1)
        else:
            y_train.append(0)

for x_ent, y_ent in zip(d["eval"]["X_ent"], d["eval"]["y_ent"]):
    for x in x_ent:
        X_test.append(x[1:])
        if x[0] == y_ent[0]:
            y_test.append(1)
        else:
            y_test.append(0)
        x_test_ids.append(x[0])
        y_test_ids.append(y_ent[0])

# Training

In [16]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True)
model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

## Evaluation

In [7]:
def rank_candidates(candidates, model):
    candidate_scores = []
    candidate_ids = []
    for candidate in candidates:
        features = candidate[1:]
        score = model.predict(np.array(features).reshape(1,-1))[0]
        candidate_scores.append(score)
        candidate_ids.append(candidate[0])
    if candidate_ids:
        candidate_ids = np.array(candidate_ids)
        indices = np.argsort(candidate_scores)
        ids_sorted = np.array(candidate_ids)[indices[::-1]]
        scores_sorted = np.array(candidate_scores)[indices[::-1]]
        return ids_sorted, scores_sorted
    else:
        return [], []

### Mention Level

In [8]:
ids = []
scores = []
y_ids = []
for x_ent, y_ent in tqdm(zip(d[split]["X_ent"], d[split]["y_ent"])):
    ids_local, scores_local = rank_candidates(x_ent, model)
    ids.append(ids_local)
    scores.append(scores_local)
    y_ids.append(y_ent[0])

0it [00:00, ?it/s]

In [26]:
(ids[0])

array(['1054380449', '1054380538', '126133123', '1208538853',
       '1160560757', '129740101', '126133115', '1116500493', '1232438294',
       '1063799058', '126133158', '1080667032', '1054380597',
       '1054380562', '12613314X', '173450776', '1038629071', '111795834',
       '105438049X', '1054380473', '126133131'], dtype='<U10')

In [23]:
treshold = 0.000
split = "eval"

In [51]:
tp = 0
fp = 0
tn = 0
fn = 0
for ids_local, scores_local, y in zip(ids, scores, y_ids):
    ids_local_thresh = [id_loc for id_loc, score_loc in zip(ids_local, scores_local) if score_loc > treshold]
    #ids_local_thresh = ids_local
    #print(ids_local)
    if y in ids_local_thresh:
        tp += 1
    elif y == '' and ids_local_thresh != []:
        fp += 1
    elif y != '' and ids_local_thresh == []:
        fn += 1
    else:
        tn += 1

print(tp, fp, fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2* (precision * recall)/(precision+recall)
print("F1:          ", round(f1,4))
print("Recall:      ", round(recall,4))        
print("Precision:   ", round(precision,4))
print("Accuracy:    ", round(accuracy,4))


21 28 23
F1:           0.4516
Recall:       0.4773
Precision:    0.4286
Accuracy:     0.7119


### Entity Level

In [39]:
def normalize_prediction(ids, scores, cutoff_score=0.5, n_candidates=5):
    count = 0
    result = []
    for id, score in zip(ids, scores):
        count += 1
        if count > n_candidates:
            return result
        else:
            if score > cutoff_score:
                result.append(id)
            else:
                result.append('')
    if len(result) < 1 :
        result.append('')
    return result

In [37]:
split = "eval"

In [38]:
ids = []
scores = []
for x_ent in tqdm(d[split]["X_ent"]):
    ids_local, scores_local = rank_candidates(x_ent, model)
    ids.append(ids_local)
    scores.append(scores_local)

  0%|          | 0/177 [00:00<?, ?it/s]

In [40]:
treshold = 0.01
n = 7

In [41]:
predictions = []
for ids_local, scores_local in zip(ids, scores):
    prediction = normalize_prediction(ids_local, scores_local, cutoff_score=treshold, n_candidates = n)
    predictions.append(prediction)
tp = 0
fp = 0
tn = 0
fn = 0

for prediction, y in zip(predictions, d[split]["y_ent"]):
    if y[0] == '':
        if y[0] in prediction:
            tn +=1
        else:
            fp += 1
    else:
        if y[0] in prediction:
            tp += 1
        else:
            fn += 1

accuracy = (tp +tn)/(tp +tn +fp +fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2* (precision * recall)/(precision +recall)
print("F1:          ", round(f1,4))
print("Recall:      ", round(recall,4))        
print("Precision:   ", round(precision,4))
print("Accuracy:    ", round(accuracy,4))
print(tp, tn, fp, fn)

F1:           0.5385
Recall:       0.4118
Precision:    0.7778
Accuracy:     0.7966
21 120 6 30
