# Preprocessing

In [1]:
import pickle
import json
import os
from preprocessing import clean_gt, clean_raw, label_entity

In [None]:
# Load the split definition. The preprocessing loop below iterates over its keys
# and tests page membership in its values.
# Opened via a context manager so the file handle is closed deterministically.
with open('data/train_test_eval_filenames_new.pkl', 'rb') as f:
    split = pickle.load(f)
split

In [39]:
# Although a separate train/eval split exists, the two are simply combined for the cross-validation.

In [4]:
# Build, per split, a list of entity groups. Each group holds the labelled
# occurrences of one linked entity whose page belongs to that split.
data = {
    "train": [],
    "test": [],
    "eval": []
}
gt_data = []
for mag in ["dkm", "sbz"]:
    # Every directory entry is opened as a JSON file, with a same-named
    # counterpart in the ground-truth directory.
    for year in os.listdir(f'data/raw/link/{mag}'):
        # Explicit UTF-8: without it the text is decoded with the platform's
        # locale encoding, which breaks on non-ASCII content on some systems.
        with open(os.path.join("data/raw/link", mag, year), encoding="utf-8") as f:
            input_linked = json.load(f)
        with open(os.path.join("data/ground-truth", mag, year), encoding="utf-8") as f:
            gt = json.load(f)
        gt = clean_gt(gt)
        gt_data += gt
        input_linked = clean_raw(input_linked)

        # Due to non-determinism in the flair NER, keep only references that
        # exist in both the ground truth and the linked input.
        all_refs_gt = {g["page"] + g["coord"] for g in gt}
        all_refs_linked = {
            ent["page"] + ent["coord"]
            for variations in input_linked
            for ent in variations
        }
        all_valid_refs = all_refs_gt & all_refs_linked

        for ent_variations in input_linked:
            for key in split:
                ent_instances = []
                for ent in ent_variations:
                    if ent["page"] not in split[key]:
                        continue
                    if (ent["page"] + ent["coord"]) in all_valid_refs:
                        ent_instances.append({"ent": ent, "label": label_entity(ent, gt)})
                # Only record groups that have at least one occurrence in this split.
                if ent_instances:
                    data[key].append(ent_instances)


In [5]:
# Persist the preprocessed splits for the later notebook stages.
with open("data/processed.pkl", "wb") as processed_file:
    pickle.dump(data, processed_file)

# Candidate Generation

In [1]:
import pickle
from tqdm.notebook import  tqdm
from candidate_generation import create_metagrid_candidates, get_candidates_fuseki

In [None]:
# Re-load the preprocessed data on every run: the loop below pops "page" and
# "coord" from the entity dicts, so an already-processed `data` must not be reused.
with open("data/processed.pkl", "rb") as processed_file:
    data = pickle.load(processed_file)

for split in ["train", "eval", "test"]:
    ent_cand_label = []
    for i, entity_list in enumerate(tqdm(data[split], smoothing=0.01), start=1):
        # Candidates are created from the first entry only: the entity
        # information is the same for all entries, only page and
        # page coordinates differ.
        candidates = create_metagrid_candidates(ent=entity_list[0]["ent"])

        # Collect the occurrences (page + coordinates) and their labels.
        coord_list = []
        label_list = []
        for mention in entity_list:
            ent = mention["ent"]
            page = ent.pop("page", "")
            coords = ent.pop("coord", "")
            coord_list.append({"page": page, "coords": coords})
            label_list.append(mention["label"])

        ent_cand_label.append({
            "entity": ent,
            "candidates": candidates,
            "occurences": coord_list,
            "labels": label_list,
        })

        # Checkpoint every 100 entities; the file is overwritten each time.
        if i % 100 == 0:
            with open(f"data/candidates/metagrid/candidates-gnd-{split}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)

    with open(f"data/candidates/metagrid/candidates-gnd-{split}.pkl", "wb") as f:
        pickle.dump(ent_cand_label, f)

# Feature Generation

#### To get the required fastText model, uncomment and run the following cell

In [38]:
# from gensim.models.fasttext import FastText, load_facebook_vectors
# model = load_facebook_vectors("cc.de.300.bin/cc.de.300.bin")
# model.save("./fasttext")

In [3]:
import pickle
from tqdm import  tqdm
from feature_generation import candidates_to_features, process_fuseki_candidates

In [None]:
# The candidate generators are used in this cell but are not part of the
# import cell directly above it, so bring them into scope here.
from candidate_generation import create_metagrid_candidates, get_candidates_fuseki

# Load every time you run this, as "page"/"coord" are popped from the entity
# dicts below to keep the data clean.
with open("data/processed.pkl", "rb") as f:
    data = pickle.load(f)

generator = "metagrid" # or "fuseki"
if generator not in ("metagrid", "fuseki"):
    # Fail early instead of silently reusing stale `candidates` in the loop.
    raise ValueError(f"Unknown candidate generator: {generator!r}")

# Entities whose occurrences carry more than one distinct ground-truth label.
problematic_entities = []
for split in ["train", "eval", "test"]:
    ent_cand_label = []
    for i, entity_list in enumerate(tqdm(data[split], smoothing=0.01), start=1):
        # Create candidates only for the first entry in the list, as all the
        # entity information is always the same. The only things that change
        # are pages and page coordinates.
        if generator == "fuseki":
            unique_candidate_dict = get_candidates_fuseki(entity_list[0]["ent"])
            candidates = process_fuseki_candidates(unique_candidate_dict)
        else:
            candidates = create_metagrid_candidates(ent=entity_list[0]["ent"])

        # Generate the list of page coordinates and the corresponding labels.
        coord_list = []
        gt_label = []
        for ent_dict in entity_list:
            ent = ent_dict["ent"]
            coord_list.append({
                "page": ent.pop("page", ""),
                "coords": ent.pop("coord", "")
            })
            gt_label.append(ent_dict["label"])

        if len(set(gt_label)) > 1:
            # Store a copy so the diagnostic entry cannot be altered later.
            problematic_entities.append({"ent_list": entity_list, "gt_labels": list(gt_label), "mag": coord_list})

        # Keep the full label list (one label per occurrence). The feature
        # generation step computes set(entry["labels"]), which would split a
        # single string label into characters or fail on a non-iterable label.
        ent_cand_label.append({"entity": ent, "candidates": candidates, "occurences": coord_list, "labels": gt_label})

        # Numbered checkpoint every 100 entities.
        if i % 100 == 0:
            with open(f"data/candidates/{generator}/candidates-gnd-{split}-{i}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)

    with open(f"data/candidates/{generator}/candidates-gnd-{split}.pkl", "wb") as f:
        pickle.dump(ent_cand_label, f)

# Feature Generation

In [12]:
import pickle
from tqdm import  tqdm
from feature_generation import candidates_to_features

FileNotFoundError: [Errno 2] No such file or directory: './fasttext'

In [None]:
# Candidate generator; selects the sub-directory of the candidate files read
# and the feature files written by the next cell.
generator = "metagrid" # or "fuseki"

In [None]:
for split in ["train", "eval", "test"]:
    with open(f"data/candidates/{generator}/candidates-gnd-{split}.pkl", "rb") as f:
        ent_cand_label = pickle.load(f)

    list_of_good_entities = []
    list_of_problematic_entities = []
    for ent_dict in tqdm(ent_cand_label):
        distinct_labels = set(ent_dict["labels"])
        is_ambiguous = len(distinct_labels) > 1
        # An unambiguous entity yields its single label. An ambiguous one
        # yields one feature record (a copy) per distinct label.
        targets = distinct_labels if is_ambiguous else [distinct_labels.pop()]
        for label in targets:
            ent_dict["label"] = label
            features = candidates_to_features(
                ent=ent_dict["entity"],
                candidates=ent_dict["candidates"],
                gt_label=ent_dict["label"],
            )
            ent_dict.update(features)
            if is_ambiguous:
                list_of_problematic_entities.append(ent_dict.copy())
            else:
                list_of_good_entities.append(ent_dict)

    with open(f"data/features/{generator}/{split}.pkl", "wb") as f:
        pickle.dump(list_of_good_entities, file=f)

    with open(f"data/features/{generator}/{split}_problematic.pkl", "wb") as f:
        pickle.dump(list_of_problematic_entities, file=f)

# Ranking

## Load Data

In [1]:
import pickle
from tqdm import tqdm
import numpy as np
from evaluation import perform_experiment, crossvalidate_experiment

In [2]:
# Candidate generator; selects the feature directory loaded in the next cell.
generator = "metagrid" # or "fuseki"

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(file=handle)


_SPLITS = ("train", "eval", "test")

# Entities with a single distinct label.
d = {name: _read_pickle(f"data/features/{generator}/{name}.pkl") for name in _SPLITS}

# problematic entities
d_problem = {
    name: _read_pickle(f"data/features/{generator}/{name}_problematic.pkl")
    for name in _SPLITS
}

# Both sets concatenated per split.
d_combined = {name: d[name] + d_problem[name] for name in _SPLITS}

**Best scores we could get**

In [4]:
from sklearn.ensemble import ExtraTreesRegressor

In [7]:
# Fit on train + eval of `d` and score on its test split.
ent_scores, ment_scores = perform_experiment(
    train=d["train"] + d["eval"],
    eval=d["test"],
    model=ExtraTreesRegressor(
        n_estimators=100,
        random_state=0,
        criterion="squared_error",
        bootstrap=True,
    ),
    keep_empty=True,
    do_sample=True,
    oversampling=2,  # multiple of how often the y = 1 samples are oversampled
    balance=3,  # multiple of y = 0 samples vs y = 1 samples
    n_s=[1, 10],  # how many candidates are kept
    thresholds=[0.01, 0.2],  # cut-off values
    verbose=False,  # do not print stuff
)

In [8]:
# Print every (top_n, threshold) result, first per entity, then per mention.
for heading, level_scores in (("\nEntity Level", ent_scores), ("Mention Level", ment_scores)):
    print(heading)
    for score in level_scores:
        print("N:", score["top_n"], "Threshold:", score["threshold"])
        score["score"].print_scores()


Entity Level
N: 1 Threshold: 0.01
F1: 0.356 RE:  0.216 PR: 1.0 AC: 0.426
TP: 32 FN: 116 FP 0 TN 54


N: 1 Threshold: 0.2
F1: 0.346 RE:  0.209 PR: 1.0 AC: 0.421
TP: 31 FN: 117 FP 0 TN 54


N: 10 Threshold: 0.01
F1: 0.365 RE:  0.223 PR: 1.0 AC: 0.431
TP: 33 FN: 115 FP 0 TN 54


N: 10 Threshold: 0.2
F1: 0.346 RE:  0.209 PR: 1.0 AC: 0.421
TP: 31 FN: 117 FP 0 TN 54


Mention Level
N: 1 Threshold: 0.01
F1: 0.406 RE:  0.255 PR: 1.0 AC: 0.487
TP: 116 FN: 339 FP 0 TN 206


N: 1 Threshold: 0.2
F1: 0.401 RE:  0.251 PR: 1.0 AC: 0.484
TP: 114 FN: 341 FP 0 TN 206


N: 10 Threshold: 0.01
F1: 0.412 RE:  0.259 PR: 1.0 AC: 0.49
TP: 118 FN: 337 FP 0 TN 206


N: 10 Threshold: 0.2
F1: 0.401 RE:  0.251 PR: 1.0 AC: 0.484
TP: 114 FN: 341 FP 0 TN 206




Including problematic entities

In [9]:
# Same experiment as before, but on `d_combined` (problematic entities included).
ent_scores, ment_scores = perform_experiment(
    train=d_combined["train"] + d_combined["eval"],
    eval=d_combined["test"],
    model=ExtraTreesRegressor(
        n_estimators=100,
        random_state=0,
        criterion="squared_error",
        bootstrap=True,
    ),
    keep_empty=True,
    do_sample=True,
    oversampling=2,  # multiple of how often the y = 1 samples are oversampled
    balance=3,  # multiple of y = 0 samples vs y = 1 samples
    n_s=[1, 10],  # how many candidates are kept
    thresholds=[0.01, 0.2],  # cut-off values
    verbose=False,  # do not print stuff
)

In [10]:
# Print every (top_n, threshold) result, first per entity, then per mention.
for heading, level_scores in (("\nEntity Level", ent_scores), ("Mention Level", ment_scores)):
    print(heading)
    for score in level_scores:
        print("N:", score["top_n"], "Threshold:", score["threshold"])
        score["score"].print_scores()


Entity Level
N: 1 Threshold: 0.01
F1: 0.325 RE:  0.194 PR: 1.0 AC: 0.403
TP: 34 FN: 141 FP 0 TN 61


N: 1 Threshold: 0.2
F1: 0.317 RE:  0.189 PR: 1.0 AC: 0.398
TP: 33 FN: 142 FP 0 TN 61


N: 10 Threshold: 0.01
F1: 0.333 RE:  0.2 PR: 1.0 AC: 0.407
TP: 35 FN: 140 FP 0 TN 61


N: 10 Threshold: 0.2
F1: 0.325 RE:  0.194 PR: 1.0 AC: 0.403
TP: 34 FN: 141 FP 0 TN 61


Mention Level
N: 1 Threshold: 0.01
F1: 0.632 RE:  0.462 PR: 1.0 AC: 0.567
TP: 440 FN: 513 FP 0 TN 232


N: 1 Threshold: 0.2
F1: 0.631 RE:  0.461 PR: 1.0 AC: 0.566
TP: 439 FN: 514 FP 0 TN 232


N: 10 Threshold: 0.01
F1: 0.638 RE:  0.468 PR: 1.0 AC: 0.572
TP: 446 FN: 507 FP 0 TN 232


N: 10 Threshold: 0.2
F1: 0.637 RE:  0.467 PR: 1.0 AC: 0.571
TP: 445 FN: 508 FP 0 TN 232




Perform crossvalidation experiments like this:

In [None]:
 ent_scores, ment_scores = crossvalidate_experiment(
    train=d_combined["train"], # + d_combined["eval"],
    eval=d_combined["eval"],
    n_fold = 5,
    keep_empty=True,
    do_sample=True,
    oversampling=2, # Multiple of how often we oversample y = 1
    balance=3, # multiple of y = 0 samples vs y = 1 samples
    model=ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True),
    n_s=[1,10], # How many candidates do we keep
    thresholds=[0.01, 0.1, 0.3, 0.5], # Where do we cut off
    verbose=False # Print stuff
)

In [6]:
# Print every (top_n, threshold) result, first per entity, then per mention.
for heading, level_scores in (("\nEntity Level", ent_scores), ("Mention Level", ment_scores)):
    print(heading)
    for score in level_scores:
        print("N:", score["top_n"], "Threshold:", score["threshold"])
        score["score"].print_scores()


Entity Level
N: 1 Threshold: 0.01
F1: 0.484 RE:  0.324 PR: 0.958 AC: 0.608
TP: 23 FN: 48 FP 1 TN 53


N: 1 Threshold: 0.1
F1: 0.479 RE:  0.319 PR: 0.958 AC: 0.603
TP: 23 FN: 49 FP 1 TN 53


N: 1 Threshold: 0.3
F1: 0.463 RE:  0.306 PR: 0.957 AC: 0.595
TP: 22 FN: 50 FP 1 TN 53


N: 1 Threshold: 0.5
F1: 0.447 RE:  0.292 PR: 0.955 AC: 0.587
TP: 21 FN: 51 FP 1 TN 53


N: 10 Threshold: 0.01
F1: 0.545 RE:  0.375 PR: 1.0 AC: 0.643
TP: 27 FN: 45 FP 0 TN 54


N: 10 Threshold: 0.1
F1: 0.531 RE:  0.361 PR: 1.0 AC: 0.635
TP: 26 FN: 46 FP 0 TN 54


N: 10 Threshold: 0.3
F1: 0.5 RE:  0.333 PR: 1.0 AC: 0.619
TP: 24 FN: 48 FP 0 TN 54


N: 10 Threshold: 0.5
F1: 0.484 RE:  0.319 PR: 1.0 AC: 0.611
TP: 23 FN: 49 FP 0 TN 54


Mention Level
N: 1 Threshold: 0.01
F1: 0.626 RE:  0.459 PR: 0.983 AC: 0.649
TP: 119 FN: 140 FP 2 TN 144


N: 1 Threshold: 0.1
F1: 0.626 RE:  0.459 PR: 0.983 AC: 0.649
TP: 119 FN: 140 FP 2 TN 144


N: 1 Threshold: 0.3
F1: 0.615 RE:  0.448 PR: 0.983 AC: 0.642
TP: 116 FN: 143 FP 2 TN 144


# Hyperparameter tuning:

In [3]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Each estimator is declared next to its display name so the two lists derived
# below cannot drift apart.
_named_models = [
    (
        "Gradient Boosting Regressor",
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='squared_error'),
    ),
    (
        "Extra Trees Regressor Squared",
        ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True),
    ),
    (
        "Extra Trees Regressor Absolute",
        ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="absolute_error", bootstrap=True),
    ),
    (
        "Extra Trees Classifier",
        ExtraTreesClassifier(n_estimators=100, random_state=0, bootstrap=True),
    ),
    (
        "Elastic Net",
        ElasticNet(random_state=0),
    ),
    (
        "Gaussian Process",
        GaussianProcessRegressor(kernel = DotProduct() + WhiteKernel(), random_state=0),
    ),
]

models = [estimator for _, estimator in _named_models]

model_names = [name for name, _ in _named_models]

In [11]:
# Search grid for the hyperparameter tuning loop.
balance_list = [1, 2, 3, 5]  # values tried for `balance`
oversampling_list = [1, 2, 3, 5]  # values tried for `oversampling`
n_s = [1,10]  # passed as `n_s` (how many candidates are kept)
thresholds = [0.01, 0.1, 0.2]  # passed as `thresholds` (cut-off values)

In [None]:
# One list of result records per model, in the order of `models`.
model_results_detailed = []

for model, model_name in zip(models, model_names):
    print(model_name)
    results = []
    for data, data_name in zip([d, d_combined], ["cleaned", "combined"]):
        print("Data:", data_name)
        for keep_empty in [True, False]:
            print("keep_empty:", keep_empty)
            for do_sample in [True, False]:
                print("do_sample:", do_sample)
                # With sampling the full balance/oversampling grid is explored.
                # Without it, a single run with both values fixed to 1 is made.
                balance_grid = balance_list if do_sample else [1]
                oversampling_grid = oversampling_list if do_sample else [1]
                for balance in balance_grid:
                    if do_sample:
                        print("Balance:", balance)
                    for oversampling in oversampling_grid:
                        if do_sample:
                            print("Oversampling:", oversampling)
                        ent_scores, ment_scores = crossvalidate_experiment(
                            train=data["train"],
                            eval=data["eval"],
                            n_fold=5,
                            keep_empty=keep_empty,
                            do_sample=do_sample,
                            oversampling=oversampling,
                            balance=balance,
                            model=model,
                            n_s=n_s,
                            thresholds=thresholds,
                            verbose=False,
                        )
                        results.append({
                            "keep_empty": keep_empty,
                            "do_sample": do_sample,
                            "balance": balance,
                            "oversampling": oversampling,
                            "ent_scores": ent_scores,
                            "ment_scores": ment_scores,
                            "model": model_name,
                            "data": data_name,
                        })
    model_results_detailed.append(results)

In [None]:
# Store the raw tuning results so they can be analysed without re-running the grid.
with open('parameter_tuning/model_results_detailed.pkl', 'wb') as results_file:
    pickle.dump(model_results_detailed, results_file)

In [None]:
# Top-10 scores can never be worse than top-1 scores; choose here which of the
# two cut-offs the metrics below are extracted for.
top_n = 1

In [19]:
def extract(scores, score_name, dict, current_setup, top_n):
    """Update the running best value of one metric and the setups achieving it.

    Args:
        scores: Iterable of records with the keys "score" (an object whose
            ``get_score()`` returns a mapping of metric name to value),
            "top_n" and "threshold".
        score_name: Metric to look at, e.g. "F1".
        dict: Accumulator holding ``top_<score_name>`` (best value so far) and
            ``top_<score_name>_setup`` (list of setups reaching it). It is
            updated in place. The name shadows the builtin but is kept for
            interface compatibility.
        current_setup: Description of the run that produced *scores*. It is
            not modified.
        top_n: Only records with this "top_n" value are considered.

    Returns:
        The same accumulator object that was passed in as *dict*.
    """
    best_key = f"top_{score_name}"
    setups_key = f"top_{score_name}_setup"
    for score in scores:
        if score["top_n"] != top_n:
            continue
        value = score["score"].get_score()[score_name]
        # Build a fresh dict per record. Appending one shared, repeatedly
        # mutated setup would leave every entry showing the last threshold.
        setup = {
            **current_setup,
            "top_n": score["top_n"],
            "threshold": score["threshold"],
        }
        if value > dict[best_key]:
            dict[best_key] = value
            dict[setups_key] = [setup]
        elif value == dict[best_key]:
            dict[setups_key].append(setup)
    return dict

# The tuning loop stores one list of run records per model, so flatten before
# filtering by model name. Indexing the nested list with "model" would raise
# TypeError. Already-flat input is accepted as well.
_all_runs = []
for _entry in model_results_detailed:
    if isinstance(_entry, list):
        _all_runs.extend(_entry)
    else:
        _all_runs.append(_entry)

_metrics = ["F1", "Recall", "Precision"]

# One summary per model, in the order of `model_names`: for the entity and the
# mention level, the best value of each metric and the setups that reach it.
model_results = []
for model_name in model_names:
    dictionary = {
        level: {
            **{f"top_{metric}": 0 for metric in _metrics},
            **{f"top_{metric}_setup": [] for metric in _metrics},
        }
        for level in ["ent", "ment"]
    }
    for di in _all_runs:
        if di["model"] != model_name:
            continue
        for scoring_level in ["ent", "ment"]:
            scores = di[f"{scoring_level}_scores"]
            curr_setup = {
                "data": di["data"],
                "do_sample": di["do_sample"],
                "balance": di["balance"],
                "oversampling": di["oversampling"],
                "keep_empty": di["keep_empty"]
            }
            for score_name in _metrics:
                dictionary[scoring_level] = extract(scores, score_name, dictionary[scoring_level], curr_setup, top_n=top_n)
    model_results.append(dictionary)


In [20]:
# Metric whose best setups are summarised in the next cell; one of the metric
# names extracted above ("F1", "Recall", "Precision").
score = "F1"

In [None]:
# For every model and scoring level, print the best value of `score` and the
# mean/median of the hyperparameters over all setups that reach it.
# The statistics are computed per scoring level, and local names are chosen so
# that the notebook globals `top_n`, `thresholds`, `balance`, ... are not rebound.
for scores, model_name in zip(model_results, model_names):
    print(model_name)
    for scoring_level in ["ent", "ment"]:
        best_setups = scores[scoring_level][f"top_{score}_setup"]
        print(f"{scoring_level}\t", f"{score}:\t", scores[scoring_level][f"top_{score}"])
        print(f"{scoring_level}\t", f"{score} Setup:")

        # Flags are encoded as 0/1 so that their mean is the share of setups
        # with the flag set.
        setup_stats = {
            "sample": [1 if setup["do_sample"] else 0 for setup in best_setups],
            "empty ": [1 if setup["keep_empty"] else 0 for setup in best_setups],
            "combin": [0 if setup["data"] == "cleaned" else 1 for setup in best_setups],
            "tresh ": [setup["threshold"] for setup in best_setups],
            "top_n ": [setup["top_n"] for setup in best_setups],
            "overs ": [setup["oversampling"] for setup in best_setups],
            "balanc": [setup["balance"] for setup in best_setups],
        }

        print(f"Number of setups: {len(best_setups)}")
        if not best_setups:
            # Mean/median of an empty list would only yield nan plus a warning.
            print("\n\n")
            continue

        for title, reducer in (("Mean", np.mean), ("Median", np.median)):
            print(title)
            for stat_label, stat_values in setup_stats.items():
                print(stat_label, reducer(stat_values))
        print("\n\n")
            