# Preprocessing

In [1]:
import pickle
import json
import os
from preprocessing import clean_gt, clean_raw, label_entity

In [None]:
split = pickle.load(open('data/train_test_eval_filenames_new.pkl', 'rb'))
split

In [39]:
#train = 94, test = 30, eval = 10

In [3]:
# import random
# new_split = {"train":[], "test": [], "eval": []}
# for key in split:
#     for page in split[key]:
#         mag = page.split("_")[0].split("-")[0]
#         year = page.split("_")[1]
        
#         if mag == "dkm" and (year == "1941" or year == "2010"):
#             new_split["train"].append(page)
#         if (year == "1990"):
#             new_split["test"].append(page)
#         if mag =="sbz" and (year == "1895" or year == "1940" or year == "1965" or year == "2010"):
#             new_split["train"].append(page)

# eval_set = random.sample(new_split["train"], int(len(new_split["train"])/10)) #set 10% of train aside for eval
# for page in eval_set:
#     new_split["train"].remove(page)
# new_split["eval"] = eval_set

# with open('train_test_eval_filenames_new.pkl', 'wb') as out:
#     pickle.dump(new_split, out)

In [4]:
data = {
    "train": [],
    "test": [],
    "eval": []
}
gt_data = []
for mag in ["dkm", "sbz"]:
    for year in os.listdir(f'data/raw/link/{mag}'):
        with open(os.path.join("data/raw/link", mag, year)) as f:
            input_linked = json.load(f)
        with open(os.path.join("data/ground-truth", mag, year)) as f:
            gt = json.load(f)
        gt = clean_gt(gt)
        gt_data += gt
        input_linked = clean_raw(input_linked)

        #due to non-determinism in the flair NER:
        all_refs_gt = [g["page"]+g["coord"] for g in gt] 
        all_refs_linked = [ent["page"]+ent["coord"] for l in input_linked for ent in l]
        all_valid_refs = set(all_refs_gt).intersection(set(all_refs_linked))

        for ent_variations in input_linked:
            for key in split:
                ent_instances = []
                for ent in ent_variations:
                    if ent["page"] in split[key]:
                        if (ent["page"]+ent["coord"]) in all_valid_refs:
                            ent_instances.append({"ent": ent, "label": label_entity(ent, gt)})
                if ent_instances:
                    data[key].append(ent_instances)

In [5]:
with open("data/processed.pkl", "wb") as f:
    pickle.dump(data, f)

# Candidate Generation

In [1]:
import pickle
from tqdm.notebook import  tqdm
from candidate_generation import create_metagrid_candidates

In [2]:
# load everytime you run this as we pop keys to keep data clean..
with open("data/processed.pkl", "rb") as f:
    data = pickle.load(f)

for split in ["train", "eval", "test"]:
    ent_cand_label = []
    i = 0
    for entity_list in tqdm(data[split], smoothing=0.01):
        i += 1
        # Create candidates only for the first entry in the list as all the entity information is always the same
        # The only thing that changes are pages and page_coordinates
        candidates = create_metagrid_candidates(ent=entity_list[0]["ent"])
        # Generate the list of page_coordinates and the corresponding labels!
        coord_list = []
        label_list = []
        for ent_dict in entity_list:
            ent = ent_dict["ent"]
            coord_list.append({
                "page": ent.pop("page", ""), 
                "coords": ent.pop("coord", "")
            })
            label_list.append(ent_dict["label"])
        ent_cand_label.append({"entity": ent, "candidates": candidates, "occurences": coord_list, "labels": label_list})
        if i % 100 == 0:
            with open(f"candidates-gnd-{split}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)
                    
    with open(f"candidates-gnd-{split}.pkl", "wb") as f:
       pickle.dump(ent_cand_label, f)

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

# Feature Generation

### To get the relevant fastttext model uncomment and run the following cell

In [38]:
# from gensim.models.fasttext import FastText, load_facebook_vectors
# model = load_facebook_vectors("cc.de.300.bin/cc.de.300.bin")
# model.save("./fasttext")

In [3]:
import pickle
from tqdm.notebook import  tqdm
from feature_generation import candidates_to_features

In [55]:
for split in ["train", "eval", "test"]:
    with open(f"candidates-gnd-{split}.pkl", "rb") as f:
        ent_cand_label = pickle.load(f)

    list_of_good_entities = []
    list_of_problematic_entities = []
    for ent_dict in tqdm(ent_cand_label):
        if len(set(ent_dict["labels"])) > 1:
            for label in set(ent_dict["labels"]):
                ent_dict["label"] = label
                features = candidates_to_features(ent=ent_dict["entity"], candidates=ent_dict["candidates"], gt_label=ent_dict["label"])
                ent_dict.update(features)
                list_of_problematic_entities.append(ent_dict.copy())
        else:
            ent_dict["label"] = set(ent_dict["labels"]).pop()
            features = candidates_to_features(ent=ent_dict["entity"], candidates=ent_dict["candidates"], gt_label=ent_dict["label"])
            ent_dict.update(features)
            list_of_good_entities.append(ent_dict)
            
    with open(f"data/features/{split}.pkl", "wb") as f:
        pickle.dump(list_of_good_entities, file=f)
    
    with open(f"data/features/{split}_problematic.pkl", "wb") as f:
        pickle.dump(list_of_problematic_entities, file=f)

  0%|          | 0/574 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

# Ranking

## Load Data

In [1]:
import pickle
from tqdm.notebook import tqdm
import numpy as np
from evaluation import perform_experiment, plot_metrics_over_treshold

In [2]:
d = {"train": {}, "eval": {}}
for split in ["train", "eval"]:
    with open(f"data/features/{split}.pkl", "rb") as f:
        d[split] = pickle.load(file=f)

d_problem = {"train": {}, "eval": {}}
for split in ["train", "eval"]:
    with open(f"data/features/{split}_problematic.pkl", "rb") as f:
        d_problem[split] = pickle.load(file=f)

d_combined = {"train": d["train"]+d_problem["train"], "eval": d["eval"] + d_problem["eval"]}

In [3]:
from sklearn.ensemble import ExtraTreesRegressor

**Best scores I could get so far**

In [7]:
ent_scores, ment_scores = perform_experiment(
    keep_empty=True, 
    do_sample=True,
    oversampling=1, # Multiple of how often we oversample y = 1
    balance=1, # multiple of y = 0 samples vs y = 1 samples
    d=d_combined, 
    model=ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True),
    n_s=[1,10], # How many candidates do we keep
    tresholds=[0.01], # Where do we cut off
    verbose=False # Print stuff
)
print("\nEntity Level")
for score in ent_scores:
    print("N:", score["top_n"], "Treshold:", score["treshold"])
    score["score"].print_scores()

print("Mention Level")
for score in ment_scores:
    print("N:", score["top_n"], "Treshold:", score["treshold"])
    score["score"].print_scores()


Entity Level
N: 1 Treshold: 0.01
F1: 0.69 RE:  0.526 PR: 1.0 AC: 0.633
TP: 20 TN: 18 FP 0 TN 11


N: 10 Treshold: 0.01
F1: 0.69 RE:  0.526 PR: 1.0 AC: 0.633
TP: 20 TN: 18 FP 0 TN 11


Mention Level
N: 1 Treshold: 0.01
F1: 0.77 RE:  0.626 PR: 1.0 AC: 0.706
TP: 62 TN: 37 FP 0 TN 27


N: 10 Treshold: 0.01
F1: 0.77 RE:  0.626 PR: 1.0 AC: 0.706
TP: 62 TN: 37 FP 0 TN 27




In [8]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

models = [
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='squared_error'),
    ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True),
    ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="absolute_error", bootstrap=True),
    ExtraTreesClassifier(n_estimators=100, random_state=0, bootstrap=True),
    ElasticNet(random_state=0)
]

model_names = ["GBR", "Tree Reg Squared", "Tree Reg Abs", "Tree Class", "Elastic"]

In [62]:
balance_list = [1, 2, 3, 5]
oversampling_list = [1, 2, 3, 5]
n_s = [1]
tresholds = [0.01, 0.1, 0.2, 0.5]


In [63]:
results = []

for model, model_name in zip(models, model_names):
    print(model_name)
    for data, data_name in zip([d, d_combined], ["cleaned", "combined"]):
        for keep_empty in [True, False]:
            for do_sample in [True, False]:
                if do_sample:
                    for balance in balance_list:
                        #print("Balance:", balance)
                        for oversampling in oversampling_list:
                            #print("Oversampling:", oversampling)
                            ent_scores, ment_scores = perform_experiment(keep_empty=keep_empty, 
                                do_sample=do_sample, 
                                oversampling=oversampling, 
                                balance=balance, 
                                d=data, 
                                model=model,
                                n_s=n_s,
                                tresholds=tresholds
                            )
                            results.append({
                                "keep_empty": keep_empty,
                                "do_sample": do_sample,
                                "balance": balance,
                                "oversampling": oversampling,
                                "ent_scores": ent_scores,
                                "ment_scores": ment_scores,
                                "model": model_name,
                                "data": data_name
                            })
                else:
                    balance = 1
                    oversampling = 1 
                    ent_scores, ment_scores = perform_experiment(keep_empty=keep_empty, 
                                do_sample=do_sample, 
                                oversampling=oversampling, 
                                balance=balance, 
                                d=data, 
                                model=model,
                                n_s=n_s,
                                tresholds=tresholds
                            )
                    results.append({
                        "keep_empty": keep_empty,
                        "do_sample": do_sample,
                        "balance": balance,
                        "oversampling": oversampling,
                        "ent_scores": ent_scores,
                        "ment_scores": ment_scores,
                        "model": model_name,
                        "data": data_name
                    })

GBR
Balance: 1
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 2
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 3
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 5
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 1
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 2
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 3
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 5
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 1
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 2
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 3
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 5
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 1
Oversampling: 1
Oversampling: 2
Oversampling: 3
Oversampling: 5
Balance: 2
Oversampli

In [None]:
do_sample = True
keep_empty = True
model = "Tree Reg Squared"
data = "combined"

for balance in balance_list:
    for oversampling in oversampling_list:
        plot_metrics_over_treshold(
            tresholds=tresholds, 
            n_s=n_s, 
            oversampling=oversampling, 
            balance=balance, 
            do_sample=True, 
            keep_empty=True, 
            model=model,
            data=data,
            results=results)

In [64]:
def extract(scores, score_name, dict, current_setup):
    for score in scores:
        score_dict = score["score"].get_score()
        curr_setup["top_n"] = score["top_n"]
        curr_setup["treshold"] = score["treshold"]
        if score_dict[score_name] > dict[f"top_{score_name}"]:
            dict[f"top_{score_name}"] = score_dict[score_name]
            dict[f"top_{score_name}_setup"] = [current_setup]
        elif score_dict[score_name] == dict[f"top_{score_name}"]:
            dict[f"top_{score_name}_setup"].append(current_setup)
    return dict

model_results = []
for model_name in model_names:
    dictionary={
        "ent": {
            "top_F1": 0,
            "top_Recall": 0,
            "top_Precision": 0,
            "top_F1_setup": [],
            "top_Recall_setup": [],
            "top_Precision_setup": []
        },
        "ment": {
            "top_F1": 0,
            "top_Recall": 0,
            "top_Precision": 0,
            "top_F1_setup": [],
            "top_Recall_setup": [],
            "top_Precision_setup": []
        }
    }
    for di in results:
        if di["model"] == model_name:
            for scoring_level in ["ent", "ment"]:
                scores = di[f"{scoring_level}_scores"]
                curr_setup = {
                    "data": di["data"],
                    "do_sample": di["do_sample"],
                    "balance": di["balance"],
                    "oversampling": di["oversampling"],
                    "keep_empty": di["keep_empty"]
                }
                for score_name in ["F1", "Recall", "Precision"]:
                    dictionary[scoring_level] = extract(scores, score_name, dictionary[scoring_level], curr_setup )
    model_results.append(dictionary)        


In [66]:
score = "F1"

In [70]:
for scores, model_name in zip(model_results, model_names):
    balance = []
    combined = []
    oversampling = []
    keep_empty = []
    do_sample = []
    top_n = []
    tresholds = []
    print(model_name)
    for scoring_level in ["ent", "ment"]:
        print(f"{scoring_level}\t", f"{score}:\t", scores[scoring_level][f"top_{score}"])
        print(f"{scoring_level}\t", f"{score} Setup:")
        for setup in scores[scoring_level][f"top_{score}_setup"]:
            if setup["data"] == "cleaned":
                combined.append(0)
            else:
                combined.append(1)
            balance.append(setup["balance"])
            oversampling.append(setup["oversampling"])
            top_n.append(setup["top_n"])
            tresholds.append(setup["treshold"])
            if setup["do_sample"]:
                do_sample.append(1)
            else:
                do_sample.append(0)
            if setup["keep_empty"]:
                keep_empty.append(1)
            else:
                keep_empty.append(0)
    
        print(f"Number of setups: {len(do_sample)}")

        print("Mean")
        print("sample", np.mean(do_sample))
        print("empty ", np.mean(keep_empty))
        print("combin", np.mean(combined))
        print("tresh ", np.mean(tresholds))
        #print("top_n ", np.mean(top_n))
        print("overs ", np.mean(oversampling))
        print("balanc", np.mean(balance))

        print("Median")
        print("sample", np.median(do_sample))
        print("empty ", np.median(keep_empty))
        print("combin", np.median(combined))
        print("tresh ", np.median(tresholds))
        #print("top_n ", np.median(top_n))
        print("overs ", np.median(oversampling))
        print("balanc", np.median(balance))
        print("\n\n")
            

GBR
ent	 F1:	 0.69
ent	 F1 Setup:
Number of setups: 40
Mean
sample 1.0
empty  1.0
combin 0.375
tresh  0.5
overs  2.825
balanc 1.825
Median
sample 1.0
empty  1.0
combin 0.0
tresh  0.5
overs  2.5
balanc 2.0



ment	 F1:	 0.77
ment	 F1 Setup:
Number of setups: 80
Mean
sample 1.0
empty  1.0
combin 0.375
tresh  0.5
overs  2.825
balanc 1.825
Median
sample 1.0
empty  1.0
combin 0.0
tresh  0.5
overs  2.5
balanc 2.0



Tree Reg Squared
ent	 F1:	 0.69
ent	 F1 Setup:
Number of setups: 43
Mean
sample 0.9534883720930233
empty  0.8372093023255814
combin 0.6511627906976745
tresh  0.5
overs  2.744186046511628
balanc 2.3488372093023258
Median
sample 1.0
empty  1.0
combin 1.0
tresh  0.5
overs  3.0
balanc 2.0



ment	 F1:	 0.77
ment	 F1 Setup:
Number of setups: 86
Mean
sample 0.9534883720930233
empty  0.8372093023255814
combin 0.6511627906976745
tresh  0.5
overs  2.744186046511628
balanc 2.3488372093023258
Median
sample 1.0
empty  1.0
combin 1.0
tresh  0.5
overs  3.0
balanc 2.0



Tree Reg Abs
ent	 F1:	 

After some tests we find:
- ExtraTreesRegressor works best
- with: 
    - Balance = 1
    - Oversampling = 1
    - do_sample = True
    - keep_empty = True
    - treshold = 0.1 or smaller is more stable in advers conditions for balance 1 and oversampling 1 it does not matter -> this makes the regression more stable
    - => n -> does not matter so much can amolst go to 1 !!!!

- problematic entities seem to help 