# Preprocessing

In [None]:
import pickle
import json
import os
from preprocessing import clean_gt, clean_raw, label_entity

In [None]:
# Load the split assignment; later cells iterate its keys and test
# `ent["page"] in split[key]`, so it maps a split name to a collection of pages.
# A context manager is used so the file handle is closed right after loading.
with open('data/train_test_eval_filenames_new.pkl', 'rb') as split_file:
    split = pickle.load(split_file)
# split

In [None]:
# Though we do have a train/eval split, we actually simply combine the two for the cross-validation

In [None]:
# Bucket every group of linked entity mentions into its split and attach the
# ground-truth label to each mention that also appears in the ground truth.
data = {"train": [], "test": [], "eval": []}
gt_data = []
for mag in ("dkm", "sbz"):
    for year in os.listdir(f'data/raw/link/{mag}'):
        with open(os.path.join("data/raw/link", mag, year)) as linked_file:
            input_linked = json.load(linked_file)
        with open(os.path.join("data/ground-truth", mag, year)) as gt_file:
            gt = json.load(gt_file)
        gt = clean_gt(gt)
        gt_data.extend(gt)
        input_linked = clean_raw(input_linked)

        # Keep only mentions present on both sides; the original author
        # attributes the mismatches to non-determinism in the flair NER.
        refs_in_gt = {g["page"] + g["coord"] for g in gt}
        refs_in_linked = {
            mention["page"] + mention["coord"]
            for variations in input_linked
            for mention in variations
        }
        all_valid_refs = refs_in_gt & refs_in_linked

        for ent_variations in input_linked:
            for key in split:
                # One group per split: the mentions of this entity whose page
                # belongs to the split and whose reference is valid.
                ent_instances = [
                    {"ent": ent, "label": label_entity(ent, gt)}
                    for ent in ent_variations
                    if ent["page"] in split[key]
                    and (ent["page"] + ent["coord"]) in all_valid_refs
                ]
                if ent_instances:
                    data[key].append(ent_instances)


In [None]:
# Persist the per-split entity groups so the following sections can reload them.
with open("data/processed.pkl", "wb") as processed_file:
    pickle.dump(data, file=processed_file)

# Candidate Generation

In [None]:
import pickle
from tqdm.notebook import  tqdm
from candidate_generation import create_metagrid_candidates, get_candidates_fuseki

In [None]:
# Reload the processed data on every run: the loop below pops "page" and
# "coord" out of the entity dicts, so a second pass over the same objects
# would no longer find them.
with open("data/processed.pkl", "rb") as f:
    data = pickle.load(f)

for split in ("train", "eval", "test"):
    ent_cand_label = []
    for i, entity_list in enumerate(tqdm(data[split], smoothing=0.01), start=1):
        # Candidates are created from the first mention only; per the original
        # note the entity information is the same for every mention in the
        # group and only page / page coordinates differ.
        candidates = create_metagrid_candidates(ent=entity_list[0]["ent"])

        # Strip the occurrence information off each mention and collect it,
        # together with the label of that mention.
        coord_list = []
        label_list = []
        for ent_dict in entity_list:
            ent = ent_dict["ent"]
            page = ent.pop("page", "")
            coords = ent.pop("coord", "")
            coord_list.append({"page": page, "coords": coords})
            label_list.append(ent_dict["label"])

        # `ent` is the last mention of the group (now without page/coord).
        ent_cand_label.append({
            "entity": ent,
            "candidates": candidates,
            "occurences": coord_list,
            "labels": label_list,
        })

        # Checkpoint every 100 groups.
        if i % 100 == 0:
            with open(f"candidates-gnd-{split}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)

    with open(f"data/candidates/metagrid/candidates-gnd-{split}.pkl", "wb") as f:
        pickle.dump(ent_cand_label, f)

# Feature Generation

#### To get the relevant fastText model, uncomment and run the following cell

In [None]:
# from gensim.models.fasttext import FastText, load_facebook_vectors
# model = load_facebook_vectors("cc.de.300.bin/cc.de.300.bin")
# model.save("./fasttext")

In [None]:
import pickle
from tqdm import  tqdm
from feature_generation import candidates_to_features, process_fuseki_candidates

In [None]:
# Reload the processed data on every run: the loop below pops "page" and
# "coord" out of the entity dicts to keep the stored entities clean.
with open("data/processed.pkl", "rb") as f:
    data = pickle.load(f)

generator = "metagrid" # or "fuseki"
# Fail early on a typo instead of silently reusing a stale `candidates` value.
if generator not in ("metagrid", "fuseki"):
    raise ValueError(f"unknown candidate generator: {generator!r}")

problematic_entities = []
for split in ["train", "eval", "test"]:
    ent_cand_label = []
    for i, entity_list in enumerate(tqdm(data[split], smoothing=0.01), start=1):
        # Create candidates only for the first entry in the list; per the
        # original note all entity information is the same within a group and
        # only pages and page coordinates change.
        first_ent = entity_list[0]["ent"]
        if generator == "fuseki":
            unique_candidate_dict = get_candidates_fuseki(first_ent)
            candidates = process_fuseki_candidates(unique_candidate_dict)
        else:
            candidates = create_metagrid_candidates(ent=first_ent)

        # Generate the list of page coordinates and the corresponding labels.
        coord_list = []
        gt_labels = []
        for ent_dict in entity_list:
            ent = ent_dict["ent"]
            coord_list.append({
                "page": ent.pop("page", ""),
                "coords": ent.pop("coord", "")
            })
            gt_labels.append(ent_dict["label"])

        # Groups with more than one mention are recorded for later inspection.
        if len(gt_labels) != 1:
            problematic_entities.append({"ent_list": entity_list, "gt_labels": gt_labels, "mag": coord_list})
        # Keep the last label as the group's label. Index instead of pop() so
        # the list stored in `problematic_entities` keeps all of its labels.
        gt_label = gt_labels[-1]

        # `ent` is the last mention of the group (now without page/coord).
        ent_cand_label.append({"entity": ent, "candidates": candidates, "occurences": coord_list, "labels": gt_label})

        # Checkpoint every 100 groups under a numbered file name.
        if i % 100 == 0:
            with open(f"data/candidates/{generator}/candidates-gnd-{split}-{i}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)

    with open(f"data/candidates/{generator}/candidates-gnd-{split}.pkl", "wb") as f:
        pickle.dump(ent_cand_label, f)

# Feature Generation

Uncomment and run this to load and save the fasttext model

In [None]:
# from gensim.models.fasttext import FastText, load_facebook_vectors
# model = load_facebook_vectors("cc.de.300.bin/cc.de.300.bin")
# model.save("./fasttext")

In [12]:
import pickle
from tqdm import  tqdm
from feature_generation import create_features, get_gnd
import pickle
from tqdm.notebook import  tqdm
from feature_generation import candidates_to_features, process_fuseki_candidates
from candidate_generation import create_metagrid_candidates, get_candidates_fuseki

FileNotFoundError: [Errno 2] No such file or directory: './fasttext'

In [None]:
# Turn the stored candidates into feature vectors. Entities whose mentions
# carry more than one distinct label are expanded into one record per label and
# written to a separate "problematic" file.
# NOTE(review): the candidate cells visible in this notebook write to
# data/candidates/<generator>/ and store the key "labels"; this cell reads
# data/candidates/ and the key "label" - confirm which files it is meant for.
for split in ("train", "eval", "test"):
    with open(f"data/candidates/candidates-gnd-{split}.pkl", "rb") as f:
        ent_cand_label = pickle.load(f)

    list_of_good_entities = []
    list_of_problematic_entities = []
    for ent_dict in tqdm(ent_cand_label):
        distinct_labels = set(ent_dict["label"])

        if len(distinct_labels) <= 1:
            # Unambiguous entity: collapse the label collection to its single value.
            ent_dict["label"] = distinct_labels.pop()
            ent_dict.update(candidates_to_features(
                ent=ent_dict["entity"],
                candidates=ent_dict["candidates"],
                gt_label=ent_dict["label"],
            ))
            list_of_good_entities.append(ent_dict)
            continue

        # Ambiguous entity: emit one shallow copy of the record per label.
        for label in distinct_labels:
            ent_dict["label"] = label
            ent_dict.update(candidates_to_features(
                ent=ent_dict["entity"],
                candidates=ent_dict["candidates"],
                gt_label=ent_dict["label"],
            ))
            list_of_problematic_entities.append(ent_dict.copy())

    with open(f"data/features/{split}.pkl", "wb") as f:
        pickle.dump(list_of_good_entities, file=f)

    with open(f"data/features/{split}_problematic.pkl", "wb") as f:
        pickle.dump(list_of_problematic_entities, file=f)

## Unsupervised Feature Generation

In [None]:
import numpy as np
import logging
import pickle
from tqdm.notebook import  tqdm
import importlib
import unsupervised.raw_text_driver
importlib.reload(unsupervised.raw_text_driver)
import unsupervised.portal_dnb_driver
importlib.reload(unsupervised.portal_dnb_driver)
import unsupervised.data_loader
# Load the training features produced by the feature-generation section.
train_features_path = "data/features/train.pkl"
with open(train_features_path, "rb") as f:
    list_of_good_entities = pickle.load(f)

# Pick up edits to the module without restarting the kernel.
importlib.reload(unsupervised.data_loader)

# REPLACE THIS RAW DATA PATH (it points into the original author's home directory)
data_loader = unsupervised.data_loader.DataLoader(
    raw_data_path='/home/aheser/ETH-CS4NLP-22-Project-Linking-GND/data/input/raw'
)

In [None]:
# Append context-distance features to every feature vector. Entities from the
# "problematic" files get the mean distance vector instead (see below).
logging.basicConfig(level=logging.WARNING)
avg_distance_counter = 0
# Running sum of all distance vectors. It must start at zero: the previous
# np.array((1,)) is the array [1], which biased the mean by 1/N. A length-1
# array broadcasts against the first distance vector that is added.
avg_distance = np.zeros(1)
for split in ["train", "eval", "test"]:
# for split in ["eval"]:
    print('processing', split)
    with open(f"data/features/{split}.pkl", "rb") as f:
        list_of_good_entities = pickle.load(f)

    for current_entity in tqdm(list_of_good_entities):
        distances = data_loader.get_context_distances(current_entity, similarity_measure='cosine_similarity', window_size=10)
        # distances[k] is appended to the k-th feature vector of the entity.
        for feature_counter, feature_vector in enumerate(current_entity['features']):
            feature_vector.extend(distances[feature_counter])
            avg_distance = avg_distance + np.array(distances[feature_counter])
            avg_distance_counter += 1

    with open(f"data/features/unsupervised_{split}.pkl", "wb") as f:
        pickle.dump(list_of_good_entities, file=f)

# Mean distance vector over all feature vectors processed above.
avg_distance = avg_distance / avg_distance_counter
# problematic ones (we don't have any vectors, use mean)
for split in ["train", "eval", "test"]:
    print('processing', split)
    with open(f"data/features/{split}_problematic.pkl", "rb") as f:
        list_of_problematic_entities = pickle.load(f)

    for current_entity in tqdm(list_of_problematic_entities):
        for feature_vector in current_entity['features']:
            feature_vector.extend(avg_distance)

    with open(f"data/features/unsupervised_{split}_problematic.pkl", "wb") as f:
        pickle.dump(list_of_problematic_entities, file=f)

# Ranking

## Load Data

In [None]:
import pickle
from tqdm import tqdm
import numpy as np
from evaluation import perform_experiment, plot_metrics_over_threshold, crossvalidate_experiment

In [None]:
# Load the unsupervised feature files: `d` holds the clean entities,
# `d_problem` the problematic ones, and `d_combined` both concatenated per split.
d = {}
for split in ("train", "eval", "test"):
    with open(f"data/features/unsupervised_{split}.pkl", "rb") as f:
        d[split] = pickle.load(file=f)

d_problem = {}
for split in ("train", "eval", "test"):
    with open(f"data/features/unsupervised_{split}_problematic.pkl", "rb") as f:
        d_problem[split] = pickle.load(file=f)

d_combined = {
    name: d[name] + d_problem[name]
    for name in ("train", "eval", "test")
}

In [None]:
# for counter, x in enumerate(d_combined['test']):
#     for feat in x['features']:
#         for feat2 in feat:
#             if np.isnan(feat2):
#                 print(counter)
#                 break

# Display the feature vectors of the first problematic test entity.
d_problem['test'][0]['features']

# Problematic Entries

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Single experiment: fit on train + eval (clean and problematic entities
# combined) and score on the test split.
ent_scores, ment_scores = perform_experiment(
    # data
    train=d_combined["train"] + d_combined["eval"],
    eval=d_combined["test"],
    # model
    model=ExtraTreesRegressor(
        n_estimators=100,
        random_state=0,
        criterion="squared_error",
        bootstrap=True,
    ),
    # sampling
    keep_empty=True,
    do_sample=True,
    oversampling=3,  # multiple of how often we oversample y = 1
    balance=3,  # multiple of y = 0 samples vs y = 1 samples
    # ranking cut-offs
    n_s=[1, 10],  # how many candidates do we keep
    thresholds=[0.01],  # where do we cut off
    verbose=False,  # print stuff
)

In [None]:
# Print the scores of the last experiment, first per entity, then per mention.
for heading, level_scores in (("\nEntity Level", ent_scores), ("Mention Level", ment_scores)):
    print(heading)
    for score in level_scores:
        print("N:", score["top_n"], "Threshold:", score["threshold"])
        score["score"].print_scores()

Perform crossvalidation experiments like this:

In [None]:
 ent_scores, ment_scores = crossvalidate_experiment(
    train=d_combined["train"], # + d_combined["eval"],
    eval=d_combined["eval"],
    n_fold = 5,
    keep_empty=True,
    do_sample=True,
    oversampling=3, # Multiple of how often we oversample y = 1
    balance=3, # multiple of y = 0 samples vs y = 1 samples
    model=ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True),
    n_s=[1,10], # How many candidates do we keep
    thresholds=[0.01, 0.1, 0.3, 0.5], # Where do we cut off
    verbose=False # Print stuff
)

In [None]:
# Print the cross-validation scores, first per entity, then per mention.
for heading, level_scores in (("\nEntity Level", ent_scores), ("Mention Level", ment_scores)):
    print(heading)
    for score in level_scores:
        print("N:", score["top_n"], "Threshold:", score["threshold"])
        score["score"].print_scores()

# Different Models Cross Validation

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Display names, in the same order as the estimators in `models` below
# (the two lists are zipped together by the grid search).
model_names = [
    "Gradient Boosting Regressor",
    "Extra Trees Regressor Squared",
    "Extra Trees Regressor Absolute",
    "Extra Trees Classifier",
    "Elastic Net",
    "Gaussian Process",
]

# Candidate estimators, all seeded with random_state=0.
models = [
    GradientBoostingRegressor(
        loss='squared_error', n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0
    ),
    ExtraTreesRegressor(criterion="squared_error", n_estimators=100, bootstrap=True, random_state=0),
    ExtraTreesRegressor(criterion="absolute_error", n_estimators=100, bootstrap=True, random_state=0),
    ExtraTreesClassifier(n_estimators=100, bootstrap=True, random_state=0),
    ElasticNet(random_state=0),
    GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(), random_state=0),
]

In [None]:
# models = [ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True)]
# model_names = ["Tree Reg Squared"]

In [None]:
# Sampling grid: values tried for `oversampling` and `balance` when do_sample is on.
oversampling_list = [2, 3]
balance_list = [2, 3]
# Ranking grid: score cut-offs and numbers of candidates kept.
thresholds = [0.01, 0.1, 0.2]
n_s = [1, 10]

In [None]:
def _run_cv_setup(model, model_name, data, data_name, keep_empty, do_sample, balance, oversampling):
    """Cross-validate one parameter combination and return its result record."""
    ent_scores, ment_scores = crossvalidate_experiment(
        d=data,
        n_fold=5,
        keep_empty=keep_empty,
        do_sample=do_sample,
        oversampling=oversampling,
        balance=balance,
        model=model,
        n_s=n_s,
        thresholds=thresholds,
        verbose=False,
    )
    return {
        "keep_empty": keep_empty,
        "do_sample": do_sample,
        "balance": balance,
        "oversampling": oversampling,
        "ent_scores": ent_scores,
        "ment_scores": ment_scores,
        "model": model_name,
        "data": data_name,
    }


# Grid search over model x data set x keep_empty x do_sample
# (x balance x oversampling when sampling is enabled).
# Each model contributes ONE list of records, so `model_results_detailed`
# is a list of lists.
model_results_detailed = []

for model, model_name in zip(models, model_names):
    print(model_name)
    results = []
    for data, data_name in zip([d, d_combined], ["cleaned", "combined"]):
        print("Data:", data_name)
        for keep_empty in [True, False]:
            print("keep_empty:", keep_empty)
            for do_sample in [True, False]:
                print("do_sample:", do_sample)
                if not do_sample:
                    # Without sampling the two sampling parameters are fixed to 1.
                    balance = 1
                    oversampling = 1
                    results.append(_run_cv_setup(
                        model, model_name, data, data_name,
                        keep_empty, do_sample, balance, oversampling,
                    ))
                    continue
                for balance in balance_list:
                    print("Balance:", balance)
                    for oversampling in oversampling_list:
                        print("Oversampling:", oversampling)
                        results.append(_run_cv_setup(
                            model, model_name, data, data_name,
                            keep_empty, do_sample, balance, oversampling,
                        ))
    model_results_detailed.append(results)

In [None]:
# Persist the grid-search results. Make sure the target directory exists first,
# so a missing folder does not make the dump of the finished search fail.
import os

os.makedirs('parameter_tuning', exist_ok=True)
with open('parameter_tuning/model_results_detailed.pkl', 'wb') as out:
    pickle.dump(model_results_detailed, out)

In [None]:
import os

# Plot the metrics over the thresholds for every balance/oversampling pair of
# one fixed setup (selected by the four variables below).
do_sample = True
keep_empty = True
# NOTE(review): "Tree Reg Squared" only appears in the commented-out single-model
# alternative; the active model_names list uses "Extra Trees Regressor Squared".
# Also, `results` holds only the last model's runs from the grid-search cell -
# confirm this is the intended selection.
model = "Tree Reg Squared"
data = "combined"

import os

os.makedirs('plots', exist_ok=True)

for balance in balance_list:
    for oversampling in oversampling_list:
        plot_metrics_over_threshold(
            thresholds=thresholds,
            n_s=n_s,
            oversampling=oversampling,
            balance=balance,
            # Pass the variables defined above instead of repeating the literals,
            # so editing the setup in one place is enough.
            do_sample=do_sample,
            keep_empty=keep_empty,
            model=model,
            data=data,
            results=results)

In [None]:
# Cut-off (top_n) whose scores the aggregation below reports. Per the original
# note, top_10 scores are necessarily better than top_1 scores, so choose here
# which one to get metrics about.
top_n = 1

In [None]:
def extract(scores, score_name, dict, current_setup, top_n):
    """Update the running best value of one metric and the setups reaching it.

    Args:
        scores: Iterable of records with the keys "score" (an object whose
            ``get_score()`` returns a mapping of metric name to value),
            "top_n" and "threshold".
        score_name: Metric to look at, e.g. "F1".
        dict: Running summary holding ``top_<score_name>`` (best value so far)
            and ``top_<score_name>_setup`` (list of setups reaching it). It is
            updated in place. The name shadows the builtin and is kept only
            for backward compatibility with existing callers.
        current_setup: Parameters shared by all ``scores``. It is not modified;
            each recorded setup is a copy extended with "top_n" and "threshold".
        top_n: Only records with this ``top_n`` are considered.

    Returns:
        The same ``dict`` object that was passed in.
    """
    best_key = f"top_{score_name}"
    setup_key = f"top_{score_name}_setup"
    for score in scores:
        if score["top_n"] != top_n:
            continue
        value = score["score"].get_score()[score_name]
        # Copy per record: storing one shared dict would leave every recorded
        # setup with the top_n/threshold of the last record seen.
        setup = {**current_setup, "top_n": score["top_n"], "threshold": score["threshold"]}
        if value > dict[best_key]:
            dict[best_key] = value
            dict[setup_key] = [setup]
        elif value == dict[best_key]:
            dict[setup_key].append(setup)
    return dict

# Per model, find the best F1 / Recall / Precision at entity and mention level
# together with the setups that reach it.

# The grid search stores one LIST of result records per model, so indexing an
# element with ["model"] fails. Flatten first; plain records are accepted too.
flat_results = []
for entry in model_results_detailed:
    if isinstance(entry, list):
        flat_results.extend(entry)
    else:
        flat_results.append(entry)

model_results = []
for model_name in model_names:
    dictionary = {
        scoring_level: {
            "top_F1": 0,
            "top_Recall": 0,
            "top_Precision": 0,
            "top_F1_setup": [],
            "top_Recall_setup": [],
            "top_Precision_setup": [],
        }
        for scoring_level in ("ent", "ment")
    }
    for di in flat_results:
        if di["model"] != model_name:
            continue
        for scoring_level in ["ent", "ment"]:
            scores = di[f"{scoring_level}_scores"]
            # Parameters of this run; extract() adds top_n and threshold.
            curr_setup = {
                "data": di["data"],
                "do_sample": di["do_sample"],
                "balance": di["balance"],
                "oversampling": di["oversampling"],
                "keep_empty": di["keep_empty"]
            }
            for score_name in ["F1", "Recall", "Precision"]:
                dictionary[scoring_level] = extract(scores, score_name, dictionary[scoring_level], curr_setup, top_n=top_n)
    model_results.append(dictionary)


In [None]:
# Metric whose best setups are summarised below; the aggregation tracks
# "F1", "Recall" and "Precision".
score = "F1"

In [None]:
# For every model and scoring level, print the best value of `score` and the
# mean / median of the parameters of all setups that reach it.
# Statistics are collected per scoring level, so the "ment" numbers no longer
# include the "ent" setups. The accumulators also no longer overwrite the
# notebook-level `top_n`, `thresholds`, `balance`, `oversampling`, `do_sample`
# and `keep_empty` variables used by earlier cells.
for scores, model_name in zip(model_results, model_names):
    print(model_name)
    for scoring_level in ["ent", "ment"]:
        print(f"{scoring_level}\t", f"{score}:\t", scores[scoring_level][f"top_{score}"])
        print(f"{scoring_level}\t", f"{score} Setup:")
        best_setups = scores[scoring_level][f"top_{score}_setup"]

        # Printed label -> one value per setup (booleans and the data set are
        # encoded as 0/1; "cleaned" data counts as 0, anything else as 1).
        setup_stats = {
            "sample": [1 if setup["do_sample"] else 0 for setup in best_setups],
            "empty ": [1 if setup["keep_empty"] else 0 for setup in best_setups],
            "combin": [0 if setup["data"] == "cleaned" else 1 for setup in best_setups],
            "tresh ": [setup["threshold"] for setup in best_setups],
            "top_n ": [setup["top_n"] for setup in best_setups],
            "overs ": [setup["oversampling"] for setup in best_setups],
            "balanc": [setup["balance"] for setup in best_setups],
        }

        print(f"Number of setups: {len(best_setups)}")
        if not best_setups:
            # Nothing to average; np.mean([]) would only produce nan and a warning.
            print("\n\n")
            continue

        for title, reducer in (("Mean", np.mean), ("Median", np.median)):
            print(title)
            for label, values in setup_stats.items():
                print(label, reducer(values))
        print("\n\n")
            

After some tests we find:
- ExtraTreesRegressor works best
- with: 
    - Balance = 1
    - Oversampling = 1
    - do_sample = True
    - keep_empty = True
    - threshold = 0.1 or smaller is more stable in adverse conditions; for balance 1 and oversampling 1 it does not matter -> this makes the regression more stable
    - => n does not matter much; it can almost go down to 1

- problematic entities seem to help 