# Preprocessing

In [None]:
import pickle
import json
import os
from preprocessing import clean_gt, clean_raw, label_entity

: 

In [None]:
split = pickle.load(open('/data/train_test_eval_filenames.pkl', 'rb'))

In [None]:
data = {
    "train": [],
    "test": [],
    "eval": []
}
gt_data = []
for mag in ["dkm", "sbz"]:
    for year in os.listdir(f'data/raw/link/{mag}'):
        with open(os.path.join("data/raw/link", mag, year)) as f:
            input = json.load(f)
        with open(os.path.join("data/ground-truth", mag, year)) as f:
            gt = json.load(f)
        gt = clean_gt(gt)
        gt_data += gt
        input = clean_raw(input)
        for ent_variations in input:
            for key in split:
                ent_instances = []
                for ent in ent_variations:
                    if ent["page"] in split[key]:
                        ent_instances.append({"ent": ent, "label": label_entity(ent, gt)})
                if ent_instances:
                    data[key].append(ent_instances)

with open("/data/preprocessed.pkl", "wb") as f:
    pickle.dump(data, f)

# Candidate Generation

In [None]:
import pickle
from tqdm.notebook import  tqdm
from candidate_generation import create_metagrid_candidates, get_coords_from_entitiy

In [None]:
with open("/data/preprocessed.pkl", "rb") as f:
    data = pickle.load(f)

In [None]:
for split in ["train", "eval", "test"]:
    ent_cand_label = []
    i = 0
    for entity_list in tqdm(data[split], smoothing=0.01):
        i += 1
        for ent_dict in entity_list:
            list_of_tuples = []
            ent = ent_dict["ent"]
            ent.update(get_coords_from_entitiy(ent))
            list_of_tuples.append((ent, create_metagrid_candidates(ent=ent_dict["ent"]), ent_dict["label"]))
        ent_cand_label.append(list_of_tuples)
        if i % 100 == 0:
            with open(f"data/candidates/gnd-{split}.pkl", "wb") as f:
                pickle.dump(ent_cand_label, f)
                    
    with open(f"data/candidates/gnd-{split}.pkl", "wb") as f:
        pickle.dump(ent_cand_label, f)

# Feature Generation

In [None]:
import pickle
from tqdm.notebook import  tqdm
from feature_generation import create_features, get_gnd

In [None]:
for split in ["train", "eval", "test"]:
    with open(f"data/candidates/gnd-{split}.pkl", "rb") as f:
        ent_cand_label = pickle.load(f)
    X = []
    y = []
    X_ent = []
    y_ent = []

    def get_gnd(candidate):
        return candidate["Gnd"]

    for ent, candidates, label in tqdm(ent_cand_label):
        y_ent_int = []
        X_ent_int = []
        for c in candidates:
            features = create_features(ent, c)
            X_ent_int.append([get_gnd(c)] + features)
        y_ent.append([label])
        X_ent.append(X_ent_int)
    for filename, file in zip([f"y_ent", f"X_ent"], [y_ent, X_ent]):
        with open(f"data/features/{split}/{filename}.pkl", "wb") as f:
            pickle.dump(file, file=f)

# Ranking

## Load Data

In [2]:
import pickle
from tqdm.notebook import tqdm
import numpy as np

In [3]:
d = {"train": {}, "eval": {}}
for split in ["train", "eval"]:
    for filename in ["y", "X", "y_ent", "X_ent"]:
        with open(f"data/features/{split}/{filename}.pkl", "rb") as f:
            d[split][filename] = pickle.load(file=f)

In [4]:
X_train = []
y_train = []
X_test = []
y_test = []
y_test_ids = []
x_test_ids = []

for x_ent, y_ent in zip(d["train"]["X_ent"], d["train"]["y_ent"]):
    for x in x_ent:
        X_train.append(x[1:])
        if x[0] == y_ent[0]:
            y_train.append(1)
        else:
            y_train.append(0)

for x_ent, y_ent in zip(d["eval"]["X_ent"], d["eval"]["y_ent"]):
    for x in x_ent:
        X_test.append(x[1:])
        if x[0] == y_ent[0]:
            y_test.append(1)
        else:
            y_test.append(0)
        x_test_ids.append(x[0])
        y_test_ids.append(y_ent[0])

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame(X_train)
df["y"] = y_train

def fun(x):
    if x.shape[0] > 49:
        return x.sample(frac = 0.1)
    else:
        return x.sample(147, replace=True)

sample = df.groupby('y', group_keys=False).apply(lambda x: fun(x))

y_sample = sample["y"]
x_sample = sample[[0, 1, 2, 3, 4, 5, 6, 7]]

# Training

In [9]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(n_estimators=100, random_state=0, criterion="squared_error", bootstrap=True)
model.fit(x_sample, y_sample)

## Evaluation

In [10]:
def rank_candidates(candidates, model):
    candidate_scores = []
    candidate_ids = []
    for candidate in candidates:
        features = candidate[1:]
        score = model.predict(np.array(features).reshape(1,-1))[0]
        candidate_scores.append(score)
        candidate_ids.append(candidate[0])
    if candidate_ids:
        candidate_ids = np.array(candidate_ids)
        indices = np.argsort(candidate_scores)
        ids_sorted = np.array(candidate_ids)[indices[::-1]]
        scores_sorted = np.array(candidate_scores)[indices[::-1]]
        return ids_sorted, scores_sorted
    else:
        return [], []

### Mention Level

In [33]:
ids = []
scores = []
y_ids = []
for x_ent, y_ent in tqdm(zip(d[split]["X_ent"], d[split]["y_ent"])):
    ids_local, scores_local = rank_candidates(x_ent, model)
    ids.append(ids_local)
    scores.append(scores_local)
    y_ids.append(y_ent[0])

0it [00:00, ?it/s]

In [34]:
treshold = 0.0001
split = "eval"

In [35]:
tp = 0
fp = 0
tn = 0
fn = 0
for ids_local, scores_local, y in zip(ids, scores, y_ids):
    for id, score in zip(ids_local, scores_local):
        if score > treshold:
            if id == y:
                tp += 1
            elif id == '':
                fn += 1
            else:
                fp += 1
        elif y == '':
            tn += 1
        else:
            fn += 1
            
print("tp, tn, fp, fn")
print(tp, tn, fp, fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2* (precision * recall)/(precision+recall)
print("F1:          ", round(f1,4))
print("Recall:      ", round(recall,4))        
print("Precision:   ", round(precision,4))
print("Accuracy:    ", round(accuracy,4))


tp, tn, fp, fn
22 1311 1432 293
F1:           0.0249
Recall:       0.0698
Precision:    0.0151
Accuracy:     0.4359


### Entity Level

In [11]:
def normalize_prediction(ids, scores, cutoff_score=0.5, n_candidates=5):
    count = 0
    result = []
    for id, score in zip(ids, scores):
        count += 1
        if count > n_candidates:
            return result
        else:
            if score > cutoff_score:
                result.append(id)
    if len(result) < 1 :
        result.append('')
    return result

In [12]:
split = "eval"

In [13]:
ids = []
scores = []
for x_ent in tqdm(d[split]["X_ent"]):
    ids_local, scores_local = rank_candidates(x_ent, model)
    ids.append(ids_local)
    scores.append(scores_local)

  0%|          | 0/177 [00:00<?, ?it/s]

In [22]:
treshold = 0.5
n = 10

In [23]:
predictions = []
for ids_local, scores_local in zip(ids, scores):
    prediction = normalize_prediction(ids_local, scores_local, cutoff_score=treshold, n_candidates = n)
    predictions.append(prediction)
tp = 0
fp = 0
tn = 0
fn = 0

for prediction, y in zip(predictions, d[split]["y_ent"]):
    if y[0] == '':
        if y[0] in prediction:
            tn +=1
        else:
            fp += 1
    else:
        if y[0] in prediction:
            tp += 1
        else:
            fn += 1

accuracy = (tp +tn)/(tp +tn +fp +fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2* (precision * recall)/(precision +recall)
print("F1:          ", round(f1,4))
print("Recall:      ", round(recall,4))        
print("Precision:   ", round(precision,4))
print("Accuracy:    ", round(accuracy,4))
print("\ntp tn  fp fn")
print(tp, tn, fp, fn)

F1:           0.381
Recall:       0.3922
Precision:    0.3704
Accuracy:     0.6328

tp tn  fp fn
20 92 34 31
