# Evaluating a trained link predictor

We will take a closer look at how a trained link predictor performs in specific cases.

In [1]:
import logging
import os.path as osp

import matplotlib.pyplot as plt
import pandas as pd
from pykeen.evaluation import RankBasedEvaluator
from pykeen.triples import TriplesFactory
import torch
from tqdm.notebook import tqdm

import sys, os
sys.path.append(os.getcwd() + os.sep + os.pardir)

Change this cell to generate a report for other datasets/models:

In [9]:
# Identifier of the trained model to report on; the loading cell below reads
# its files from ../models/<MODEL_ID>/.
MODEL_ID = '1r75g9na'

## Loading model and data

In [10]:
# Everything belonging to this run lives under ../models/<MODEL_ID>/.
base_path = osp.join('..', 'models', MODEL_ID)
model_path = osp.join(base_path, 'trained_model.pkl')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location remaps the stored tensors onto `device` while unpickling, so a
# checkpoint that was saved from a GPU can still be opened on a CPU-only machine.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. Recent PyTorch releases may also require
# weights_only=False to load a whole pickled model; confirm for the installed version.
model = torch.load(model_path, map_location=device).to(device)
train = TriplesFactory.from_path_binary(osp.join(base_path, 'training_triples'))

In [11]:
# The validation and test triples are read from files in the same directory.
graph_path = osp.join('..', 'data', 'biokgb', 'graph')
valid_triples = 'biokg.links-valid.csv'
test_triples = 'biokg.links-test.csv'

# Both splits reuse the entity/relation ID mappings of the training factory.
shared_id_maps = dict(entity_to_id=train.entity_to_id,
                      relation_to_id=train.relation_to_id)
valid = TriplesFactory.from_path(osp.join(graph_path, valid_triples), **shared_id_maps)
test = TriplesFactory.from_path(osp.join(graph_path, test_triples), **shared_id_maps)

## Evaluation

In [12]:
# Rank-based evaluator with filtered=True; the evaluation cells below pass the
# training and validation triples to it as additional filter triples.
evaluator = RankBasedEvaluator(filtered=True)

### Evaluation over the full set of relation types

In [None]:
# Evaluate on every test triple, handing the training and validation triples
# to the evaluator as additional filter triples.
known_triples = [train.mapped_triples, valid.mapped_triples]
results = evaluator.evaluate(
    model,
    test.mapped_triples,
    additional_filter_triples=known_triples,
)

In [None]:
results.get_metric('both.realistic.hits_at_1')

### Evaluating over specific relation types

In [13]:
# Evaluate the model separately on the test triples of each relation type.
# The filter triples are the same for every relation, so the list is built once.
filter_triples = [train.mapped_triples, valid.mapped_triples]
result_dicts = []
for relation in tqdm(train.relation_to_id, desc='Evaluating over each relation'):
    triples_subset = test.new_with_restriction(relations=[relation])
    # Relations without any test triple have nothing to rank; leave them out.
    if triples_subset.num_triples == 0:
        continue
    subset_result = evaluator.evaluate(model,
                                       triples_subset.mapped_triples,
                                       additional_filter_triples=filter_triples,
                                       batch_size=16)
    # Keep the triple count so a micro-average can be computed later on.
    result_dicts.append({'results': subset_result,
                         'relation': relation,
                         'count': triples_subset.num_triples})

Evaluating over each relation:   0%|          | 0/17 [00:00<?, ?it/s]

Evaluating on cuda:0:   0%|          | 0.00/878 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/1.65k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/121k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/261 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/293 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/527 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/13.0 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/6.75k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/67.0 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/426 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/745 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/35.0 [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/8.13k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/10.7k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/9.76k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/23.9k [00:00<?, ?triple/s]

Evaluating on cuda:0:   0%|          | 0.00/249 [00:00<?, ?triple/s]

In [14]:
# One row per relation: its name, its number of test triples, and every metric
# of the flattened evaluation result as a separate column.
per_relation_rows = []
for entry in result_dicts:
    row = {'relation': entry['relation'], 'count': entry['count']}
    row.update(entry['results'].to_flat_dict())
    per_relation_rows.append(row)
results_df = pd.DataFrame(per_relation_rows)
results_df

Unnamed: 0,relation,count,head.optimistic.adjusted_arithmetic_mean_rank,tail.optimistic.adjusted_arithmetic_mean_rank,both.optimistic.adjusted_arithmetic_mean_rank,head.realistic.adjusted_arithmetic_mean_rank,tail.realistic.adjusted_arithmetic_mean_rank,both.realistic.adjusted_arithmetic_mean_rank,head.pessimistic.adjusted_arithmetic_mean_rank,tail.pessimistic.adjusted_arithmetic_mean_rank,...,both.pessimistic.z_hits_at_k,head.optimistic.adjusted_hits_at_k,tail.optimistic.adjusted_hits_at_k,both.optimistic.adjusted_hits_at_k,head.realistic.adjusted_hits_at_k,tail.realistic.adjusted_hits_at_k,both.realistic.adjusted_hits_at_k,head.pessimistic.adjusted_hits_at_k,tail.pessimistic.adjusted_hits_at_k,both.pessimistic.adjusted_hits_at_k
0,COMPLEX_IN_PATHWAY,878,0.009246,0.002842,0.006043,0.009246,0.002842,0.006043,0.009246,0.002842,...,2579.823082,0.363266,0.832558,0.597912,0.363266,0.832558,0.597912,0.363266,0.832558,0.597912
1,COMPLEX_TOP_LEVEL_PATHWAY,1649,0.167434,0.000121,0.083216,0.167434,0.000121,0.083217,0.167434,0.000121,...,3019.81545,0.043571,0.981199,0.512385,0.043571,0.981199,0.512385,0.043571,0.981199,0.512385
2,DDI,120830,0.015371,0.010181,0.012778,0.015371,0.010181,0.012778,0.015371,0.010181,...,4097.879499,0.06487,0.097531,0.081201,0.06487,0.097531,0.081201,0.06487,0.097531,0.081201
3,DISEASE_GENETIC_DISORDER,261,0.001612,0.031104,0.016357,0.001612,0.031104,0.016357,0.001612,0.031104,...,2221.720149,0.950187,0.938692,0.944439,0.950187,0.938692,0.944439,0.950187,0.938692,0.944439
4,DISEASE_PATHWAY_ASSOCIATION,293,0.030052,0.016021,0.023036,0.030052,0.016021,0.023036,0.030052,0.016021,...,939.906993,0.105718,0.648431,0.377074,0.105718,0.648431,0.377074,0.105718,0.648431,0.377074
5,DPI,527,0.032448,0.021712,0.02708,0.032448,0.021712,0.02708,0.032448,0.021712,...,519.877581,0.092894,0.218143,0.155518,0.092894,0.218143,0.155518,0.092894,0.218143,0.155518
6,DRUG_CARRIER,13,0.062889,0.002433,0.032658,0.062889,0.002433,0.032658,0.062889,0.002433,...,262.494808,0.076836,0.92307,0.499953,0.076836,0.92307,0.499953,0.076836,0.92307,0.499953
7,DRUG_DISEASE_ASSOCIATION,6749,0.066737,0.008736,0.03773,0.066737,0.008736,0.03773,0.066737,0.008736,...,358.556725,0.001387,0.058587,0.029987,0.001387,0.058587,0.029987,0.001387,0.058587,0.029987
8,DRUG_ENZYME,67,0.027691,0.000521,0.014102,0.027691,0.000521,0.014102,0.027691,0.000521,...,426.861402,0.029759,0.686538,0.358148,0.029759,0.686538,0.358148,0.029759,0.686538,0.358148
9,DRUG_PATHWAY_ASSOCIATION,426,0.028615,0.002396,0.015505,0.028615,0.002396,0.015505,0.028615,0.002396,...,564.186253,0.077378,0.298056,0.187717,0.077378,0.298056,0.187717,0.077378,0.298056,0.187717


Here we save the results to a tab-separated file (note that it keeps a `.csv` extension), so we can load it later and make plots.

In [15]:
# Written next to the model files; the separator is a tab even though the
# file name ends in .csv.
results_by_relation_path = osp.join(base_path, 'results_by_relation.csv')
results_df.to_csv(results_by_relation_path, sep='\t', index=False)

---

In [None]:
# The first two columns hold the relation name and its triple count; all
# remaining columns are metrics, averaged here with equal weight per relation.
metric_columns = results_df.columns[2:]
restricted_rels_macro_performance = results_df[metric_columns].mean(axis=0)
restricted_rels_macro_performance

Note that this is **not** the same as the original, unrestricted evaluation. When restricting by relation, the average above is a *macro-average*, where all relations are weighted equally. In the unrestricted scenario, we average over all triples, which is a *micro-average* where more frequent relations are weighted higher:

In [None]:
# `results` is the unrestricted evaluation over the whole test set computed
# above; flatten it the same way the per-relation results were flattened.
results_all_rels_dict = results.to_flat_dict()
results_all_rels_dict

Since we have the triple counts for each relation, we can compute a micro-average instead:

In [None]:
# Weight every relation's metrics by its number of test triples. The count
# column of results_df is named 'count' (lower case).
metric_values = results_df[results_df.columns[2:]]
triple_counts = results_df['count']
restricted_rels_micro_performance = metric_values.mul(triple_counts, axis=0).sum(axis=0) / triple_counts.sum()
restricted_rels_micro_performance

How do MRR, H@k, and AMR correlate?

In [None]:
def plot_metric_pair(results_df, metric_1: str, metric_2: str):
    """Scatter one link prediction metric against another.

    Each call draws into a new figure. ``metric_1`` and ``metric_2`` are
    column names of ``results_df``; the first is placed on the x axis and the
    second on the y axis, and each axis is labelled with its column name.
    """
    _, axes = plt.subplots()
    axes.scatter(results_df[metric_1], results_df[metric_2])
    axes.set_xlabel(metric_1)
    axes.set_ylabel(metric_2)

# results_df stores metrics under flattened '<side>.<rank type>.<metric>' names
# and the triple count under 'count'.
# NOTE(review): the names below follow the pattern of the columns shown above
# (e.g. 'both.realistic.adjusted_arithmetic_mean_rank'); confirm the MRR and
# hits@10 keys against results_df.columns for the installed PyKEEN version.
mrr_column = 'both.realistic.inverse_harmonic_mean_rank'
plot_metric_pair(results_df, mrr_column, 'both.realistic.hits_at_10')
plot_metric_pair(results_df, mrr_column, 'both.realistic.adjusted_arithmetic_mean_rank')
plot_metric_pair(results_df, mrr_column, 'count')

What are the relations where the model performs better?

In [None]:
def per_relation_plot(results_df, metric: str):
    """Draw a horizontal bar chart of one metric, with one bar per relation.

    ``results_df`` needs a 'Relation' column and a column named ``metric``;
    the rows are sorted by the metric value before plotting.
    """
    ranked = results_df[['Relation', metric]].sort_values(by=metric)
    ranked.plot.barh(x='Relation', figsize=(5, 5), grid=True)

In [None]:
# per_relation_plot expects a 'Relation' column, while results_df names it
# 'relation' and stores metrics under flattened '<side>.<rank type>.<metric>' names.
# NOTE(review): confirm the hits@10 key against results_df.columns.
per_relation_plot(results_df.rename(columns={'relation': 'Relation'}), 'both.realistic.hits_at_10')

### Evaluating over specific entity and relation types

The source csv files contain the triples, plus extra information like the types of the entities involved in the triple. We will extract the type information.

In [None]:
# Read the processed, tab-separated training triples with every column as a string.
# NOTE(review): DATA_PATH and DATASET are not defined in any cell visible above;
# define them in the configuration cell (or confirm where they come from)
# before running this on a fresh kernel.
train_df = pd.read_csv(osp.join(DATA_PATH, f'processed/{DATASET}-train.tsv'), sep='\t', dtype=str)
train_df.head()

In [None]:
def get_types_to_entities_dict(df):
    """Map each entity type to the list of entities having that type.

    ``df`` is a dataframe of triples with the columns 'src', 'src_type',
    'tgt' and 'tgt_type'. Head and tail entities are pooled together and each
    entity is counted once: if it shows up several times, its first occurrence
    (heads before tails) decides its type.

    Returns a dict from entity type (str) to a list of entities.
    """
    column_maps = (
        {'src': 'entity', 'src_type': 'type'},
        {'tgt': 'entity', 'tgt_type': 'type'},
    )
    # Bring both sides of the triples to the same (entity, type) layout.
    per_side = [df[list(mapping)].rename(columns=mapping) for mapping in column_maps]
    unique_entities = pd.concat(per_side).drop_duplicates(subset='entity')

    return unique_entities.groupby('type')['entity'].apply(list).to_dict()

In [None]:
# Build the type -> entities mapping and report how many entities each type has.
type_to_entities = get_types_to_entities_dict(train_df)
for entity_type in type_to_entities:
    type_size = len(type_to_entities[entity_type])
    print(f'{entity_type}: {type_size:,} entities')

We can now get a list of e.g. Diseases with this dictionary:

In [None]:
type_to_entities['Disease'][:5]

We can now run the evaluation by relation type, separately for heads and tails of a specific type:

In [None]:
def get_side_prediction_results(model, evaluator, mapped_triples, restrict_entities_to, side: str, relation: str):
    """Evaluate ``mapped_triples`` and collect the metrics of one prediction side.

    The evaluator is called with ``restrict_entities_to`` forwarded as given,
    and with time-consuming checks and the progress bar switched off. From the
    resulting metrics dataframe only the rows whose 'Side' equals ``side`` and
    whose 'Type' is 'avg' are kept.

    Returns a dict holding 'Relation' (``relation``), 'Side' (``side``) and one
    entry per remaining metric name with its value. ``side`` must be 'head' or
    'tail'; anything else fails the assertion.
    """
    assert side in {'head', 'tail'}

    metric_results = evaluator.evaluate(
        model,
        mapped_triples,
        restrict_entities_to=restrict_entities_to,
        do_time_consuming_checks=False,
        use_tqdm=False,
    )
    metrics_df = metric_results.to_df()
    wanted_rows = (metrics_df['Side'] == side) & (metrics_df['Type'] == 'avg')
    side_df = metrics_df.loc[wanted_rows]

    # Metric entries come after the two label entries, as in a dict update.
    metric_values = dict(zip(side_df['Metric'].values, side_df['Value'].values))
    return {'Relation': relation, 'Side': side, **metric_values}

# Evaluate every relation twice on its validation triples: once with candidates
# restricted to the entities of the head type and once to those of the tail type.
# NOTE(review): this cell rebinds `results` and `evaluator` from the earlier
# evaluation cells; re-run those cells before going back to that section.
results = []
evaluator = RankBasedEvaluator()
# Iterate over the relations of the training factory, as the per-relation
# evaluation above does; `valid` was built with this same relation mapping.
for relation in tqdm(train.relation_to_id, desc='Evaluating over each relation'):
    # The last two '_'-separated parts of the relation name are taken as the
    # head and tail entity types.
    # NOTE(review): a name with fewer than two parts (e.g. 'DDI') fails to
    # unpack here, and the parts must be keys of type_to_entities -- confirm
    # the relation naming scheme of the dataset in use.
    relation_parts = relation.split('_')
    head_type, tail_type = relation_parts[-2:]

    # Create subsets based on entity and relation
    triples_subset = valid.new_with_restriction(relations=[relation])
    if triples_subset.num_triples == 0:
        continue

    head_entities = type_to_entities[head_type]
    tail_entities = type_to_entities[tail_type]
    head_ids = torch.tensor(train.entities_to_ids(head_entities), dtype=torch.long)
    tail_ids = torch.tensor(train.entities_to_ids(tail_entities), dtype=torch.long)

    head_prediction_results = get_side_prediction_results(model, evaluator, triples_subset.mapped_triples, restrict_entities_to=head_ids, side='head', relation=relation)
    tail_prediction_results = get_side_prediction_results(model, evaluator, triples_subset.mapped_triples, restrict_entities_to=tail_ids, side='tail', relation=relation)
    results.extend([head_prediction_results, tail_prediction_results])

In [None]:
entity_restrict_results_df = pd.DataFrame(results)
entity_restrict_results_df

We can check the performance when predicting the head and the tail separately, for each relation.

In [None]:
# Tag every relation name with the predicted side, so that head and tail
# prediction get separate bars; the original dataframe is left untouched.
side_tagged_relation = entity_restrict_results_df['Relation'] + '_' + entity_restrict_results_df['Side']
results_df_side_labeled = entity_restrict_results_df.assign(Relation=side_tagged_relation)
per_relation_plot(results_df_side_labeled, 'hits_at_10')

We then average the prediction for the head and the tail:

In [None]:
# The first two columns are the 'Relation' and 'Side' labels; every other
# column is a metric, averaged here over the head and tail rows of a relation.
side_metric_columns = entity_restrict_results_df.columns[2:]
results_both_df = (
    entity_restrict_results_df
    .groupby('Relation')[side_metric_columns]
    .mean()
    .reset_index()
)
results_both_df

In [None]:
per_relation_plot(results_both_df, 'hits_at_10')

Lastly, we compute the overall average of each metric across all relations.

In [None]:
# results_both_df also carries the string column 'Relation'; average the
# numeric metric columns only, which recent pandas versions no longer do implicitly.
results_both_df.mean(numeric_only=True)

Compare with the results when restricting by relation type only:

In [None]:
restricted_rels_macro_performance

Note that this is **not** the same as the original averages computed without restrictions:

In [None]:
restricted_rels_micro_performance

The micro-average is instead the following:

In [None]:
# Weight each relation's metrics by its triple count. Rows are matched by
# relation name rather than by position, because the two dataframes are built
# from different loops and need not list the same relations in the same order.
# Indexing by 'Relation' also keeps every metric column of results_both_df.
entity_restrict_metrics = results_both_df.set_index('Relation')
relation_counts = results_df.set_index('relation')['count'].reindex(entity_restrict_metrics.index)
# A relation without a count in results_df gets NaN and so drops out of both sums.
entity_restrict_metrics.mul(relation_counts, axis=0).sum(axis=0) / relation_counts.sum()

We can see that restricting predictions to the correct domain and range of a relation only slightly improves the results.