# Creating embeddings model

Prepare the environment and import the required libraries.

In [1]:
!pip install "tensorflow-gpu>=1.15.2,<2.0" ampligraph

Collecting numpy<1.19.0,>=1.16.0
  Using cached numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.6
[31mERROR: Cannot uninstall numpy 1.21.6, RECORD file not found. You might be able to recover from this via: 'pip install --force-reinstall --no-deps numpy==1.21.6'.[0m
You should consider upgrading via the '/home/dell/f1-knowledge-base/F1-knowledge-base/graph-embeddings/venv/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import ampligraph

# Report the TensorFlow version that was imported.
print(tf.version.VERSION)
# Bare last expression: the notebook displays the AmpliGraph version string.
ampligraph.__version__

  from ._conv import register_converters as _register_converters


1.15.5


'1.4.0'

First, we create a pandas DataFrame from the triples extracted from the ontology populated with individuals.


In [2]:
from rdflib import Graph, URIRef


# Namespace of the F1 ontology, the short prefix that replaces it, and the file to parse.
ONTOLOGY_IRI = "https://github.com/RogoGit/F1-knowledge-base/f1-ontology"
ONTOLOGY_PREFIX = "f1"
POPULATED_ONTOLOGY_PATH  = '../ontology-with-individuals.owl'


def _shorten(term):
    """Return `term` with the ontology namespace ("<IRI>#") replaced by the short prefix ("f1:")."""
    return term.replace(ONTOLOGY_IRI + "#", ONTOLOGY_PREFIX + ":")


# The populated ontology is parsed as Turtle.
f1_graph = Graph().parse(POPULATED_ONTOLOGY_PATH, format="turtle")

# Keep only statements whose predicate starts with the ontology IRI, and shorten
# subject, predicate and object of each kept statement.
ontology_namespace = URIRef(ONTOLOGY_IRI)
triples_list = [
    [_shorten(term) for term in statement]
    for statement in f1_graph.triples((None, None, None))
    if statement[1].startswith(ontology_namespace)
]

f1_df = pd.DataFrame(triples_list, columns=['Subject', 'Predicate', 'Object'])
print(f1_df)


                                                  Subject  \
0       f1:race_result_2009_brazilian_grand_prix_heidfeld   
1                              f1:driver_michele_alboreto   
2                                          f1:season_1995   
3           f1:race_result_1981_german_grand_prix_salazar   
4                    f1:constructor_standing_1981_toleman   
...                                                   ...   
362467    f1:race_result_1960_portuguese_grand_prix_clark   
362468  f1:race_result_1975_brazilian_grand_prix_mario...   
362469  f1:race_result_1953_german_grand_prix_graffenried   
362470  f1:race_result_1963_south_african_grand_prix_g...   
362471      f1:race_result_2012_canadian_grand_prix_resta   

                            Predicate  \
0       f1:grandPrixResultIsRelatedTo   
1         f1:hasDriverGrandPrixResult   
2                        f1:hasResult   
3                           f1:points   
4                    f1:totalPosition   
...                  

The next step is to split the triples into training and test sets for training the graph embeddings.

In [3]:
from ampligraph.evaluation import train_test_split_no_unseen

# Hold out 10% of the triples as the test set; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split_no_unseen(
    np.array(triples_list),
    test_size=0.10,
    seed=0,
)

# Report the shape of each split.
for caption, subset in (('Train set size: ', X_train), ('Test set size: ', X_test)):
    print(caption, subset.shape)

Train set size:  (326225, 3)
Test set size:  (36247, 3)


Now we define the ComplEx model and train it on the training set.

In [6]:
import tensorflow.contrib
from ampligraph.latent_features import ComplEx, save_model

# Hyperparameters of the ComplEx embedding model, gathered in one place.
complex_params = dict(
    batches_count=100,
    epochs=300,
    k=100,
    eta=20,
    optimizer='adam',
    optimizer_params={'lr': 1e-4},
    loss='multiclass_nll',
    regularizer='LP',
    regularizer_params={'p': 3, 'lambda': 1e-5},
    seed=0,
    verbose=True,
)
model = ComplEx(**complex_params)

# Train on the training split only (the recorded run took roughly 3.4 hours),
# then write the fitted model to disk so it can be restored without retraining.
model.fit(X_train)
save_model(model, './embedding_model.pkl')

Average ComplEx Loss:   0.110975: 100%|██████████| 300/300 [3:23:58<00:00, 40.79s/epoch]  


To use an already trained model instead of retraining, we can run:

In [5]:
# Alternative to retraining: load the model file written by save_model in the training cell.
import tensorflow.contrib
from ampligraph.latent_features import restore_model

# NOTE(review): the '.pkl' extension suggests a pickle; only restore files you created yourself.
model = restore_model('./embedding_model.pkl')

The next step is to check that the trained model performs well by evaluating it on the test set. We first define the filter, which ensures that no negative statements generated by the corruption procedure are actually positives.

In [None]:
from ampligraph.evaluation import evaluate_performance

# Every known true triple (train + test), passed to the evaluation as the filter.
filter_triples = np.vstack([X_train, X_test])

# Rank the test triples with the trained model.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=filter_triples,
    use_default_protocol=True,
    verbose=True,
)

Now let's summarize the ranks with the mr_score (mean rank), mrr_score (mean reciprocal rank) and hits_at_n_score functions.

In [7]:
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score

# Summary metrics computed from the ranks of the test triples.
mr = mr_score(ranks)
mrr = mrr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=cutoff) for cutoff in (10, 3, 1))

# Print each metric rounded to two decimals.
print(f"MRR: {mrr:.2f}")
print(f"MR: {mr:.2f}")
print(f"Hits@10: {hits_10:.2f}")
print(f"Hits@3: {hits_3:.2f}")
print(f"Hits@1: {hits_1:.2f}")

MRR: 0.83
MR: 388.64
Hits@10: 0.93
Hits@3: 0.88
Hits@1: 0.76


Examples of generated embeddings can be seen below:

In [6]:
# Unique subjects whose identifier starts with 'f1:team_'.
# NOTE(review): the prefix also matches 'f1:team_participation_...' subjects (see the
# printed example) - confirm that this is intended.
is_team_subject = f1_df.Subject.str.startswith('f1:team_')
teams = f1_df.Subject[is_team_subject].unique()
print(teams[:1])

# Map each selected subject to the embedding returned by the model.
team_embeddings = dict(zip(teams, model.get_embeddings(teams)))
print(list(team_embeddings.items())[:1])

# The same embeddings stacked into a single array, in dictionary order.
team_embeddings_array = np.array(list(team_embeddings.values()))
print(team_embeddings_array[:1])


['f1:team_participation_1968_john_surtees_honda']
[('f1:team_participation_1968_john_surtees_honda', array([ 0.15058921,  0.2814807 ,  0.18882453,  0.17621101, -0.01801558,
       -0.19745371, -0.15330258,  0.19007845, -0.14624396, -0.18978721,
        0.01731578,  0.16899933,  0.19650525, -0.16436966, -0.16565551,
        0.00911854, -0.01223622,  0.1442946 ,  0.07178091, -0.13925822,
       -0.17620327, -0.17334235,  0.15280165, -0.19565189, -0.1704562 ,
        0.01307552,  0.15865031, -0.17643066,  0.15020834, -0.10737054,
       -0.00249887, -0.20082456, -0.10074502, -0.15203606, -0.17853774,
       -0.09260228, -0.12076462, -0.07302562,  0.17158657,  0.00034421,
       -0.19432108,  0.14239483,  0.06417444, -0.14825055, -0.12346714,
        0.1280998 ,  0.02698476,  0.15995395, -0.13690637,  0.2305665 ,
       -0.23822747,  0.12913081, -0.10044429, -0.05091096,  0.2150847 ,
       -0.1700849 ,  0.11347755,  0.05339024,  0.01367681, -0.00865733,
        0.05937546, -0.1170646 , -0

# Link prediction

Now it's time to try to predict a missing link. We will try to predict the race result for a specified driver based on the embeddings. The first step is to add some new triples with result data, but without the driver's position at the end of the race. Assume we already know the driver number and the starting grid position and want to predict the finishing position. Let us take the actual result of Lewis Hamilton in the 2022 Azerbaijan Grand Prix, which is not present in the knowledge graph yet.

In [7]:
# Triples describing Hamilton's result in the 2022 Azerbaijan Grand Prix. The finishing
# position (f1:driverPosition) is deliberately left out: it is the link to predict.
NEW_RESULT = "f1:race_result_2022_azerbaijan_grand_prix_hamilton"
new_result_triples = [
    [NEW_RESULT, "f1:grandPrixResultIsRelatedTo", "f1:driver_lewis_hamilton"],
    ["f1:driver_lewis_hamilton", "f1:hasDriverGrandPrixResult", NEW_RESULT],
    [NEW_RESULT, "f1:grid", "7"],
    [NEW_RESULT, "f1:driverNumber", "44"],
    ["f1:grand_prix_2022_azerbaijan_grand_prix", "f1:hasGrandPrixResult", NEW_RESULT],
]

# Append all new rows to the training triples in a single concat instead of enlarging
# the frame one row at a time with .loc[len(df)].
triple_columns = ['Subject', 'Predicate', 'Object']
df_with_new_data = pd.concat(
    [
        pd.DataFrame(X_train, columns=triple_columns),
        pd.DataFrame(new_result_triples, columns=triple_columns),
    ],
    ignore_index=True,
)
print(df_with_new_data)

                                                  Subject  \
0          f1:race_result_1978_italian_grand_prix_laffite   
1                 f1:grand_prix_2022_abu_dhabi_grand_prix   
2             f1:race_result_1980_monaco_grand_prix_prost   
3                   f1:driver_standing_1998_olivier_panis   
4           f1:race_result_1983_british_grand_prix_jarier   
...                                                   ...   
326225  f1:race_result_2022_azerbaijan_grand_prix_hami...   
326226                           f1:driver_lewis_hamilton   
326227  f1:race_result_2022_azerbaijan_grand_prix_hami...   
326228  f1:race_result_2022_azerbaijan_grand_prix_hami...   
326229           f1:grand_prix_2022_azerbaijan_grand_prix   

                            Predicate  \
0                      f1:finalStatus   
1                         f1:isPartOf   
2       f1:grandPrixResultIsRelatedTo   
3                      f1:totalPoints   
4                      f1:finalStatus   
...                  

Fit the model on the triples that include the new, incomplete race result.

In [8]:
import tensorflow.contrib
from ampligraph.latent_features import save_model

# Fit the existing `model` object on the training triples plus the new, incomplete
# race result. `model` itself is refitted here; no separate model object is created.
# The recorded run took about 3 h 14 min.
model.fit(np.array(df_with_new_data))
# Save the refitted model under its own file name so the first model file is kept.
save_model(model, './embedding_model_new_data.pkl')

Average ComplEx Loss:   0.057507: 100%|██████████| 300/300 [3:14:23<00:00, 38.88s/epoch]  


Restore model if necessary

In [None]:
import tensorflow.contrib
from ampligraph.latent_features import restore_model

# Load the model that was fitted on the triples including the new, incomplete race result.
model_new_data = restore_model('./embedding_model_new_data.pkl')
# The cells below evaluate and score with `model`, so rebind it to the restored model.
# Without this they would keep using whatever `model` currently holds, e.g. the model
# restored from './embedding_model.pkl', which was fitted without the new race result.
model = model_new_data

Create the statements whose probability we want to evaluate. There are 20 drivers in total in the 2022 Azerbaijan Grand Prix, so Hamilton's finishing position can be anywhere from 1 to 20.

In [9]:
# One candidate statement per possible finishing position (1 through 20) for
# Hamilton's 2022 Azerbaijan Grand Prix result; positions are stored as strings.
result_statements = np.array([
    ['f1:race_result_2022_azerbaijan_grand_prix_hamilton', 'f1:driverPosition', str(position)]
    for position in range(1, 21)
])

Combine the triples of the graph with the candidate statements

In [10]:
# Stack the graph triples and the candidate statements, then drop duplicate rows by
# passing them through a set of tuples (the resulting row order is arbitrary).
stacked_rows = np.vstack((df_with_new_data, result_statements))
results_statements_filter = np.array(list(set(map(tuple, stacked_rows))))
print(len(results_statements_filter))
print(results_statements_filter)

326250
[['f1:race_result_1994_japanese_grand_prix_irvine' 'f1:lapsCompleted'
  '50']
 ['f1:race_result_1953_french_grand_prix_gerard' 'f1:driverPosition' '11']
 ['f1:season_1953' 'f1:hasResult' 'f1:driver_standing_1953_tony_crook']
 ...
 ['f1:team_participation_1974_jean-pierre_beltoise_brm' 'f1:hasTeam'
  'f1:team_brm']
 ['f1:race_result_1963_italian_grand_prix_cabral' 'f1:finalStatus'
  'Did not qualify']
 ['f1:qualifying_result_2004_bahrain_grand_prix_michael_schumacher'
  'f1:Q1Time' '1:30.139']]


In [12]:
from ampligraph.evaluation import evaluate_performance

# Rank each candidate finishing-position statement with `model`, filtering against the
# union of graph triples and candidate statements built in the previous cell.
# No `corruption_entities` argument is passed; the warning in the recorded output
# recommends setting it to a reasonably sized set of entities.
ranks_statements = evaluate_performance(
    result_statements,
    model=model,
    filter_triples=results_statements_filter,
    corrupt_side = 's+o',
    use_default_protocol=False,
    verbose=True)

    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.


    protocol. This may be unnecessary and will lead to a 'harder' task. Besides, it will lead to a much slower
    evaluation procedure. We recommended to set the 'corruption_entities' argument to a reasonably sized set
    of entities. The size of corruption_entities depends on your domain-specific task.
100%|██████████| 20/20 [00:04<00:00,  4.48it/s]


In [13]:
# Raw model score for each candidate statement, in the same order as result_statements.
scores = model.predict(result_statements)
print(scores)

[ 6.6163654   7.8188457   1.028274    6.0552807   4.988968    2.7382526
  6.4778404   1.3771429   5.653017    2.2260032   1.7458227   0.7691908
 -1.0101306  -0.66524905  1.690872    3.1159573   0.99193525 -1.9479747
  0.85791004 -0.00974441]


Present the result of predictions

In [14]:
from scipy.special import expit

# Map the raw scores through the logistic sigmoid.
probs = expit(scores)

# One record per candidate statement: readable label, rank, raw score and sigmoid value.
statement_labels = [' '.join(triple) for triple in result_statements]
prediction_rows = list(zip(statement_labels,
                           ranks_statements,
                           np.squeeze(scores),
                           np.squeeze(probs)))
prediction_table = pd.DataFrame(prediction_rows,
                                columns=['statement', 'rank', 'score', 'prob'])

# Displayed sorted by ascending probability (least likely statements first).
prediction_table.sort_values("prob")

Unnamed: 0,statement,rank,score,prob
17,f1:race_result_2022_azerbaijan_grand_prix_hami...,122005,-1.947975,0.124774
12,f1:race_result_2022_azerbaijan_grand_prix_hami...,116362,-1.010131,0.266954
13,f1:race_result_2022_azerbaijan_grand_prix_hami...,107985,-0.665249,0.339561
19,f1:race_result_2022_azerbaijan_grand_prix_hami...,72788,-0.009744,0.497564
11,f1:race_result_2022_azerbaijan_grand_prix_hami...,41536,0.769191,0.683346
18,f1:race_result_2022_azerbaijan_grand_prix_hami...,42165,0.85791,0.702224
16,f1:race_result_2022_azerbaijan_grand_prix_hami...,38889,0.991935,0.72947
2,f1:race_result_2022_azerbaijan_grand_prix_hami...,40258,1.028274,0.736581
7,f1:race_result_2022_azerbaijan_grand_prix_hami...,38289,1.377143,0.798532
14,f1:race_result_2022_azerbaijan_grand_prix_hami...,36152,1.690872,0.844339


As we can see, positions near the back of the field are less probable (Lewis Hamilton is indeed one of the best drivers), while positions near the front have a higher probability. The real result of this race can be seen at https://ergast.com/api/f1/2022/results?limit=200. Hamilton finished 4th, which has a probability of 0.997660 in our model.