In [1]:
import numpy as np
from rdflib import Graph
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from ampligraph.evaluation import mr_score, mrr_score, hits_at_n_score
import tensorflow as tf

In [2]:
# Load the ontology.
# The relative path uses a forward slash so it resolves on Windows as well as
# on Linux/macOS; a backslash is only a path separator on Windows.
g = Graph()
g.parse("ontology/basic_ontlogy_characters.rdf")

# Extract the triples: every statement of the graph becomes a
# (subject, predicate, object) tuple of plain strings.
triples = [(str(subj), str(pred), str(obj)) for subj, pred, obj in g]

In [5]:
# Turn the list of (subject, predicate, object) tuples into a numpy array
triples = np.array(triples)

# Train / validation split: 1000 triples are held out for validation.
# NOTE(review): per the AmpliGraph docs this helper keeps the held-out set free
# of entities/relations unseen in training -- confirm for the installed version.
X_train, X_valid = train_test_split_no_unseen(triples, test_size=1000)

# Model definition: ComplEx scoring with a fixed seed.
# NOTE(review): k and eta are AmpliGraph hyperparameters (embedding size and
# negatives per positive according to its docs) -- confirm before tuning.
model = ScoringBasedEmbeddingModel(k=100, eta=20, scoring_type='ComplEx', seed=0)

In [7]:
# Configure the optimizer, the loss function and the regularizer
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss = get_loss('multiclass_nll')
regularizer = get_regularizer('LP', {'p': 3, 'lambda': 1e-5})

# Compile the model
model.compile(optimizer=optimizer, loss=loss, entity_relation_regularizer=regularizer)

# Aim for roughly 50 batches per epoch. Clamp to 1 so that a training set
# with fewer than 50 triples does not produce batch_size=0.
batch_size = max(1, X_train.shape[0] // 50)

# Train the model. The returned History object is kept in `history` instead
# of being echoed as the cell's output.
history = model.fit(
    X_train,
    batch_size=batch_size,
    epochs=300,  # number of epochs
    verbose=True  # show training progress
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1fda63d2e30>

In [8]:
# Model evaluation: rank every validation triple, corrupting both the subject
# and the object side, with the train and validation triples passed as filter.
known_triples = {'train': X_train, 'test': X_valid}
ranks = model.evaluate(X_valid, use_filter=known_triples, corrupt_side='s,o', verbose=True)

# Ranking metrics derived from the ranks
mrr, mr = mrr_score(ranks), mr_score(ranks)
hits_10, hits_3, hits_1 = (hits_at_n_score(ranks, n=cutoff) for cutoff in (10, 3, 1))

# Report every metric with two decimals
for line_format, score in (
    ("MRR: %.2f", mrr),
    ("MR: %.2f", mr),
    ("Hits@10: %.2f", hits_10),
    ("Hits@3: %.2f", hits_3),
    ("Hits@1: %.2f", hits_1),
):
    print(line_format % score)

MRR: 0.13
MR: 165.86
Hits@10: 0.20
Hits@3: 0.12
Hits@1: 0.09


# Clustering and embedding visualization

In [None]:
!git clone https://github.com/wyldebeast-wunderliebe/incf.countryutils.git
!cd incf.countryutils && pip install .

In [9]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from ampligraph.discovery import find_clusters
from adjustText import adjust_text

In [10]:
# 1. Collect the distinct subjects of the graph (e.g. the characters).
#    Nodes that occur only as objects are not included.
#    Sorting pins the order: a bare set of strings iterates in a different
#    order on every interpreter run, which would reshuffle the rows fed to
#    PCA / KMeans and make cluster labels differ between runs.
entities = sorted({str(s) for s, _, _ in g})

# numpy array form of the entity list, as passed to the model and to find_clusters
entities_array = np.array(entities)

# 2. Look up the embedding of every entity (one row per entity, same order)
embedding_matrix = np.array(model.get_embeddings(entities_array))
entity_embeddings = dict(zip(entities, embedding_matrix))

# 3. Reduce the embeddings to 2-D with PCA; the seed keeps the projection
#    reproducible if scikit-learn selects a randomized SVD solver.
embeddings_2d = PCA(n_components=2, random_state=0).fit_transform(embedding_matrix)

# 4. Cluster the entity embeddings with KMeans (6 clusters, fixed seed)
clustering_algorithm = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)
clusters = find_clusters(entities_array, model, clustering_algorithm, mode='e')

In [11]:
import plotly.express as px
import pandas as pd

# One row per entity: its two PCA coordinates and a "cluster<N>" label
cluster_labels = "cluster" + pd.Series(clusters).astype(str)
plot_df = pd.DataFrame({
    "entity": entities,
    "embedding1": embeddings_2d[:, 0],
    "embedding2": embeddings_2d[:, 1],
    "cluster": cluster_labels,
})

# Interactive scatter plot coloured by cluster; hovering shows the entity name
axis_titles = {"embedding1": "Компонента 1 (PCA)", "embedding2": "Компонента 2 (PCA)"}
fig = px.scatter(
    plot_df,
    x="embedding1",
    y="embedding2",
    color="cluster",
    hover_name="entity",
    title="Кластеризация графовых эмбеддингов",
    labels=axis_titles,
    width=800,
    height=600,
)

# Legend title and position
fig.update_layout(legend_title_text="Кластеры", legend={"x": 1, "y": 1})

fig.show()