In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# NOTE(review): presumably silences TensorFlow's native log output; it is set
# in its own cell before the main imports so it is in place when they load —
# confirm it is still needed.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [3]:
import logging
import matplotlib.pyplot as plt
import torch
from pykeen.pipeline import pipeline
from statistics import mean, median
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
from pyvis.network import Network
from ipywidgets import interact
import ipywidgets as widgets
from itertools import chain
from pathlib import Path
from rich import print
import plwordnet
import random
from pykeen.triples import TriplesFactory
from pykeen.datasets import EagerDataset

from utils import prepare_for_visualization

## Settings

In [4]:
import warnings

# Notebook-wide settings: silence warnings, fix float rendering, quiet PyKEEN.

# Ignore every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# Only CRITICAL records from the "pykeen" logger are let through.
logging.getLogger("pykeen").setLevel(logging.CRITICAL)

# Render floats in pandas output with five decimal places.
pd.set_option("display.float_format", "{:.5f}".format)

## Loading data

In [5]:
# Location of the plWordNet XML file, relative to the notebook's working directory.
WORDNET_PATH = Path("data/plwordnet/plwordnet_4_2.xml")

In [6]:
# Load the wordnet; the path is handed over as a plain string rather than a Path.
wn = plwordnet.load(str(WORDNET_PATH))
# `print` is rich's print (see the imports cell), not the builtin.
print(wn)

In [7]:
def filter_pos_synsets(synsets: dict, pos: str) -> list:
    """Return the ids of synsets whose lexical units all carry one part of speech.

    Args:
        synsets: Mapping of synset id to synset object. Each synset must expose
            a ``lexical_units`` iterable whose items have a ``pos`` attribute.
        pos: Part-of-speech tag to keep; compared with ``==`` against the
            ``pos`` of every lexical unit.

    Returns:
        The matching synset ids, in the mapping's iteration order. A synset
        without any lexical units is kept, because ``all`` over an empty
        iterable is true.
    """
    return [
        synset_id
        for synset_id, synset in synsets.items()
        # A generator (not a list) lets ``all`` stop at the first mismatch.
        if all(unit.pos == pos for unit in synset.lexical_units)
    ]

In [8]:
# Ids of the synsets whose lexical units are all tagged "NOUN".
filtered_synsets = filter_pos_synsets(wn.synsets, "NOUN")

In [9]:
# Upper bounds on how much of the wordnet is kept for the experiment.
MAX_SAMPLED_SYNSETS = 150000
MAX_SAMPLED_RELATIONS = 50
# Dedicated, seeded generator: the sample is identical on every run and neither
# depends on nor disturbs the global `random` state.
SAMPLING_SEED = 123
_sampling_rng = random.Random(SAMPLING_SEED)

# Cap the sample size at the population size; `sample` raises ValueError when
# asked for more items than are available.
sampled_synsets = set(
    _sampling_rng.sample(
        filtered_synsets, min(MAX_SAMPLED_SYNSETS, len(filtered_synsets))
    )
)
# The first relation-type keys in sorted order; slicing tolerates fewer than the cap.
sampled_relations = set(sorted(wn.relation_types.keys())[:MAX_SAMPLED_RELATIONS])

In [10]:
# Keep only the synset relations whose type and both endpoints were sampled,
# collected straight into an array of (head, relation, tail) rows.
sampled = np.array(
    [
        (head, relation, tail)
        for head, relation, tail in wn.synset_relations
        if relation in sampled_relations
        and head in sampled_synsets
        and tail in sampled_synsets
    ]
)

In [11]:
# Replace every id in the sampled rows by a readable string: synsets via their
# `str()` form, relations via the `name` of their relation type.
labeled_rows = []
for head_id, relation_id, tail_id in sampled:
    labeled_rows.append(
        (
            str(wn.synsets[head_id]),
            str(wn.relation_types[relation_id].name),
            str(wn.synsets[tail_id]),
        )
    )
labeled_sampled = np.array(labeled_rows)
# Peek at the first two labeled triples.
labeled_sampled[:2]

array([['{rozmowa.1 konwersacja.1 dialog.1 dyskurs.1}', 'hiperonimia',
        '{pogawędka.1 pogaduszka.1 pogwarka.1 pogaducha.1 rozmówka.1 gawędka.1 gawęda.3 gadu-gadu.1}'],
       ['{pogawędka.1 pogaduszka.1 pogwarka.1 pogaducha.1 rozmówka.1 gawędka.1 gawęda.3 gadu-gadu.1}',
        'hiponimia', '{rozmowa.1 konwersacja.1 dialog.1 dyskurs.1}']],
      dtype='<U271')

In [12]:
# Build a PyKEEN triples factory from the (head, relation, tail) label rows.
triples = TriplesFactory.from_labeled_triples(labeled_sampled)

In [13]:
# Split the triples with a 0.95 ratio into training and testing parts. The
# split is seeded so every run produces the same partition (the training
# pipeline further down is seeded as well).
SPLIT_SEED = 123
training, testing = triples.split(0.95, random_state=SPLIT_SEED)

In [14]:
# Bundle the two splits into the dataset object that is passed to the pipeline.
dataset = EagerDataset(training, testing)

In [15]:
# Directed graph of the training triples: one head -> tail edge per triple,
# with the relation label stored in the edge's "title" attribute.
# NOTE(review): a DiGraph keeps a single edge per (head, tail) pair, so when
# several relations link the same pair only one title survives — confirm that
# this is acceptable for the statistics and the visualization.
labeled_edges = (
    (head, tail, {"title": relation})
    for head, relation, tail in dataset.training.triples
)
g = nx.DiGraph()
g.add_edges_from(labeled_edges)

## Visualization

In [16]:
_list_nodes = list(g.nodes)


@interact
def visualize(
    nodes=widgets.SelectMultiple(
        # Reuse the list built above instead of materialising the nodes a
        # second time; the slice preselects the first node and degrades to an
        # empty selection when the graph has no nodes.
        options=_list_nodes,
        rows=10,
        value=_list_nodes[:1],
    ),
    k=[0, 1, 2, 3],
    toggle_physics=False,
):
    """Render the neighbourhood of the selected nodes with pyvis.

    Args:
        nodes: Nodes chosen in the multi-select widget; each one seeds a
            shortest-path search on ``g``.
        k: Hop limit, passed as ``cutoff`` to the shortest-path search.
        toggle_physics: Forwarded to ``Network.toggle_physics``.
    """
    # Union of all nodes found within `k` hops of any selected node.
    # Iterating the returned mapping yields its keys, i.e. the reached nodes.
    reachable = set(
        chain.from_iterable(
            nx.single_source_shortest_path_length(g, seed, cutoff=k)
            for seed in nodes
        )
    )
    subgraph = nx.subgraph_view(g, filter_node=lambda node: node in reachable)
    nt = Network(
        "500px", "500px", directed=True, notebook=True, cdn_resources="in_line"
    )
    nt.inherit_edge_colors(False)
    nt.from_nx(subgraph)
    nt.toggle_physics(toggle_physics)
    display(nt.show("basic.html"))

interactive(children=(SelectMultiple(description='nodes', index=(0,), options=('{1,2-benzochinon.1 o-benzochin…

## Exploratory data analysis (EDA)

In [18]:
# Size statistics per split: one row for "training", one for "testing".
data = []
for subset_name in ["training", "testing"]:
    subset = getattr(dataset, subset_name)
    # Deliberately not called `triples`: that name holds the triples factory
    # created earlier in the notebook, and rebinding it here would break a
    # re-run of the split cell.
    subset_triples = subset.triples
    data.append(
        {
            "subset": subset_name,
            "num_triples": len(subset_triples),
            # Columns 0 and 2 are head and tail; count distinct entities across both.
            "num_entities": len(np.unique(subset_triples[:, [0, 2]])),
            # Column 1 is the relation.
            "num_relations": len(np.unique(subset_triples[:, 1])),
        }
    )

pd.DataFrame(data)

Unnamed: 0,subset,num_triples,num_entities,num_relations
0,training,139432,66995,12
1,testing,7339,10358,9


In [19]:
# Structural statistics of the training graph.
# The undirected graph and the component sizes are computed once and reused;
# previously `to_undirected()` and the component search ran three times each.
undirected_g = g.to_undirected()
component_sizes = [
    len(component) for component in nx.connected_components(undirected_g)
]

metrics = {
    "n_connected_components": len(component_sizes),
    "mean_size_of_connected_components": mean(component_sizes),
    "median_size_of_connected_components": median(component_sizes),
    "density": nx.density(g),
    "number_of_selfloops": nx.number_of_selfloops(g),
    "average_clustering": nx.average_clustering(g),
}
pd.DataFrame({"training": metrics})

Unnamed: 0,training
average_clustering,0.00707
density,3e-05
mean_size_of_connected_components,24.95158
median_size_of_connected_components,2.0
n_connected_components,2685.0
number_of_selfloops,0.0


In [None]:
# Number of highest-degree nodes shown in the bar chart.
TOP_DEGREE_NODES = 20
# (node, degree) pairs, highest degree first, truncated to the top entries.
degree_sequence = sorted(g.degree(), key=lambda pair: pair[1], reverse=True)[
    :TOP_DEGREE_NODES
]

top_nodes, top_degrees = zip(*degree_sequence)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: horizontal bars, one per top node.
axes[0].set_title("Degree of nodes")
axes[0].barh(y=top_nodes, width=top_degrees)
axes[0].set_xlabel("Degree")
axes[0].set_ylabel("Node")

# Right panel: degree distribution over all nodes, on a log-scaled axis.
axes[1].set_title("Degree Histogram")
sns.histplot([degree for _, degree in g.degree()], ax=axes[1], log_scale=True)
axes[1].set_xlabel("Degree")
axes[1].set_ylabel("Number of nodes")
plt.tight_layout()

## Training

In [None]:
# Hyperparameters, grouped by the pipeline component they configure.
model_kwargs = {"embedding_dim": 32}
loss_kwargs = {"adversarial_temperature": 0.34, "margin": 9}
optimizer_kwargs = {"lr": 0.004}
negative_sampler_kwargs = {"num_negs_per_pos": 33}
training_kwargs = {
    "num_epochs": 25,
    "batch_size": 512,
    # No per-batch progress bar.
    "use_tqdm_batch": False,
}

# Train and evaluate TransE on the dataset built above, with a fixed seed.
result = pipeline(
    dataset=dataset,
    model="TransE",
    model_kwargs=model_kwargs,
    loss="nssa",
    loss_kwargs=loss_kwargs,
    optimizer="Adam",
    optimizer_kwargs=optimizer_kwargs,
    negative_sampler_kwargs=negative_sampler_kwargs,
    training_kwargs=training_kwargs,
    random_seed=123,
)

Training epochs on cpu:   0%|          | 0/25 [00:00<?, ?epoch/s]

In [None]:
# Store the pipeline result under results/plwordnet. The directory and any
# missing parents are created first; an already existing directory is accepted.
save_location = Path("results/plwordnet")
save_location.mkdir(exist_ok=True, parents=True)
result.save_to_directory(save_location)
# Show the directory contents as a quick sanity check.
print(f"Saved: {os.listdir(save_location)}")

## Metrics

In [None]:
# Plot the losses stored on the pipeline result.
result.plot_losses()
plt.show()

In [None]:
# Evaluation results converted to a table.
# NOTE(review): this rebinds `metrics`, which earlier in the notebook held the
# dict of graph statistics.
metrics = result.metric_results.to_df()

In [None]:
# Show only the rows with Side "both" and Type "realistic".
is_both_sides = metrics["Side"] == "both"
is_realistic = metrics["Type"] == "realistic"
metrics.loc[is_both_sides & is_realistic]

## Embeddings visualization

In [None]:
# Keep a handle on the trained model; the bare name on the last line makes the
# notebook display its repr.
model = result.model
model

In [None]:
# Fetch the embedding of every entity in one call.
# - `indices=None` asks the representation for all entities, so no index tensor
#   has to be created on the model's device.
# - `no_grad` avoids building an autograd graph for a pure lookup.
# - `.detach().cpu()` leaves a plain CPU tensor, so the later `.numpy()` call
#   also works when the model was trained on an accelerator.
with torch.no_grad():
    embeddings = result.model.entity_representations[0](indices=None).detach().cpu()

# Entity labels in id order, so labels[i] belongs to embeddings[i].
labels = np.array(
    [dataset.training.entity_id_to_label[i] for i in range(dataset.num_entities)]
)

In [None]:
# Upper bound on how many entity embeddings are exported for visualization.
MAX_VISUALIZED_EMBEDDINGS = 5000
EMBEDDING_SAMPLE_SEED = 123
# Seeded, local generator so the same entities are picked on every run. The
# sample size is capped because `sample` raises ValueError when asked for more
# items than exist. `range` is sampled directly; no list copy is needed.
sampled_embeddings_idx = random.Random(EMBEDDING_SAMPLE_SEED).sample(
    range(len(embeddings)), min(MAX_VISUALIZED_EMBEDDINGS, len(embeddings))
)

In [None]:
# Convert the embeddings to a numpy array, pick the sampled rows together with
# their labels, and hand both to the project helper along with the log directory.
embedding_matrix = embeddings.detach().numpy()
visualization_log_dir = Path("logs/plwordnet")
prepare_for_visualization(
    embedding_matrix[sampled_embeddings_idx],
    labels[sampled_embeddings_idx],
    visualization_log_dir,
)

In [None]:
# Start TensorBoard on the same log directory that was passed to
# prepare_for_visualization.
# NOTE(review): launched through `!`, so the cell presumably stays busy until
# TensorBoard is stopped — confirm this is the intended workflow.
!tensorboard --logdir=logs/plwordnet