Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Eval node degree #42

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,479 changes: 3,479 additions & 0 deletions notebooks/07_03_01_lp_graph_stats.ipynb

Large diffs are not rendered by default.

2,915 changes: 2,915 additions & 0 deletions notebooks/07_04_01_eval_lp_node_degree_effect.ipynb

Large diffs are not rendered by default.

2,112 changes: 2,112 additions & 0 deletions notebooks/07_04_03_eval_lp_node_degree_effect-plot.ipynb

Large diffs are not rendered by default.

Binary file added notebooks/data/imgs/eval_split_entity_view.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added notebooks/data/imgs/eval_split_triples_view.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
39 changes: 39 additions & 0 deletions notebooks/nb_utils/artifact_registry.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
model_registry_root_dir = "/home/jovyan/workbench-shared-folder/bioblp/models"#"../models"
#test_set_lists = ['test', 'valid']

[biokgb_graph]
biokgb_data_root_dir = "../data/biokgb"
graph_dir = "graph" # relative to biokgb_data_root_dir
properties_dir = "properties" # relative to biokgb_data_root_dir

[biokgb_graph.data_splits]
train = 'biokg.links-train.csv'
valid = 'biokg.links-valid.csv' # relative to biokgb_graph.graph_dir
test = 'biokg.links-test.csv'
dummy = 'biokg.links-dummy.csv'

[biokgb_graph.properties]

[biokgb_graph.properties.entity_type_metadata_paths]
disease = "biokg.metadata.disease.tsv" # relative to biokgb_properties.property_dir
protein = "biokg.metadata.protein.tsv"
drug = "biokg.metadata.drug.tsv"
pathway = "biokg.metadata.pathway.tsv"
genetic_disorder = "biokg.metadata.genetic_disorder.tsv"


[biokgb_graph.properties.entity_attribute_paths]
disease = "biokg_meshid_to_descr_name.tsv" # relative to biokgb_properties.property_dir
protein = "protein_embeddings_full_24_12_protid_idx_mapping.json"
drug = "biokg_molecule_embeddings_idx_mapping.json"
# drug = "biokg_molecule_embeddings.pt"

# model registry
[kge_models.registered_models] # model directory path relative to model_registry_root_dir
rotate-otxtubeb = "rotate-otxtubeb"
bioblpd-38uz9fjs-16-03-2023 = "bioblpd-38uz9fjs-16-03-2023"
bioblpd-38uz9fjs = "bioblpd-38uz9fjs"
bioblpp-cttc3ucm = "bioblpp-cttc3ucm"
bioblpm-5rxz3k57 = "bioblpm-5rxz3k57"
rotate-bioblp-m-1r75g9na = "rotate-bioblp-m-1r75g9na"
rotate-bioblp-p-4o6x9h96 = "rotate-bioblp-p-4o6x9h96"
152 changes: 152 additions & 0 deletions notebooks/nb_utils/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import abc
import toml
import json

from enum import Enum
from dataclasses import dataclass, field
from typing import List, Dict
import pandas as pd
from pathlib import Path
from pykeen.triples import TriplesFactory


DISEASE = 'disease'
DRUG = 'drug'
PROTEIN = 'protein'


ARTIFACT_REGISTRY_TOML_PATH = "nb_utils/artifact_registry.toml"


class EntityType(str, Enum):
disease = DISEASE
drug = DRUG
protein = PROTEIN


def load_toml(toml_path: str) -> dict:
toml_path = Path(toml_path)
config = {}
with open(toml_path, "r") as f:
config = toml.load(f)

return config


@dataclass
class ModelRegistryConfig:
#model_base_dir: str
registered_model_list: List[str]
registered_model_paths: Dict
registered_model_training_triples_paths: Dict

@classmethod
def from_toml(cls, toml_path):

def cfg_from_dict(cfg_toml_dict):
model_root_dir = Path(cfg_toml_dict.get("model_registry_root_dir"))
registered_model_list = list(cfg_toml_dict.get("kge_models").get("registered_models"))
registered_model_paths = {v: model_root_dir.joinpath(f"{v}") for v in registered_model_list}
registered_model_training_triples_paths = {k: v.joinpath("training_triples") for k,v in registered_model_paths.items()}
cfg = {}
cfg.update({"registered_model_list": registered_model_list})
cfg.update({"registered_model_paths": registered_model_paths})
cfg.update({"registered_model_training_triples_paths": registered_model_training_triples_paths})
return cfg

cfg_toml_dict = load_toml(toml_path)
cfg = cfg_from_dict(cfg_toml_dict)

return cls(**cfg)


# todo: delete
@dataclass
class GraphRegistryConfig:
graph_root_dir = str
biokgb_data_splits: Dict
biokgb_entity_type_metadata_paths: Dict
biokgb_entity_attribute_paths: Dict

@classmethod
def from_toml(cls, toml_path):
cfg_toml_dict = load_toml(toml_path)

biokgb_graph_cfg = cfg_toml_dict.get("biokgb_graph")
biokgb_data_root_dir = Path(biokgb_graph_cfg.get("biokgb_data_root_dir"))
biokgb_graph_dir = biokgb_data_root_dir.joinpath(biokgb_graph_cfg.get("graph_dir"))
biokgb_properties_dir = biokgb_data_root_dir.joinpath(biokgb_graph_cfg.get("properties_dir"))
biokgb_properties = biokgb_graph_cfg.get("properties")

biokgb_data_splits = {k: biokgb_graph_dir.joinpath(v) for k,v in biokgb_graph_cfg.get("data_splits").items()}

biokgb_entity_type_metadata_paths = {k: biokgb_properties_dir.joinpath(v) for k,v in biokgb_properties.get("entity_type_metadata_paths").items()}
biokgb_entity_attribute_paths = {k: biokgb_properties_dir.joinpath(v) for k,v in biokgb_properties.get("entity_attribute_paths").items()}

cfg = {}
cfg.update({"biokgb_data_splits": biokgb_data_splits})
cfg.update({"biokgb_entity_type_metadata_paths": biokgb_entity_type_metadata_paths})
cfg.update({"biokgb_entity_attribute_paths": biokgb_entity_attribute_paths})
return cls(**cfg)



@dataclass
class EvaluationConfig:
model_name: str
model_cfg: Dict
test_set_list: List[str]
test_ent_w_attribute_of_type: Enum

def get_model_config(self, artifact_toml_path=ARTIFACT_REGISTRY_TOML_PATH):
model_registry_cfg = ModelRegistryConfig.from_toml(artifact_toml_path)
registered_models = model_registry_cfg.registered_model_list
if self.model_name not in registered_models:
raise ValueError(f"{self.model_name} not in {registered_models}")

model_path = model_registry_cfg.registered_model_paths[self.model_name]
training_triples_path = model_path.joinpath('training_triples')

self.model_cfg.update({'model_name': self.model_name})
self.model_cfg.update({'model_path': model_path})
self.model_cfg.update({'training_triples_path': training_triples_path})
return self

def update_test_set_dict(self, test_set_list):
if not test_set_triples:
test_set_list = default_test_set_list()
self.test_set_list = test_set_list


def get_default_biokg_test_sets(
artifact_registry_toml_path=ARTIFACT_REGISTRY_TOML_PATH
):
toml_cfg = load_toml(artifact_registry_toml_path)
default_test_set_list = toml_cfg.get('test_set_lists')
return default_test_set_list


def create_entity_attr_aware_test_sets(entity_type_w_attribute: str,
graph_cfg: GraphRegistryConfig,
train: TriplesFactory,
test: TriplesFactory):
entity_metadata_path = graph_cfg.biokgb_entity_type_metadata_paths.get(entity_type_w_attribute)
entity_attribute_path = graph_cfg.biokgb_entity_attribute_paths.get(entity_type_w_attribute)

# create a subset of biokg entities of type Protein
entities = pd.read_csv(entity_metadata_path, sep="\t", names=[entity_type_w_attribute, "rel", "node_type"])
entity_set = set(entities[entity_type_w_attribute].values)
print(f"# {entity_type_w_attribute} entities in larger biokg (pre-benchmark removal): {len(entity_set)}")

# create a set of protein entities for which we have text descriptions
entity_w_attr_df = pd.read_json(entity_attribute_path, orient="index").reset_index()#, sep="\t", header=0, names=["protein", "attr"])
entity_w_attr_df.rename(columns={"index": entity_type_w_attribute, 0: "attr"}, inplace=True)
entity_attr_set = set(entity_w_attr_df[entity_type_w_attribute].values)
print(f"# {entity_type_w_attribute} entities for which we have attributes: {len(entity_attr_set)}")

test_triples_incl_ent_prop, test_triples_excl_ent_prop = split_test_triples_conditioned_on_ent_property(train_triples=train,
typed_ent_set=entity_set,
typed_ent_with_prop_set=entity_attr_set,
test_triples=test)

return test_triples_incl_ent_prop, test_triples_excl_ent_prop
Loading