In [None]:
!pip install networkx
!pip install pykeen

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import pykeen
from collections import Counter
from pykeen.pipeline import pipeline
from pykeen.triples.leakage import Sealant
from pykeen.triples import TriplesFactory
from pykeen.hpo import hpo_pipeline
from google.colab import drive
# Mount Google Drive at /content/drive/ so the input CSVs stored there can be read.
# NOTE(review): later cells read "drive/MyDrive/..." as a relative path, which
# presumably relies on the working directory being /content — confirm.
drive.mount("/content/drive/")

In [3]:
# Labels given to the three tab-separated, header-less fields of both files.
cols_name = ["Regulator", "SearchregulatoryMechanism", "Target"]

# Larger relation set; loaded as-is.
# NOTE(review): unlike v1 below, this frame is not de-duplicated — confirm
# that is intended.
v2_weakly = pd.read_csv(
  "drive/MyDrive/GuidedResearch/relations_subgraph.csv",
  names=cols_name,
  header=None,
  sep="\t",
)

# v1 relation set with exact duplicate rows removed.
# NOTE(review): pandas' default NA parsing turns labels such as "NA" or
# "null" into NaN; verify no entity/relation label is affected.
v1 = (
  pd.read_csv(
    "drive/MyDrive/GuidedResearch/relations_v1.csv",
    names=cols_name,
    header=None,
    sep="\t",
  )
  .drop_duplicates()
)

In [4]:
def num_cc(df, cols_name):
  """Print a summary of the weakly connected components of a triple table.

  Every row of ``df`` becomes a directed edge from the ``cols_name[0]`` value
  to the ``cols_name[2]`` value, with the ``cols_name[1]`` value stored as the
  edge attribute ``relation``.  A ``DiGraph`` keeps a single edge per
  (source, target) pair, so repeated pairs collapse and the last relation
  wins; component membership is unaffected by that.

  Args:
    df: DataFrame holding one triple per row.
    cols_name: Sequence of three column labels in
      (source, relation, target) order.

  Returns:
    None.  Two lines are printed: the number of weakly connected components
    together with the size of the largest one, and a ``Counter`` mapping
    component size to the number of components of that size.  A frame with
    no rows is reported as having zero components instead of failing in
    ``max()`` on an empty list.
  """
  source_col, relation_col, target_col = cols_name[0], cols_name[1], cols_name[2]
  G = nx.DiGraph()
  # Walk the three columns in lockstep; this adds edges in row order without
  # materialising a Series per row the way ``iterrows`` does.
  for source, relation, target in zip(df[source_col], df[relation_col], df[target_col]):
    G.add_edge(source, target, relation=relation)
  component_sizes = [len(component) for component in nx.weakly_connected_components(G)]
  if not component_sizes:
    # No edges were added, so there is no "biggest" component to report.
    print("Number of weakly CC is 0.")
    print(Counter())
    return
  print(f"Number of weakly CC is {len(component_sizes)}. The biggest weakly CC is of size {max(component_sizes)}.")
  print(Counter(component_sizes))

In [5]:
# Weakly-connected-component summary of the de-duplicated v1 relations.
num_cc(v1, cols_name)

Number of weakly CC is 81. The biggest weakly CC is of size 1123.
Counter({2: 68, 3: 10, 1123: 1, 1: 1, 4: 1})


In [6]:
# Build the directed graph of v1: one edge per row, source -> target, with
# the middle column kept as the edge attribute "relation".
G = nx.DiGraph()
G.add_edges_from(
  (row[cols_name[0]], row[cols_name[2]], {"relation": row[cols_name[1]]})
  for _, row in v1.iterrows()
)

# Pick the smallest weakly connected component and show the degree of each of
# its nodes.
# NOTE(review): the recorded output is a single node with degree 2, which is
# what a self-loop yields in a DiGraph (one in-edge plus one out-edge) —
# presumably a self-regulation row in v1; verify.
CC = min(nx.weakly_connected_components(G), key=len)
G_sub = G.subgraph(CC)
dict(G_sub.degree())

{'GFRA1': 2}

In [7]:
# Anti-join: keep the rows of v2_weakly whose full triple does not occur in
# v1.  The indicator column added by merge marks those rows as "left_only";
# it is dropped again once the filter has been applied.
# NOTE(review): going by the recorded component sizes, test_df spans about
# 2218 distinct entities while v1 spans about 1294, so the test triples
# appear to include entities absent from v1 — confirm this is intended.
test_df = (
  v2_weakly
  .merge(v1, how='left', on=cols_name, indicator=True)
  .query('_merge == "left_only"')
  .drop(columns='_merge')
)
num_cc(test_df, cols_name)

Number of weakly CC is 21. The biggest weakly CC is of size 2172.
Counter({2: 15, 3: 4, 2172: 1, 4: 1})


In [8]:
# Turn the v1 rows into a labelled-triples array and wrap it in a PyKEEN
# triples factory.
labeled_triples = v1.to_numpy()
tf = TriplesFactory.from_labeled_triples(labeled_triples)

# Seeded 90/10 split into training and validation factories.
# NOTE(review): the recorded log reports group sizes [1062, 236] while train2
# ends up larger; presumably some triples are assigned to training before
# the ratio split — confirm against the PyKEEN split documentation.
split_ratios = [0.9, 0.1]
train2, valid = tf.split(split_ratios, random_state=1234)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [1062, 236]


In [9]:
# Component summary of the training split: convert the mapped triples back to
# a labelled frame and keep only the three label columns, which double as the
# (source, relation, target) column list for num_cc.
label_cols = ["head_label", "relation_label", "tail_label"]
train2_labeled = train2.tensor_to_df(tensor=train2.mapped_triples)[label_cols]
num_cc(train2_labeled, label_cols)

Number of weakly CC is 89. The biggest weakly CC is of size 1102.
Counter({2: 74, 3: 11, 1102: 1, 1: 1, 4: 1, 6: 1})


In [10]:
# Component summary of the validation split: convert the mapped triples back
# to a labelled frame and keep only the three label columns, which double as
# the (source, relation, target) column list for num_cc.
label_cols = ["head_label", "relation_label", "tail_label"]
valid_labeled = valid.tensor_to_df(tensor=valid.mapped_triples)[label_cols]
num_cc(valid_labeled, label_cols)

Number of weakly CC is 24. The biggest weakly CC is of size 183.
Counter({2: 22, 183: 1, 3: 1})


In [11]:
# Write every split as a tab-separated file without header or index column.
# NOTE(review): the file names are relative, so the files land in the current
# working directory rather than under the mounted Drive folder — confirm
# that is where they are expected.
csv_options = dict(sep="\t", header=False, index=False)
label_cols = ["head_label", "relation_label", "tail_label"]

v1.to_csv("relations_v1.csv", **csv_options)
# The factories hold ID-mapped triples; convert back to labels before export.
train2.tensor_to_df(tensor=train2.mapped_triples)[label_cols].to_csv("train2.txt", **csv_options)
valid.tensor_to_df(tensor=valid.mapped_triples)[label_cols].to_csv("valid.txt", **csv_options)
test_df.to_csv("test.txt", **csv_options)

In [12]:
# Row counts, in order: v2_weakly, v1, training split, validation split, test set.
# The factories expose their triple count directly, so there is no need to
# build a labelled DataFrame just to measure its length.
print(len(v2_weakly), len(v1), train2.num_triples, valid.num_triples, len(test_df))

6085 2356 2120 236 4074
