In [1]:
import tensorflow
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

2026-01-24 03:13:55.136985: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-24 03:13:58.278879: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-24 03:14:40.773684: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
I0000 00:00:1769220903.725101   34928 gpu_device.cc:2020] Created device /device:GPU:0 with

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15995435078273779625
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14353956864
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 781328334471306805
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9"
 xla_global_id: 416903419]

In [2]:
from tqdm import tqdm
import numpy as np
import random
from collections import deque

class Edge:
    def __init__(self, node1, node2, relationship):
        self.node1 = node1
        self.node2 = node2
        self.relationship = relationship
    def __eq__(self, other):
        return self.node1.value == other.node1.value and self.node2.value == other.node2.value and self.relationship.name == other.relationship.name 
    def __hash__(self):
        return hash(self.node1.value + self.node2.value + self.relationship.name) 
    def __str__(self):
        return f"{str(self.node1)} {str(self.relationship)}  {str(self.node2)}"
class Relationship:
    """Hashable relationship with its name as ID"""
    def __init__(self, name):
        self.name = name.strip()
    def __eq__(self, n1):
        """Relationships are equal if they have the same name"""
        return self.name == n1.name 
    def __str__(self):
        return self.name
    def __hash__(self):
        """Relationship name identifies the relationship"""
        return hash(self.name) 
class Node:
    """Hashable node with its value as ID"""
    def __init__(self, value):
        self.value = value.strip()
    def __str__(self):
        return self.value
    """Nodes are equal if they have the same value"""
    def __eq__(self, n1):
        return self.value == n1.value
    """Node value identifies the node"""
    def __hash__(self):
        return hash(self.value) 
class Graph:
    """Graph stucture"""
    def __init__(self):
        """Initialize the graph"""
        #sets because we need efficient lookups (O(1) for set O(n) for list)
        self.nodes = set()
        self.relationships = set()
        self.edges = []
        self.node2id = {}
        self.rel2id = {}
    def add_triple(self, s_val, o_val, p_val):
        """Add a single triple to the graph."""
        #set is comprised of unique element, adding an existing element doesn't affect the set 
        self.nodes.add(s_val)
        self.nodes.add(o_val)
        self.relationships.add(p_val)
        self.edges.append(Edge(s_val, o_val, p_val))

    def finalize(self):
        """Build indices once after all data is loaded."""
        self.node2id = {node: i for i, node in enumerate(self.nodes)}
        self.rel2id = {rel: i for i, rel in enumerate(self.relationships)}
        self.adj = {node: [] for node in self.nodes}
        for e in self.edges:
            self.adj[e.node1].append((e.node2, e.relationship))
    def find_l1_paths(self, node):
        pass
    def paths2dataset(self):
        pass      
    def find_paths(self, s, t):
        """Optimized iterative path finding with max depth 6. This function is slow for our problem, moving implementation to C++ with paralellization."""
        all_paths = []
        # Queue stores: (current_node, current_path, visited_set)
        queue = deque([(s, [(s, None)], {s})])
        
        while queue:
            curr_node, path, visited = queue.popleft()
            
            # Stop if path exceeds max length
            if len(path) > 6:
                continue
            # if len(all_paths) == 3:
            #     continue
            for neighbor, rel in self.adj.get(curr_node, []):
                if neighbor == t:
                    # Path length must be > 1 (more than 2 nodes in list)
                    if len(path) > 2:
                        all_paths.append(path + [(neighbor, rel)])
                    continue # Found target, don't need to go deeper from here (simple path)
                
                if neighbor not in visited and len(path) < 4:
                    # Use set union for efficiency in creating the next visited set
                    queue.append((neighbor, path + [(neighbor, rel)], visited | {neighbor}))
        return all_paths
    def graph2dataset(self):
        """Convert data to numpy arrays compatible as neural net input"""
        pos_triples = np.array([
            [self.node2id[e.node1], self.node2id[e.node2], self.rel2id[e.relationship]] 
            for e in self.edges
        ], dtype=np.int32)
        num_pos = len(pos_triples)
        num_nodes = len(self.nodes)
        num_rels = len(self.relationships)
        # We over-sample by 10% to account for accidental "real" edges being picked
        oversample_factor = 1.1
        num_to_sample = int(num_pos * oversample_factor)

        neg_subs = np.random.randint(0, num_nodes, num_to_sample)
        neg_objs = np.random.randint(0, num_nodes, num_to_sample)
        neg_rels = np.random.randint(0, num_rels, num_to_sample)
        
        neg_triples = np.stack([neg_subs, neg_objs, neg_rels], axis=1)

        #Pruning: Filter out samples where sub == obj or triple exists in positive set

        def hash_triples(triples):
            # Maps (s, o, r) to a single unique integer
            return triples[:, 0] * (num_nodes * num_rels) + triples[:, 1] * num_rels + triples[:, 2]

        pos_hashes = set(hash_triples(pos_triples))
        neg_hashes = hash_triples(neg_triples)
        mask = np.array([(h not in pos_hashes) for h in neg_hashes])
        mask &= (neg_triples[:, 0] != neg_triples[:, 1])
        valid_negatives = neg_triples[mask][:num_pos]
        X = np.vstack([pos_triples, valid_negatives])
        y = np.concatenate([np.ones(num_pos), np.zeros(len(valid_negatives))])

        return X, y
            
g = Graph()  
l_bar = '{desc}: {percentage:.3f}%|'
r_bar = '| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ' '{rate_fmt}{postfix}]'
format = '{l_bar}{bar}{r_bar}'
data = open('../../SiaILP/data/lineage/test.txt').readlines()  
for line in tqdm(data, ncols=100, bar_format=format):
    s,p,o = line.split('\t')
    g.add_triple(Node(s), Node(o), Relationship(p))
g.finalize()
X,y = g.graph2dataset()

triples_file = open('triples.data', 'w+')
for i,item in enumerate(X):
    triples_file.write(f"{int(y[i])} {int(item[0])} {int(item[1])} {int(item[2])}\n")
triples_file.close()
    

100%|████████████████████████████████████████████████████| 253990/253990 [00:03<00:00, 72763.84it/s]


In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Dot, Normalization, Lambda, LSTM, Bidirectional, MaxPooling1D, Flatten, Reshape
from tensorflow.keras.models import Model



num_nodes = len(g.nodes) + 1 
num_rels = len(g.relationships) + 1

#single input for the triple [s, o, r]
triple_input = Input(shape=(3,), name="triple_input")
#slice the input
s_idx = Lambda(lambda x: x[:, 0])(triple_input)
o_idx = Lambda(lambda x: x[:, 1])(triple_input)
r_idx = Lambda(lambda x: x[:, 2])(triple_input)
#embedding Layers
node_emb_layer = Embedding(input_dim=num_nodes, output_dim=300, name="Node_Embedding")
rel_emb_layer = Embedding(input_dim=num_rels, output_dim=300, name="Rel_Embedding")
s_emb = node_emb_layer(s_idx)
o_emb = node_emb_layer(o_idx)
r_emb = rel_emb_layer(r_idx)
#reshape in order to fit LSTM
s_seq = Reshape((1, 300))(s_emb)
o_seq = Reshape((1, 300))(o_emb)
#add 2 layer bi-directional LSTM
lstm_layer_1 = Bidirectional(LSTM(150, return_sequences=True))
lstm_layer_2 = Bidirectional(LSTM(150, return_sequences=True))
#first LSTM layer
fst_lstm_mid = lstm_layer_1(s_seq)
scd_lstm_mid = lstm_layer_1(o_seq)
#second LSTM layer
fst_lstm_fin = lstm_layer_2(fst_lstm_mid)
scd_lstm_fin = lstm_layer_2(scd_lstm_mid)
#reduce max
pool_layer = MaxPooling1D(pool_size=1) 
fst_pooled = pool_layer(fst_lstm_fin)
scd_pooled = pool_layer(scd_lstm_fin)
#flatten
fst_final = Flatten()(fst_pooled)
scd_final = Flatten()(scd_pooled)
#merge node embeddings with DNN
nodes_concat = Concatenate()([s_emb, o_emb])
nodes_representation = Dense(300)(nodes_concat)
#normalization
rel_normalized = Normalization(axis=-1)(r_emb)
nodes_normalized = Normalization(axis=-1)(nodes_representation)
#edge probability
edge_probability = Dot(axes=-1)([rel_normalized, nodes_normalized])
edge_probability = Flatten()(edge_probability)
output = Dense(1, activation='sigmoid')(edge_probability)

m = Model(inputs = triple_input, outputs = output)
m.compile(loss='binary_crossentropy', metrics=['accuracy'])
m.fit(X,y,validation_split=0.1)



In [3]:
total = int(len(X)/2)
pos = X[ :total ]
neg = X[total : ]
pos_fl = [[t[0],t[2]] for t in pos]
neg_fl = [[t[0],t[2]] for t in neg]


In [4]:
import numpy as np
paths = [np.array([int(node) for node in s.strip().split(':')[1].split()]) for s in open('paths_output.txt', 'r').readlines()]


In [5]:
pos = [int(s.strip().split(':')[0].split()[1]) for s in open('paths_output.txt', 'r').readlines()]
rel = [int(s.strip().split(':')[0].split()[0]) for s in open('paths_output.txt', 'r').readlines()]

In [6]:
paths3 = [p for p in paths if len(p) == 3]
paths4 = [p for p in paths if len(p) == 4]
paths5 = [p for p in paths if len(p) == 5]
paths6 = [p for p in paths if len(p) == 6]
paths7 = [p for p in paths if len(p) == 7]
paths8 = [p for p in paths if len(p) == 8]
paths9 = [p for p in paths if len(p) == 9]
paths10 = [p for p in paths if len(p) == 10]


all_paths = {
    3 : paths3,
    4 : paths4,
    5 : paths5,
    6 : paths6,
    7 : paths7,
    8 : paths8,
    9 : paths9,
    10 : paths10,
}
for l in all_paths:
    print(len(all_paths[l]))

196
21653
175
2429
1574
16182
297849
1055030


In [7]:
X_conn_based_rels = []
X_conn_based_p1s = []
X_conn_based_p2s = []
X_conn_based_p3s = []
y_conn_based = []
for i in tqdm(range(int(len(paths) / 3))):
    is_pos = pos[i*3]
    rel_id = rel[i*3]
    p1 = np.array(paths[i*3])
    p2 = np.array(paths[i*3+1])
    p3 = np.array(paths[i*3+2])
    
    X_conn_based_p1s.append([p1])
    X_conn_based_p2s.append([p2])
    X_conn_based_p3s.append([p3])
    X_conn_based_rels.append([rel_id])
    y_conn_based.append(is_pos)
    


100%|██████████| 465029/465029 [00:02<00:00, 181915.12it/s]


In [10]:
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Dot, Normalization, Lambda, LSTM, Bidirectional, MaxPooling2D, Flatten, Reshape, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Ensure every element is a simple Python list or Numpy array of integers
def clean_sequence(data):
    # This converts everything to a list of numpy arrays
    return [np.array(i).flatten() for i in data]

X_p1_clean = clean_sequence(X_conn_based_p1s)
X_p2_clean = clean_sequence(X_conn_based_p2s)
X_p3_clean = clean_sequence(X_conn_based_p3s)

# Now pad. We specify maxlen to be safe.
max_len = max(len(s) for s in X_p1_clean + X_p2_clean + X_p3_clean)
print(max_len)
X_p1 = pad_sequences(X_p1_clean, maxlen=max_len, padding='post')
X_p2 = pad_sequences(X_p2_clean, maxlen=max_len, padding='post')
X_p3 = pad_sequences(X_p3_clean, maxlen=max_len, padding='post')

# 2. Ensure your relation and labels are numpy arrays

# Convert to numpy and force the (batch, 1) shape
X_rel = np.array(X_conn_based_rels).reshape(-1, 1)
Y = np.array(y_conn_based) # Ensure labels are also a numpy array

num_nodes = len(g.nodes) + 1 
num_rels = len(g.relationships) + 1



#input is not rectangular, we need separate inputs for each path
path1_inp = Input(shape=(10,), name="p1 input")
path2_inp = Input(shape=(10,), name="p2 input")
path3_inp = Input(shape=(10,), name="p3 input")
rel_inp = Input(shape=(1,), name="rel input")
#embedding Layers
node_embedding = Embedding(input_dim=num_nodes, output_dim=300, name="Node_Embedding")
rel_embedding = Embedding(input_dim=num_rels, output_dim=300, name="Rel_Embedding")
p1_emb = node_embedding(path1_inp)
p2_emb = node_embedding(path2_inp)
p3_emb = node_embedding(path3_inp)
r_emb = rel_embedding(rel_inp)
#reshape in order to fit LSTM
p1_seq = Reshape((10, 300))(p1_emb)
p2_seq = Reshape((10, 300))(p2_emb)
p3_seq = Reshape((10, 300))(p3_emb)
#add 2 layer bi-directional LSTM
lstm_layer_1 = Bidirectional(LSTM(150, return_sequences=True))
lstm_layer_2 = Bidirectional(LSTM(150, return_sequences=True))
#first LSTM layer
fst_lstm_mid = lstm_layer_1(p1_seq)
scd_lstm_mid = lstm_layer_1(p2_seq)
trd_lstm_mid = lstm_layer_1(p3_seq)
#second LSTM layer
fst_lstm_fin = lstm_layer_2(fst_lstm_mid)
scd_lstm_fin = lstm_layer_2(scd_lstm_mid)
trd_lstm_fin = lstm_layer_2(trd_lstm_mid)
#reduce max
pool_layer = GlobalMaxPooling1D() 
fst_pooled = pool_layer(fst_lstm_fin)
scd_pooled = pool_layer(scd_lstm_fin)
trd_pooled = pool_layer(trd_lstm_fin)
#flatten
fst_final = Flatten()(fst_pooled)
scd_final = Flatten()(scd_pooled)
trd_final = Flatten()(trd_pooled)
#merge node embeddings with DNN
nodes_concat = Concatenate()([fst_final, scd_final, trd_final])
nodes_representation = Dense(300)(nodes_concat)
#normalization
rel_normalized = Normalization(axis=-1)(r_emb)
nodes_normalized = Normalization(axis=-1)(nodes_representation)
#edge probability
edge_probability = Dot(axes=-1)([rel_normalized, nodes_normalized])
edge_probability = Flatten()(edge_probability)
output = Dense(1, activation='sigmoid')(edge_probability)

m = Model(inputs = [path1_inp,path2_inp,path3_inp,rel_inp], outputs = output)
m.compile(loss='binary_crossentropy', metrics=['accuracy'])
try:
    m.fit(
        x={
            "p1 input": X_p1,
            "p2 input": X_p2,
            "p3 input": X_p3,
            "rel input": X_conn_based_rels
        },
        y=Y,
        epochs=10,
        batch_size=32,
        validation_split=0.1
    )
except Exception as ex:
    print(ex)
    



10
Argument `validation_split` is only supported for tensors or NumPy arrays.Found incompatible type in the input: [<class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>, <class 'int'>,

In [None]:
print(X_conn_based_rels)