In [2]:
import os
import random
from time import time
import pandas as pd
import numpy as np
import networkx as nx
import graph_tool.all as gt
from motif_counts import *
from tqdm.auto import tqdm
from collections import defaultdict
from collections import defaultdict
from tqdm import tqdm
from joblib import Parallel, delayed
import pickle
#from tqdm.notebook import tqdm

In [3]:
# Load the edge table from a Feather file in the working directory.
# Later cells read its 'pre', 'post' and 'weight_m' columns.
df = pd.read_feather('mcns_fw_edge_comp.feather')

In [4]:
# --- Vertex IDs: every distinct value that appears in 'pre' or 'post' ---
labels = np.unique(np.concatenate((df['pre'].values, df['post'].values)))

# --- Directed graph plus a string vertex property holding the original ID ---
g = gt.Graph(directed=True)
v_label = g.new_vp("string")
g.vp["label"] = v_label

# --- One vertex per ID; label2v maps ID -> vertex descriptor ---
label2v = {node_id: g.add_vertex() for node_id in labels}
for node_id, vertex in label2v.items():
    v_label[vertex] = str(node_id)

# --- Edge property that will receive the 'weight_m' column ---
e_weight = g.new_ep("float")
g.ep["weight"] = e_weight

# --- Translate each table row into (source vertex, target vertex, weight) ---
edge_list = []
for pre_id, post_id, w in df[['pre', 'post', 'weight_m']].itertuples(index=False):
    edge_list.append((label2v[pre_id], label2v[post_id], float(w)))
g.add_edge_list(edge_list, eprops=[g.ep["weight"]])

# --- Drop self-loops: collect them first, then remove, so the graph is not
# --- modified while its edge iterator is being consumed ---
loops = []
for e in g.edges():
    if e.source() == e.target():
        loops.append(e)
for e in loops:
    g.remove_edge(e)

In [5]:
# Vertex indices 0 .. N-1 of the graph.
V = [*range(g.num_vertices())]

# Directed edges, as (source index, target index) pairs, whose weight is at
# least 5; lighter edges are left out of the motif search.
E = set()
for edge in g.edges():
    if g.ep['weight'][edge] >= 5:
        E.add((int(edge.source()), int(edge.target())))

In [None]:
def _process_edge(a, b, nbr_out, nbr_in, Eset):
    """Build every triplet that contains the directed edge ``a -> b``.

    For each third node ``c`` adjacent (in either direction) to ``a`` or ``b``,
    a ``Triplet([a, b, c])`` is created, the edge ``(a, b)`` is added, and each
    of the other five possible directed edges among the three nodes is added
    when it is present in ``Eset``.

    Parameters
    ----------
    a, b : hashable
        Source and target of the edge being processed.
    nbr_out, nbr_in : mapping of node -> set of nodes
        Out- and in-neighbour sets. Nodes without an entry are treated as
        having no neighbours.
    Eset : set of (src, dst) tuples
        All directed edges, used for membership tests.

    Returns
    -------
    set
        The ``Triplet`` objects built for this edge. ``Triplet`` comes from
        ``motif_counts``; it must be hashable because it is stored in a set.
    """
    # Use .get() rather than indexing: these mappings are shared by every
    # worker, and indexing a defaultdict with a missing key would insert a new
    # empty set into it (and a plain dict would raise KeyError).
    no_nbrs = frozenset()
    candidates = (
        nbr_out.get(a, no_nbrs)
        | nbr_in.get(a, no_nbrs)
        | nbr_out.get(b, no_nbrs)
        | nbr_in.get(b, no_nbrs)
    ) - {a, b}

    local = set()
    for c in candidates:
        t = Triplet([a, b, c])
        t.add_edge(a, b)
        # Check the remaining five directed edges among a, b and c.
        for src, dst in ((a, c), (c, a), (b, c), (c, b), (b, a)):
            if (src, dst) in Eset:
                t.add_edge(src, dst)
        local.add(t)

    return local

def collect_triplets_parallel(E, V, n_jobs=-1, prefer="threads"):
    """Collect the triplets found around every edge of ``E`` using joblib.

    Parameters
    ----------
    E : iterable of (src, dst) pairs
        Directed edges to process.
    V : any
        Not used by this function; kept in the signature for callers.
    n_jobs : int
        Forwarded to ``joblib.Parallel``.
    prefer : str
        Forwarded to ``joblib.Parallel``.

    Returns
    -------
    set
        Union of the sets returned by ``_process_edge`` for each edge.
    """
    # A set gives constant-time edge membership tests and removes duplicates.
    edge_set = set(E)

    # Directed adjacency, built once and shared by every task.
    successors = defaultdict(set)
    predecessors = defaultdict(set)
    for src, dst in edge_set:
        successors[src].add(dst)
        predecessors[dst].add(src)

    # One task per edge; tqdm wraps the task generator to show progress.
    run_edge = delayed(_process_edge)
    tasks = (
        run_edge(a, b, successors, predecessors, edge_set)
        for a, b in tqdm(edge_set, total=len(edge_set))
    )
    per_edge_sets = Parallel(n_jobs=n_jobs, prefer=prefer)(tasks)

    # Merge the per-edge results into a single set.
    tri = set()
    for found in per_edge_sets:
        tri |= found
    return tri


# The triplet search is long-running, so its result is cached on disk and the
# search is only executed when no cache file exists.
# Delete 'tri.pkl' to force a recomputation (e.g. after changing E).
if os.path.exists('tri.pkl'):
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook produced itself.
    with open('tri.pkl', 'rb') as f:
        tri = pickle.load(f)
else:
    tri = collect_triplets_parallel(E, V)
    with open('tri.pkl', 'wb') as f:
        # HIGHEST_PROTOCOL selects the most efficient pickle format available.
        pickle.dump(tri, f, protocol=pickle.HIGHEST_PROTOCOL)

 17%|████████████████████████████████████▉                                                                                                                                                                                         | 143867/863843 [12:43<37:59:59,  5.26it/s]IOStream.flush timed out
 26%|██████████████████████████████████████████████████████████▎                                                                                                                                                                     | 224816/863843 [18:32<39:04, 272.52it/s]IOStream.flush timed out
 41%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                     | 350597/863843 [29:35<34:00, 251.54it/s]

In [3]:
%time
with open('tri.pkl', 'rb') as f:
    tri = pickle.load(f)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 3.34 μs


In [27]:
# Flatten the Triplet objects into a table with one row per triplet.
# Values are accumulated column by column instead of as one dict per row,
# which keeps peak memory much lower for a collection of this size.
node0, node1, node2, edges = [], [], [], []
for t in tqdm(tri, total=len(tri)):
    # Positions 0, 1, 2 of t.vertices hold the triplet's three node IDs.
    v = t.vertices
    node0.append(v[0])
    node1.append(v[1])
    node2.append(v[2])
    # Store the triplet's edges as a plain list.
    edges.append(list(t.edges))

# NOTE: `df` is rebound here and no longer refers to the edge table read from
# 'mcns_fw_edge_comp.feather'; re-run the loading cell before re-running the
# graph-construction cell.
df = pd.DataFrame({
    'node0': node0,
    'node1': node1,
    'node2': node2,
    'edges': edges,
})
df.to_feather("motifcounts_triplets.feather")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168543875/168543875 [22:02<00:00, 127461.66it/s]
