In [1]:
import os
import sys

import re
import pandas as pd
import networkx as nx
import tqdm
import random
import pickle
import json
import subprocess

# Determine the project root directory for relative imports
try:
    # This will work in scripts where __file__ is defined
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Assuming "src" is parallel to the script folder
    project_root = os.path.abspath(os.path.join(current_dir, ".."))
except NameError:
    # In notebooks __file__ is not defined: assume we're in notebooks/riziv_dataset/
    project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Local application imports

from main.ollama_utils import get_ollama_embedding


In [2]:
# Define the path to the BSARD dataset files
BSARD_data_path = os.path.join(project_root, "data", "BSARD_dataset")

In [3]:
with open(os.path.join(BSARD_data_path, "base_document_graph.pkl"), 'rb') as f:
    G = pickle.load(f)

In [4]:
def get_gpu_temperature():
    # Run nvidia-smi to query GPU temperature
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=temperature.gpu", "--format=csv,noheader"],
        stdout=subprocess.PIPE,
        text=True
    )
    # Parse and return temperature of GPU 0
    return int(result.stdout.strip().split('\n')[0])

def gpu_temperature_rest_time():
    if get_gpu_temperature() >= 80:
        return 100
    else:
        return 0

In [5]:
# Test embedding generation
embedding = get_ollama_embedding("Embedding generation test", )
print(embedding)
print(len(embedding["embedding"]))

{'embedding': [-1.0566672086715698, -0.9916018843650818, -0.1977226734161377, 0.44412606954574585, -0.07686168700456619, -0.7473859786987305, 0.6320770382881165, -0.3504481613636017, -0.2972276210784912, 0.16000185906887054, -0.5720860362052917, 0.17846082150936127, 0.44752582907676697, 0.3534890413284302, -0.43371647596359253, -0.8164119720458984, -1.5097856521606445, -0.2829397916793823, -0.09882917255163193, -1.298407793045044, 0.010910525918006897, -1.9046603441238403, 0.5234888792037964, -0.5442371368408203, 0.584852933883667, 0.22193007171154022, -0.3788483738899231, -0.46844640374183655, 0.2362867146730423, -0.6325748562812805, 0.6420486569404602, -1.0469647645950317, 0.4750227928161621, -1.4116549491882324, -0.5067553520202637, -0.05261102318763733, -0.37181004881858826, -0.09439092874526978, -1.6463353633880615, -0.0347198061645031, -0.3107561767101288, 0.013417135924100876, 0.7972341179847717, -1.205283761024475, 0.9445411562919617, -0.7153135538101196, 0.156986802816391, -0.

In [6]:
from tqdm.auto import tqdm      # auto ➜ usa notebook widget si existe

# 1) Pre‑contamos los nodos "Article" para que tqdm conozca el total
article_nodes = [
    (n, d) for n, d in G.nodes(data=True) if d.get("node_type") == "Article"
]

# 2) Barra de progreso: un tick por embedding generado
for node, data in tqdm(article_nodes,
                       desc="Generating embeddings",
                       total=len(article_nodes)):      # opcional: tqdm lo infiere
    data["embedding"] = get_ollama_embedding(data["article_text"])["embedding"]


Generating embeddings:   0%|          | 0/22633 [00:00<?, ?it/s]

In [7]:
# with open(os.path.join(BSARD_data_path, 'base_document_graph_wsem.pkl'), 'wb') as f:
#    pickle.dump(G, f)

In [8]:
# with open(os.path.join(BSARD_data_path, "base_document_graph_wsem.pkl"), 'rb') as f:
#    G = pickle.load(f)

In [9]:
from tqdm.auto import tqdm
import numpy as np

# --- 1) Libro: promedio de embeddings de Artículo -------------------------

# 1.1. recoge todos los nodos tipo Book
book_nodes = [n for n, d in G.nodes(data=True) if d.get("node_type") == "Book"]

for book in tqdm(book_nodes, desc="Aggregating Book embeddings"):
    # 1.2. encuentra los artículos hijos: sucesores en el grafo
    child_articles = [
        nbr for nbr in G[book]
        if G.nodes[nbr].get("node_type") == "Article"
    ]
    # 1.3. extrae sus embeddings
    embeddings = [
        np.array(G.nodes[art]["embedding"])
        for art in child_articles
        if "embedding" in G.nodes[art]
    ]
    if embeddings:
        # 1.4. calcula el promedio y asigna al nodo Book
        avg = np.mean(embeddings, axis=0)
        G.nodes[book]["embedding"] = avg.tolist()

# --- 2) Act: promedio de embeddings de Book -------------------------------

act_nodes = [n for n, d in G.nodes(data=True) if d.get("node_type") == "Act"]

for act in tqdm(act_nodes, desc="Aggregating Act embeddings"):
    # hijos son nodos Book a los que apunta con relación CONTAINS
    child_books = [
        nbr for nbr in G[act]
        if G.nodes[nbr].get("node_type") == "Book"
    ]
    embeddings = [
        np.array(G.nodes[bk]["embedding"])
        for bk in child_books
        if "embedding" in G.nodes[bk]
    ]
    if embeddings:
        avg = np.mean(embeddings, axis=0)
        G.nodes[act]["embedding"] = avg.tolist()


Aggregating Book embeddings:   0%|          | 0/150 [00:00<?, ?it/s]

Aggregating Act embeddings:   0%|          | 0/35 [00:00<?, ?it/s]

In [11]:
with open(os.path.join(BSARD_data_path, 'base_document_graph_wsem.pkl'), 'wb') as f:
    pickle.dump(G, f)