In [3]:
import os
import sys

import re
import pandas as pd
import networkx as nx
import tqdm
import random
import pickle
import json
import subprocess

# Resolve the project root so that packages under "<project_root>/src" are importable.
try:
    # Plain scripts define __file__; start from the folder that contains the script.
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Notebooks have no __file__: assume the working directory sits two levels
    # below the project root (e.g. notebooks/riziv_dataset/).
    project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
else:
    # Script case: "src" is assumed to live next to the script's folder.
    project_root = os.path.abspath(os.path.join(current_dir, ".."))

# Register the source folder once; re-running the cell must not add duplicates.
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Local application imports

from main.ollama_utils import get_ollama_embedding


In [None]:
# Directory that holds the BSARD dataset files: <project_root>/data/BSARD_dataset
BSARD_data_path = os.path.join(project_root, "data", "BSARD_dataset")

In [None]:
# Deserialize the object stored in base_document_graph.pkl and bind it to G
# (later cells iterate it as a graph through G.nodes).
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only
# load a .pkl file that was produced by a trusted source.
with open(os.path.join(BSARD_data_path, "base_document_graph.pkl"), 'rb') as f:
    G = pickle.load(f)

In [None]:
def get_gpu_temperature(gpu_index=0):
    """Return the temperature that ``nvidia-smi`` reports for one GPU.

    Runs ``nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader`` and
    parses the output line at position ``gpu_index``.

    Args:
        gpu_index: Position of the reading to return within the command's
            output lines. Defaults to 0, i.e. the first reported GPU.

    Returns:
        The selected reading as an ``int`` (degrees Celsius according to the
        nvidia-smi documentation).

    Raises:
        FileNotFoundError: ``nvidia-smi`` is not available on PATH.
        RuntimeError: ``nvidia-smi`` exited with a non-zero status.
        IndexError: the output has no line at ``gpu_index``.
        ValueError: the selected line is not an integer (e.g. empty output).
    """
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=temperature.gpu", "--format=csv,noheader"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # Without this check a failed call leaves stdout empty and the error shows
    # up as an unrelated "invalid literal for int()" further down.
    if result.returncode != 0:
        details = (result.stderr or "").strip()
        raise RuntimeError(
            f"nvidia-smi exited with status {result.returncode}: {details}"
        )
    readings = result.stdout.strip().split('\n')
    return int(readings[gpu_index])

def gpu_temperature_rest_time(threshold=80, rest_time=100):
    """Return how long to pause, based on the current GPU temperature.

    Args:
        threshold: Temperature at or above which a pause is requested.
            Defaults to 80.
        rest_time: Value returned when the GPU is at or above ``threshold``.
            Defaults to 100.
            NOTE(review): the unit is whatever the caller sleeps in
            (presumably seconds) — confirm against the call site.

    Returns:
        ``rest_time`` when ``get_gpu_temperature() >= threshold``, otherwise 0.
    """
    return rest_time if get_gpu_temperature() >= threshold else 0

In [None]:
# Smoke-test the embedding backend on one short string before the full run
embedding = get_ollama_embedding("Embedding generation test")
# Show only the first few components: printing the whole response would flood
# the cell output with the full vector.
print(embedding["embedding"][:5])
print(len(embedding["embedding"]))

In [None]:
# Generate and assign embeddings for each chunk in the graph (Expected execution time ~ 6-7min)
# The progress bar counts every node visited; only nodes whose "node_type" is
# "Article" trigger an embedding call, and the vector is stored under the
# node's "embedding" key.
for node, data in tqdm.tqdm(G.nodes(data=True), desc="Embedding articles"):
    if data.get("node_type") == "Article":
        data["embedding"] = get_ollama_embedding(data["article_text"])["embedding"]