https://www.bentoml.com/blog/a-guide-to-open-source-embedding-models

In [1]:
import os
import sys
import json
import pickle
import requests

import networkx as nx
import numpy as np
import pandas as pd
import requests

import pynvml
import time
import subprocess

# Add "src" path to Python path
sys.path.append(os.path.abspath("../src"))

# Import custom embedding function
from embedding_utils import get_ollama_embedding



In [2]:
def get_gpu_temperature():
    """
    Return the current temperature of GPU 0 as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one line per GPU; only the first line is used.

    Returns:
        int: Temperature reading of the first GPU (nvidia-smi reports it in °C).

    Raises:
        FileNotFoundError: If the ``nvidia-smi`` executable cannot be found.
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a non-zero status.
        ValueError: If no reading is returned or the reading is not an integer.
    """
    # check=True surfaces tool/driver failures directly instead of letting an
    # empty stdout reach int() and fail with an unrelated-looking error.
    result = subprocess.run(
        ["nvidia-smi", "--query-gpu=temperature.gpu", "--format=csv,noheader"],
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    readings = result.stdout.strip().splitlines()
    if not readings:
        raise ValueError("nvidia-smi returned no temperature reading.")
    # Parse and return temperature of GPU 0
    return int(readings[0].strip())

def gpu_temperature_rest_time(threshold=80, rest_seconds=100):
    """
    Return how many seconds to pause so the GPU can cool down.

    Args:
        threshold: Temperature reading at or above which a pause is requested.
        rest_seconds: Pause length returned when the threshold is reached.

    Returns:
        ``rest_seconds`` if the current GPU temperature is >= ``threshold``,
        otherwise 0 (no pause needed).
    """
    return rest_seconds if get_gpu_temperature() >= threshold else 0

In [3]:
# Base graph (pickled object).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../data/base_hybrid_graph.pkl", "rb") as graph_file:
    G = pickle.load(graph_file)

# Article corpus (UTF-8 JSON)
with open("../data/Multi-hop_RAG_dataset/corpus.json", "r", encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

# Raw entities; their "summary" field is used for the summary embeddings
with open('../data/multihop_dataset_raw_entities.pkl', 'rb') as entities_file:
    raw_entities = pickle.load(entities_file)

# Tabular view of the corpus
corpus_as_df = pd.DataFrame(corpus)

In [4]:
# Smoke test: embed one short sentence, then show the raw response
# followed by the length of the returned vector
embedding = get_ollama_embedding("Embedding generation test")
print(embedding)
print(len(embedding["embedding"]))

{'embedding': [-1.0566672086715698, -0.9916018843650818, -0.1977226734161377, 0.44412606954574585, -0.07686168700456619, -0.7473859786987305, 0.6320770382881165, -0.3504481613636017, -0.2972276210784912, 0.16000185906887054, -0.5720860362052917, 0.17846082150936127, 0.44752582907676697, 0.3534890413284302, -0.43371647596359253, -0.8164119720458984, -1.5097856521606445, -0.2829397916793823, -0.09882917255163193, -1.298407793045044, 0.010910525918006897, -1.9046603441238403, 0.5234888792037964, -0.5442371368408203, 0.584852933883667, 0.22193007171154022, -0.3788483738899231, -0.46844640374183655, 0.2362867146730423, -0.6325748562812805, 0.6420486569404602, -1.0469647645950317, 0.4750227928161621, -1.4116549491882324, -0.5067553520202637, -0.05261102318763733, -0.37181004881858826, -0.09439092874526978, -1.6463353633880615, -0.0347198061645031, -0.3107561767101288, 0.013417135924100876, 0.7972341179847717, -1.205283761024475, 0.9445411562919617, -0.7153135538101196, 0.156986802816391, -0.

In [6]:
# Embed the text of every chunk node and store the vector on the node itself
# (Expected execution time ~ 6-7min)
for chunk_id, attrs in G.nodes(data=True):
    # Skip anything that is not a chunk or that carries no text to embed
    if attrs.get("type") != "chunk" or "text" not in attrs:
        continue
    attrs["embedding"] = get_ollama_embedding(attrs["text"])["embedding"]

In [5]:
# Embed the summary of every article node and store it as the node's embedding
for article_id, attrs in G.nodes(data=True):
    if attrs.get("type") != "article":
        continue
    summary_text = raw_entities[article_id]["summary"]
    attrs["embedding"] = get_ollama_embedding(summary_text)["embedding"]

'Amazon is holding an 11-day shopping event, beginning November 17th and continuing through Cyber Monday, November 27th, with both Black Friday and Cyber Monday deals available. The article highlights deals across various categories including Amazon devices (Echo, Fire TV, Kindle), Apple products, TVs, laptops, headphones, tablets, gaming, speakers, vacuums, kitchen appliances, smart home devices, fitness tech, beauty tech, drones, cameras, Lego, and gift cards. \n\nSpecific deals mentioned include the Echo Show for under $40, the 10th generation 64GB iPad for $349, a 65-inch Fire TV at a record low price, the Microsoft Surface Laptop Go 3 for $599.99, Bose QuietComfort 45 headphones for under $200, and the Meta Quest 2 with a $50 credit. Many deals are marked as all-time lows (🔥) or exclusive to Prime members (📨). The article will be updated continuously as new deals become available.'

In [5]:
#with open("../data/graph_w_embeddings.pkl", "rb") as f:
#    G = pickle.load(f)

For this first "test version" of the graph, the base semantic node embedding in the graph will be computed in the following manner:

- __*Chunk embeddings*__: Embedding of the chunk itself
- __*Article embeddings*__: Average of the article summary embedding and the average of all chunk embeddings within the article
- __*Author embeddings*__: Average of all article embeddings of articles written by the author
- __*Category embeddings*__: Average of all article embeddings comprised within the category
- __*Source embedding*__: Average of all article embeddings published by the source
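- __*Entity_LOC embedding*__: Embedding of a one-sentence, LLM-generated context if the entity is judged widely recognizable; otherwise a zero vector
- __*Entity_PER embedding*__: Same rule as for Entity_LOC
- __*Entity_ORG embedding*__: Same rule as for Entity_LOC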

### Entity embeddings

In [None]:
# Collect every entity node (PER, LOC and ORG) as a [name, type] pair.
# Node ids have the form "<name>_<TYPE>"; splitting on the LAST underscore keeps
# entity names that themselves contain underscores intact.
ENTITY_NODE_TYPES = ("entity_PER", "entity_LOC", "entity_ORG")
entity_nodes = [
    node.rsplit("_", 1)
    for node, node_data in G.nodes(data=True)
    if node_data.get("type") in ENTITY_NODE_TYPES
]

ent_type_mapping = {"ORG": "Organization", "LOC": "Location", "PER": "Person"}

# Dimension of the zero vector assigned to non-recognizable entities.
# Must match the dimension returned by get_ollama_embedding (1024 in the recorded run).
EMBEDDING_DIM = 1024

# Loop-invariant request settings for the local Ollama chat endpoint
url = "http://localhost:11434/api/chat"
headers = {"Content-Type": "application/json"}

# JSON schema the model answer has to follow. "required" is part of the schema
# itself; placed outside of it, it is not part of the schema sent as "format".
response_schema = {
    "type": "object",
    "properties": {
        "entity_name": {"type": "string"},
        "entity_type": {"type": "string"},
        "widely_recognizable": {"type": "boolean"},
        "short_context": {"type": "string"},
    },
    "required": ["entity_name", "entity_type", "widely_recognizable", "short_context"],
}

# Maps node id -> parsed model answer (plus the embedding that was assigned)
response_tracking = {}

for i, entity_node in enumerate(entity_nodes):
    # Get entity name and type, and rebuild the id of the node in the graph
    entity_name, entity_type = entity_node
    node_id = f"{entity_name}_{entity_type}"

    # Print progress
    print(f"Processing {entity_name} ({i+1}/{len(entity_nodes)} - {((i+1)/len(entity_nodes)*100):.1f}%)")

    # Check GPU temperature and wait if necessary
    while gpu_temperature_rest_time() != 0:
        print(f"== Pausing code execution to cool down GPU... ({get_gpu_temperature()}) ==")
        time.sleep(gpu_temperature_rest_time())

    # The field names mentioned in the prompt match the keys of response_schema
    prompt = f"""
    {entity_name} is an entity of type {entity_type} extracted from a text by an NER algorithm. 
    Your task is to assess withouth further context whether this entity is widely recognizable and return your answer in a JSON format. 
    In this JSON first provide again {entity_name} in the field "entity_name" and then {ent_type_mapping[entity_type]}  in the field "entity_type"
    Finally, in the filed "widely_recognizable" indicate with a boolean value your decision about whether this is a widely recognizable entity without further context.
    Additionally, if you determine this entity is widely recognizable, provide one short sentence of context about this entity in the field "short_context", otherwise leave it blank.
    As a final remark, consider that these entites has been extracted from a corpus of news articles covering topics in Science, Business, Entertainment, Sports, Technology and Health.
    It is very unlikely that widely recognizable entities of the type "organization" or "person" unrelated to these domains appear in the corpus.
    """

    data = {
        "model": "gemma3:27b-it-q8_0",
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "format": response_schema,
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))
    # Fail loudly on HTTP errors instead of hitting a KeyError on the body below
    response.raise_for_status()

    # Parse the HTTP body once; the model answer is itself a JSON document
    content = response.json()['message']['content']
    response_json = json.loads(content)

    # Echo the raw model answer
    print(content)

    # If the entity is widely recognizable, use the short context to generate an embedding
    if response_json.get("widely_recognizable"):
        entity_base_embedding = get_ollama_embedding(response_json["short_context"])["embedding"]
        response_json["embedding"] = entity_base_embedding
        # Assign the embedding to the entity node
        G.nodes[node_id]["embedding"] = entity_base_embedding
        print(f"---> SUCCESS: Entity {entity_name} ({entity_type}) is widely recognizable. Embedding assigned.")
    else:
        # Create default zero embedding for non-recognizable entities
        # (a plain list, like the vectors returned by get_ollama_embedding)
        default_embedding = [0.0] * EMBEDDING_DIM
        response_json["embedding"] = default_embedding
        # Assign the default embedding to the entity node
        G.nodes[node_id]["embedding"] = default_embedding
        print(f"---> DEFAULT EMBEDDING: Entity {entity_name} ({entity_type}) is not widely recognizable. Default embedding assigned.")
    response_tracking[node_id] = response_json


# Optional: save response_tracking to a pickle file
#with open('../data/response_tracking.pkl', 'wb') as f:
#    pickle.dump(response_tracking, f)
#print("Saved!")

# Debug: show the result of the last request
#print(response.status_code)
#print(response.json())
#print(response.json()['message']['content'])


Processing Amazon (1/743 - 0.1%)
{
  "entity_name": "Amazon",
  "entity_type": "ORG",
  "widely_recognizable": true,
  "short_context": "Amazon is a multinational technology company best known for e-commerce, cloud computing, and digital streaming."
}

---> SUCCESS: Entity Amazon (ORG) is widely recognizable. Embedding assigned.
Processing Apple (2/743 - 0.3%)
{
  "entity_name": "Apple",
  "entity_type": "ORG",
  "widely_recognizable": true,
  "short_context": "Apple is a multinational technology company known for designing, developing, and selling consumer electronics, computer software, and online services."
}

---> SUCCESS: Entity Apple (ORG) is widely recognizable. Embedding assigned.
Processing Bendigo (3/743 - 0.4%)
{
  "entity_name": "Bendigo",
  "entity_type": "ORG",
  "widely_recognizable": true,
  "short_context": "Bendigo is a city in Victoria, Australia, known for its gold rush history and art galleries."
}

---> SUCCESS: Entity Bendigo (ORG) is widely recognizable. Embeddi

In [32]:
# Create and assign article summary embeddings.
# Each article node receives a "summary_embedding" attribute; when the summary is
# missing or the embedding call fails, a zero vector is stored instead.

for node, data in G.nodes(data=True):
    # .get() so nodes without a "type" attribute are skipped rather than raising
    if data.get("type") != "article":
        continue
    try:
        summary_embedding = get_ollama_embedding(raw_entities[node]["summary"])
        data["summary_embedding"] = summary_embedding["embedding"]
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run.
        print(f"Error with {node}, assigning default embedding.")
        # Plain list, consistent with the vectors returned by get_ollama_embedding
        data["summary_embedding"] = [0.0] * 1024

Error with 250, assigning default embedding.


In [33]:
# Helper function to average a list of embeddings (assumed to be list of floats)
def average_embeddings(embeddings):
    """
    Compute the element-wise average of a collection of embedding vectors.

    Args:
        embeddings: Iterable of equally sized vectors (lists or numpy arrays).

    Returns:
        list: The average embedding as a list of floats.

    Raises:
        ValueError: If ``embeddings`` is empty (the mean would otherwise be
            NaN, accompanied only by a numpy RuntimeWarning).
    """
    # Materialise first so generators and other one-shot iterables are supported
    vectors = list(embeddings)
    if not vectors:
        raise ValueError("average_embeddings() requires at least one embedding vector.")
    return np.mean(np.array(vectors), axis=0).tolist()

In [35]:
# -------------------------------
# Step 1: Generate Article Embeddings
# For each article, average (a) the mean of the embeddings of all its chunks
# and (b) the embedding of its summary.
# -------------------------------
for node, data in G.nodes(data=True):
    if data.get("type") != "article":
        continue

    chunk_embeddings = []
    # Iterate over all successors of the article (neighbors in the graph)
    for neighbor in G.successors(node):
        # Get the edge data from article to neighbor
        edge_data = G.get_edge_data(node, neighbor)
        # Check if the relation is "CONTAINS" (i.e., the article contains the chunk)
        if edge_data and edge_data.get("relation") == "CONTAINS":
            neighbor_data = G.nodes[neighbor]
            # Ensure that the chunk has an embedding
            if "embedding" in neighbor_data:
                chunk_embeddings.append(neighbor_data["embedding"])

    # .get() so an article without a summary embedding does not abort the whole loop
    summary_embedding = data.get("summary_embedding")

    if chunk_embeddings and summary_embedding is not None:
        # Regular case: average of the summary embedding and the average chunk embedding
        data["embedding"] = average_embeddings([average_embeddings(chunk_embeddings), summary_embedding])
    elif chunk_embeddings:
        # No summary embedding available: fall back to the chunk average alone
        data["embedding"] = average_embeddings(chunk_embeddings)
    elif summary_embedding is not None:
        # No chunk embeddings available: fall back to the summary embedding alone
        data["embedding"] = np.asarray(summary_embedding, dtype=float).tolist()

# -------------------------------
# Step 2: Generate Embeddings for Authors, Categories, and Sources
# For each of these node types, average the embeddings of all connected article nodes.
# -------------------------------

def _average_linked_article_embeddings(graph, node_type, relation):
    """
    Assign to every node of ``node_type`` the average embedding of its targets.

    For each node whose "type" attribute equals ``node_type``, the embeddings of
    all targets reached through outgoing edges whose "relation" attribute equals
    ``relation`` are averaged with ``average_embeddings`` and stored in the
    node's "embedding" attribute. Targets without an embedding are ignored, and
    a node with no usable target is left untouched.
    """
    for node, data in graph.nodes(data=True):
        if data.get("type") != node_type:
            continue
        article_embeddings = [
            graph.nodes[target]["embedding"]
            for _, target, edge_data in graph.out_edges(node, data=True)
            if edge_data.get("relation") == relation and "embedding" in graph.nodes[target]
        ]
        if article_embeddings:
            data["embedding"] = average_embeddings(article_embeddings)


# Node type -> relation of the outgoing edge that links it to its articles:
#   author   --AUTHORED-->   article
#   category --COVERED_IN--> article
#   source   --PUBLISHES-->  article
AGGREGATED_NODE_RELATIONS = {
    "author": "AUTHORED",
    "category": "COVERED_IN",
    "source": "PUBLISHES",
}

for aggregated_type, aggregated_relation in AGGREGATED_NODE_RELATIONS.items():
    _average_linked_article_embeddings(G, aggregated_type, aggregated_relation)

In [36]:
# Ensure every node has an embedding assigned.
# If missing, report the node and assign a default zero embedding.

missing_nodes = []
default_embedding = None

# Determine the embedding dimension from an existing node with an embedding
for node, data in G.nodes(data=True):
    reference = data.get("embedding")
    # Tolerate a {"embedding": [...]} wrapper (the shape of a raw
    # get_ollama_embedding response) when measuring the dimension
    if isinstance(reference, dict):
        reference = reference.get("embedding")
    # len() works for lists as well as numpy arrays
    if reference is not None and len(reference) > 0:
        default_dim = len(reference)
        default_embedding = [0.0] * default_dim
        break

if default_embedding is None:
    print("Could not determine the embedding dimension. No node with an assigned embedding was found.")
else:
    # Iterate over all nodes to ensure each has an embedding
    for node, data in G.nodes(data=True):
        if "embedding" not in data:
            missing_nodes.append(node)
            # Always a plain list, whatever the node type: chunk embeddings are
            # stored as plain lists too, and downstream code stacks all
            # embeddings into a single array. Each node gets its own copy.
            data["embedding"] = list(default_embedding)

    print("Nodes missing embeddings:", missing_nodes)


Nodes missing embeddings: []


In [37]:
# ===============================
# Graph Summary Statistics
# ===============================

# Size of the graph
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()

# Tally of nodes per "type" attribute ("unknown" when the attribute is absent)
node_type_counts = {}
for _, attrs in G.nodes(data=True):
    kind = attrs.get("type", "unknown")
    node_type_counts.setdefault(kind, 0)
    node_type_counts[kind] += 1

# Density of the graph
density = nx.density(G)

# Mean degree per node; guarded against an empty graph
degrees = dict(G.degree())
if total_nodes > 0:
    avg_degree = sum(degrees.values()) / total_nodes
else:
    avg_degree = 0

# Connectivity: number of strongly / weakly connected components
num_strongly_connected = nx.number_strongly_connected_components(G)
num_weakly_connected = nx.number_weakly_connected_components(G)

# Assemble the report line by line and print it in one go
summary_lines = [
    "Network Summary:",
    f"Total nodes: {total_nodes}",
    "Nodes by type:",
]
summary_lines.extend(f"  {node_type}: {count}" for node_type, count in node_type_counts.items())
summary_lines.extend([
    f"Total edges: {total_edges}",
    f"Graph density: {density:.4f}",
    f"Average degree: {avg_degree:.2f}",
    f"Number of strongly connected components: {num_strongly_connected}",
    f"Number of weakly connected components: {num_weakly_connected}",
])
print("\n".join(summary_lines))

Network Summary:
Total nodes: 3367
Nodes by type:
  article: 609
  author: 300
  source: 49
  category: 6
  chunk: 1660
  entity_ORG: 314
  entity_LOC: 90
  entity_PER: 339
Total edges: 15831
Graph density: 0.0014
Average degree: 9.40
Number of strongly connected components: 1
Number of weakly connected components: 1


In [38]:
# Persist the graph together with the embeddings stored on its nodes
with open("../data/multihop_graph_w_sem_embeddings.pkl", "wb") as graph_out:
    pickle.dump(G, graph_out)
print("Graph saved!")

Graph saved!


## Visualizing the embedding space with TSNE

In [39]:
from sklearn.manifold import TSNE
import plotly.express as px

In [40]:
# Requested t-SNE perplexity. scikit-learn requires perplexity < n_samples, so
# the value actually used is capped below (unchanged for graphs with > 1000 nodes).
TSNE_PERPLEXITY = 1000

# -------------------------------
# Step 1: Extract embeddings and node info from the graph
# -------------------------------
embedding_list = []  # List to store embedding vectors
node_ids = []        # List to store node IDs
node_types = []      # List to store node types (e.g., article, chunk, etc.)
node_categories = []
node_authors = []
node_sources = []

# Iterate over all nodes in the graph G
for node, data in G.nodes(data=True):
    if "embedding" not in data:
        continue
    vector = data["embedding"]
    # Unwrap a {"embedding": [...]} wrapper so every row is a plain vector
    if isinstance(vector, dict):
        vector = vector.get("embedding")
    if vector is None:
        continue
    embedding_list.append(vector)
    node_ids.append(node)
    node_types.append(data.get("type", "unknown"))
    node_categories.append(data.get("category", "unknown"))
    node_authors.append(data.get("author", "unknown"))
    node_sources.append(data.get("source", "unknown"))


# Check how many embeddings were extracted
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Convert embeddings to a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Apply t-SNE to reduce the embeddings to 2D
# -------------------------------
# Cap the perplexity so small graphs do not make TSNE raise a ValueError
perplexity = min(TSNE_PERPLEXITY, max(len(embedding_list) - 1, 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Create a DataFrame with the 2D coordinates and node information
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
df_tsne['node_id'] = node_ids
df_tsne['node_type'] = node_types
df_tsne['category'] = node_categories
df_tsne['author'] = node_authors
df_tsne['source'] = node_sources

# -------------------------------
# Step 5: Create an interactive scatter plot using Plotly
# -------------------------------
# One fixed color per node type, including the three entity types
color_map = {
    "article": "blue",
    "author": "red",
    "source": "green",
    "category": "orange",
    "chunk": "purple",
    "entity_ORG": "brown",
    "entity_LOC": "teal",
    "entity_PER": "magenta"
}

fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='node_type',
    color_discrete_map=color_map,
    hover_data=['node_id', 'node_type', 'category', 'author', 'source'],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000
)
fig.show()


Extracted embeddings from 3367 nodes.
Embeddings array shape: (3367, 1024)
t-SNE transformation complete.
