Reference (background on open-source embedding models): https://www.bentoml.com/blog/a-guide-to-open-source-embedding-models

In [2]:
import pickle
import requests
import networkx as nx
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd
import json

In [34]:
# Restore the pickled graph object from disk.
# NOTE: pickle can execute arbitrary code on load; only open files produced by this project.
with open("../data/graph.pkl", "rb") as graph_file:
    G = pickle.load(graph_file)

# Read the corpus JSON (decoded as UTF-8).
with open("../data/Multi-hop_RAG_dataset/corpus.json", "r", encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

# Tabular view of the parsed corpus
corpus_as_df = pd.DataFrame(corpus)

In [3]:
import requests

# Base URL of the Ollama HTTP API that get_embedding() posts to.
# Alternative kept for reference, presumably for running this notebook inside a Docker
# container while Ollama runs on the host (TODO confirm it resolves on your platform):
# OLLAMA_BASE_URL = "http://host.docker.internal:11434"
OLLAMA_BASE_URL = "http://localhost:11434"

def get_embedding(text, timeout=120):
    """Request an embedding for ``text`` from the Ollama ``/api/embeddings`` endpoint.

    Parameters
    ----------
    text : str
        The text to embed; sent as the ``prompt`` field of the request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.post``). Without a timeout a stalled server would block the
        notebook indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON response on HTTP 200 (a dict holding the vector under
        the ``"embedding"`` key, as shown by the sample output), or ``None``
        after printing the status and body for any other status code.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged on connection failures or when ``timeout`` expires.
    """
    url = f"{OLLAMA_BASE_URL}/api/embeddings"
    payload = {
        "model": "bge-m3",  # Make sure you have this model downloaded in Ollama
        "prompt": text,
    }
    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    # Callers expect the whole response dict, not just the vector.
    return response.json()

# Smoke test: embed a placeholder string and print whatever get_embedding returns.
# NOTE: this prints the full response dict, so the output contains the entire vector;
# the value is None when the server answered with a non-200 status.
embedding = get_embedding("***")
print(embedding)


{'embedding': [-1.6167062520980835, -0.36916884779930115, -1.324873685836792, 0.10496003925800323, -0.5085816383361816, -0.6559687852859497, 0.8341795802116394, 1.473646879196167, -0.29673513770103455, 0.41089606285095215, 0.7573739886283875, 1.1329209804534912, -0.13601182401180267, -0.013740446418523788, 1.3187133073806763, -1.6694451570510864, 1.040123462677002, -0.08517423272132874, 0.8772013187408447, -0.7995467185974121, -0.8771259188652039, -0.23145601153373718, -0.06231669336557388, 1.1984612941741943, 0.3176096975803375, 0.15809568762779236, 0.04244368150830269, -0.7236093878746033, -1.6375433206558228, -0.4657614231109619, 0.27378857135772705, -0.9877020120620728, -0.033766284584999084, -0.9883822798728943, -1.0301696062088013, -1.1154265403747559, -0.6944770812988281, 1.0368667840957642, -2.1396827697753906, 0.17176881432533264, -0.07348373532295227, -0.3024083077907562, 0.6346084475517273, -0.5592426061630249, 0.5155187249183655, -1.3062057495117188, -0.059213846921920776, 

In [4]:
# Embed the text of every "chunk" node and store the response dict on the node.
for node, data in G.nodes(data=True):
    if data.get("type") != "chunk" or "text" not in data:
        continue

    result = get_embedding(data["text"])

    # get_embedding returns None on an HTTP error. Storing None would break the
    # later steps that call .get("embedding") on this attribute or stack the
    # vectors into an array, so leave the node without an embedding instead.
    if result is None:
        print(f"No embedding returned for node {node!r}; skipping.")
        continue

    data["embedding"] = result

In [6]:
# Persist the current graph object so it can be reloaded without recomputing.
with open("../data/graph_w_embeddings.pkl", "wb") as out_file:
    pickle.dump(G, out_file)

In [2]:
# Reload the previously saved graph from its pickle file.
# NOTE: pickle can execute arbitrary code on load; only open files produced by this project.
with open("../data/graph_w_embeddings.pkl", "rb") as in_file:
    G = pickle.load(in_file)

In [4]:
# Assuming G is the NetworkX graph already created and that each "chunk" node has an "embedding" attribute.

# Helper function to average a list of embeddings (assumed to be list of floats)
def average_embeddings(embeddings):
    """
    Element-wise mean of a collection of embedding vectors.

    Parameters:
        embeddings: sequence of equally sized numeric vectors.

    Returns:
        The mean vector converted to a plain Python list.
    """
    stacked = np.array(embeddings)
    # Averaging along axis 0 collapses the "which vector" dimension.
    mean_vector = stacked.mean(axis=0)
    return mean_vector.tolist()

# -------------------------------
# Step 1: Generate Article Embeddings
# For each article, average the embeddings of all its associated chunks.
# -------------------------------
# Give every "article" node the mean of the embeddings of the chunks it CONTAINS.
for node, data in G.nodes(data=True):
    if data.get("type") != "article":
        continue

    chunk_embeddings = []
    for neighbor in G.successors(node):
        # Only follow article -> chunk edges labelled "CONTAINS".
        edge_data = G.get_edge_data(node, neighbor)
        if not edge_data or edge_data.get("relation") != "CONTAINS":
            continue

        stored = G.nodes[neighbor].get("embedding")
        # Chunk embeddings are stored as the raw response dict ({"embedding": [...]});
        # accept an already-unwrapped vector too, as the t-SNE extraction cell does.
        vector = stored.get("embedding") if isinstance(stored, dict) else stored
        # Skip chunks whose embedding is missing or None so the mean stays numeric.
        if vector is not None:
            chunk_embeddings.append(vector)

    # Articles with no usable chunk embeddings are left without an "embedding".
    if chunk_embeddings:
        data["embedding"] = average_embeddings(chunk_embeddings)

# -------------------------------
# Step 2: Generate Embeddings for Authors, Categories, and Sources
# For each of these node types, average the embeddings of all connected article nodes.
# -------------------------------

# Each of these node types is embedded the same way: average the embeddings of the
# nodes reached through one specific outgoing relation.
#   author   --AUTHORED-->   article
#   category --COVERED_IN--> article
#   source   --PUBLISHES-->  article
# The pairs are processed in this order (all authors, then categories, then sources).
RELATION_BY_NODE_TYPE = (
    ("author", "AUTHORED"),
    ("category", "COVERED_IN"),
    ("source", "PUBLISHES"),
)

for owner_type, relation in RELATION_BY_NODE_TYPE:
    for node, data in G.nodes(data=True):
        if data.get("type") != owner_type:
            continue

        # Collect the embeddings of every target reached via the matching relation;
        # targets that have no "embedding" attribute are ignored.
        article_embeddings = [
            G.nodes[target]["embedding"]
            for _, target, edge_data in G.out_edges(node, data=True)
            if edge_data.get("relation") == relation and "embedding" in G.nodes[target]
        ]

        # Nodes without any embedded target are left without an "embedding".
        if article_embeddings:
            data["embedding"] = average_embeddings(article_embeddings)

# -------------------------------
# Step 3: Generate Network Summary Statistics
# -------------------------------

# Overall size of the graph
total_nodes = G.number_of_nodes()
total_edges = G.number_of_edges()

# Tally nodes per "type" attribute (nodes lacking one are counted as "unknown")
node_type_counts = {}
for _, attrs in G.nodes(data=True):
    kind = attrs.get("type", "unknown")
    if kind in node_type_counts:
        node_type_counts[kind] += 1
    else:
        node_type_counts[kind] = 1

# Density as reported by NetworkX
density = nx.density(G)

# Mean degree over all nodes; guarded so an empty graph yields 0 instead of dividing by zero
degrees = dict(G.degree())
if total_nodes > 0:
    avg_degree = sum(degrees.values()) / total_nodes
else:
    avg_degree = 0

# Connected-component counts (strong and weak)
num_strongly_connected = nx.number_strongly_connected_components(G)
num_weakly_connected = nx.number_weakly_connected_components(G)

# Report
print("Network Summary:")
print(f"Total nodes: {total_nodes}")
print("Nodes by type:")
for node_type, count in node_type_counts.items():
    print(f"  {node_type}: {count}")
print(f"Total edges: {total_edges}")
print(f"Graph density: {density:.4f}")
print(f"Average degree: {avg_degree:.2f}")
print(f"Number of strongly connected components: {num_strongly_connected}")
print(f"Number of weakly connected components: {num_weakly_connected}")


Network Summary:
Total nodes: 2622
Nodes by type:
  article: 609
  author: 300
  source: 47
  category: 6
  chunk: 1660
Total edges: 9052
Graph density: 0.0013
Average degree: 6.90
Number of strongly connected components: 1
Number of weakly connected components: 1


In [8]:
# -------------------------------
# Step 1: Extract embeddings and node info from the graph
# -------------------------------
embedding_list = []  # Embedding vectors, one per embedded node
node_ids = []        # Node IDs, aligned with embedding_list
node_types = []      # Node types (e.g., article, chunk, etc.), aligned with embedding_list

# Walk every node of G and keep those that carry an "embedding" attribute.
for node, data in G.nodes(data=True):
    if "embedding" not in data:
        continue

    stored = data["embedding"]
    # Some nodes hold the raw response dict ({"embedding": [...]}), others a plain vector.
    vector = stored.get("embedding") if isinstance(stored, dict) else stored
    # A None vector would turn the stacked array into dtype=object and break t-SNE.
    if vector is None:
        continue

    embedding_list.append(vector)
    node_ids.append(node)
    node_types.append(data.get("type", "unknown"))


# Check how many embeddings were extracted
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Convert embeddings to a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Apply t-SNE to reduce the embeddings to 2D
# -------------------------------
# scikit-learn requires perplexity < n_samples, so clamp the chosen value (1000)
# to keep the cell working on graphs with 1000 or fewer embedded nodes.
# NOTE(review): 1000 is far above the 5-50 range the scikit-learn docs suggest — confirm it is intended.
tsne_perplexity = min(1000, embeddings_array.shape[0] - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Create a DataFrame with the 2D coordinates and node information
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
df_tsne['node_id'] = node_ids
df_tsne['node_type'] = node_types

# -------------------------------
# Step 5: Interactive Plotly scatter of the 2D projection
# -------------------------------
# Fixed colour per node type
color_map = dict(
    article="blue",
    author="red",
    source="green",
    category="orange",
    chunk="purple",
)

fig = px.scatter(
    df_tsne,
    x="x",
    y="y",
    color="node_type",
    color_discrete_map=color_map,
    hover_data=["node_id", "node_type"],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000,
)
fig.show()


Extracted embeddings from 2620 nodes.
Embeddings array shape: (2620, 1024)
t-SNE transformation complete.


In [3]:
with open("../data/graph_w_sage_embeddings.pkl", "rb") as f:
    G_wsage = pickle.load(f)

In [4]:
# -------------------------------
# Step 1: Extract SAGE embeddings and node info from the graph
# -------------------------------
embedding_list = []  # Embedding vectors, one per embedded node
node_ids = []        # Node IDs, aligned with embedding_list
node_types = []      # Node types (e.g., article, chunk, etc.), aligned with embedding_list

# Walk every node of G_wsage and keep those that carry a "SAGE_embedding" attribute.
for node, data in G_wsage.nodes(data=True):
    if "SAGE_embedding" not in data:
        continue

    stored = data["SAGE_embedding"]
    # Accept either a dict wrapping the vector under "SAGE_embedding" or the vector itself.
    vector = stored.get("SAGE_embedding") if isinstance(stored, dict) else stored
    # A None vector would turn the stacked array into dtype=object and break t-SNE.
    if vector is None:
        continue

    embedding_list.append(vector)
    node_ids.append(node)
    node_types.append(data.get("type", "unknown"))


# Check how many embeddings were extracted
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Convert embeddings to a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Apply t-SNE to reduce the embeddings to 2D
# -------------------------------
# scikit-learn requires perplexity < n_samples, so clamp the chosen value (1000)
# to keep the cell working on graphs with 1000 or fewer embedded nodes.
# NOTE(review): 1000 is far above the 5-50 range the scikit-learn docs suggest — confirm it is intended.
tsne_perplexity = min(1000, embeddings_array.shape[0] - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Create a DataFrame with the 2D coordinates and node information
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
df_tsne['node_id'] = node_ids
df_tsne['node_type'] = node_types

# -------------------------------
# Step 5: Interactive Plotly scatter of the 2D projection
# -------------------------------
# Fixed colour per node type
color_map = dict(
    article="blue",
    author="red",
    source="green",
    category="orange",
    chunk="purple",
)

fig = px.scatter(
    df_tsne,
    x="x",
    y="y",
    color="node_type",
    color_discrete_map=color_map,
    hover_data=["node_id", "node_type"],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000,
)
fig.show()

Extracted embeddings from 2620 nodes.
Embeddings array shape: (2620, 256)
t-SNE transformation complete.
