In [1]:
from sklearn.manifold import TSNE
import plotly.express as px
import pickle
import numpy as np
import json
import pandas as pd
import plotly.graph_objects as go


In [2]:
# --- Graph -------------------------------------------------------------
# Pickled graph object; the cells below read its nodes (and their
# "embedding" / "SAGE_embedding" attributes) and its edges.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load files produced by this project, never untrusted pickles.
graph_path = "../data/MultiHop_graph_w_sage_embeddings.pkl"
with open(graph_path, "rb") as graph_file:
    G = pickle.load(graph_file)

# --- Corpus ------------------------------------------------------------
# Raw corpus as parsed JSON (read as UTF-8 text).
corpus_path = "../data/Multi-hop_RAG_dataset/corpus.json"
with open(corpus_path, encoding="utf-8") as corpus_file:
    corpus = json.load(corpus_file)

# Tabular view of the parsed corpus.
corpus_as_df = pd.DataFrame(corpus)

In [3]:
# Perplexity handed to every TSNE(...) call in the cells below.
# NOTE(review): 1000 is far above the 5-50 range scikit-learn suggests, and
# scikit-learn requires perplexity < number of samples — confirm this value
# is intentional for the node count being embedded.
perplexity_val = 1000

In [4]:
# 1 — 2D t-SNE view of the "embedding" attribute stored on graph nodes

# -------------------------------
# Step 1: Collect the nodes that carry an "embedding" attribute
# -------------------------------
# Keep (node, attributes) pairs so every per-node list below is built from
# the same filtered sequence and therefore stays aligned row-for-row.
embedded_nodes = [
    (node_key, attrs)
    for node_key, attrs in G.nodes(data=True)
    if "embedding" in attrs
]

embedding_list = [attrs["embedding"] for _, attrs in embedded_nodes]
node_ids = [node_key for node_key, _ in embedded_nodes]
# Missing metadata falls back to the literal "unknown".
node_types = [attrs.get("type", "unknown") for _, attrs in embedded_nodes]
node_categories = [attrs.get("category", "unknown") for _, attrs in embedded_nodes]
node_authors = [attrs.get("author", "unknown") for _, attrs in embedded_nodes]
node_sources = [attrs.get("source", "unknown") for _, attrs in embedded_nodes]

# Report how many embeddings were collected
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Stack the embeddings into a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Project to 2D with t-SNE (fixed seed for repeatable layouts)
# -------------------------------
tsne = TSNE(
    n_components=2,
    perplexity=perplexity_val,
    random_state=42,
)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Combine the 2D coordinates with the node metadata
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y']).assign(
    node_id=node_ids,
    node_type=node_types,
    category=node_categories,
    author=node_authors,
    source=node_sources,
)

# -------------------------------
# Step 5: Interactive Plotly scatter, coloured by node type
# -------------------------------
color_map = dict(
    article="blue",
    author="red",
    source="green",
    category="orange",
    chunk="purple",
)

fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='node_type',
    color_discrete_map=color_map,
    hover_data=['node_id', 'node_type', 'category', 'author', 'source'],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000,
)
fig.show()


Extracted embeddings from 2622 nodes.
Embeddings array shape: (2622, 1024)
t-SNE transformation complete.


In [5]:
# 2 — 2D t-SNE view of the "SAGE_embedding" attribute stored on graph nodes

# -------------------------------
# Step 1: Collect the nodes that carry a "SAGE_embedding" attribute
# -------------------------------
# Keep (node, attributes) pairs so every per-node list below is built from
# the same filtered sequence and therefore stays aligned row-for-row.
sage_nodes = [
    (node_key, attrs)
    for node_key, attrs in G.nodes(data=True)
    if "SAGE_embedding" in attrs
]

embedding_list = [attrs["SAGE_embedding"] for _, attrs in sage_nodes]
node_ids = [node_key for node_key, _ in sage_nodes]
# Missing metadata falls back to the literal "unknown".
node_types = [attrs.get("type", "unknown") for _, attrs in sage_nodes]
node_categories = [attrs.get("category", "unknown") for _, attrs in sage_nodes]
node_authors = [attrs.get("author", "unknown") for _, attrs in sage_nodes]
node_sources = [attrs.get("source", "unknown") for _, attrs in sage_nodes]

# Report how many embeddings were collected
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Stack the embeddings into a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Project to 2D with t-SNE (fixed seed for repeatable layouts)
# -------------------------------
tsne = TSNE(
    n_components=2,
    perplexity=perplexity_val,
    random_state=42,
)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Combine the 2D coordinates with the node metadata
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y']).assign(
    node_id=node_ids,
    node_type=node_types,
    category=node_categories,
    author=node_authors,
    source=node_sources,
)

# -------------------------------
# Step 5: Interactive Plotly scatter, coloured by node type
# -------------------------------
color_map = dict(
    article="blue",
    author="red",
    source="green",
    category="orange",
    chunk="purple",
)

fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='node_type',
    color_discrete_map=color_map,
    hover_data=['node_id', 'node_type', 'category', 'author', 'source'],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000,
)
fig.show()

Extracted embeddings from 2622 nodes.
Embeddings array shape: (2622, 256)
t-SNE transformation complete.


In [6]:
# 3 — 2D t-SNE view of the "embedding" node attribute, with graph edges overlaid

# -------------------------------
# Step 1: Extract embeddings and node info from the graph
# -------------------------------
embedding_list = []   # embedding vectors, one per kept node
node_ids = []         # node identifiers, aligned with embedding_list
node_types = []       # "type" attribute ("unknown" when absent)
node_categories = []  # "category" attribute ("unknown" when absent)
node_authors = []     # "author" attribute ("unknown" when absent)
node_sources = []     # "source" attribute ("unknown" when absent)

# Only nodes that actually carry an "embedding" attribute are kept.
for node, data in G.nodes(data=True):
    if "embedding" in data:
        embedding_list.append(data["embedding"])
        node_ids.append(node)
        node_types.append(data.get("type", "unknown"))
        node_categories.append(data.get("category", "unknown"))
        node_authors.append(data.get("author", "unknown"))
        node_sources.append(data.get("source", "unknown"))

# Check how many embeddings were extracted
print(f"Extracted embeddings from {len(embedding_list)} nodes.")

# -------------------------------
# Step 2: Convert embeddings to a numpy array
# -------------------------------
embeddings_array = np.array(embedding_list)
print(f"Embeddings array shape: {embeddings_array.shape}")

# -------------------------------
# Step 3: Apply t-SNE to reduce the embeddings to 2D
# -------------------------------
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_val)
embeddings_2d = tsne.fit_transform(embeddings_array)
print("t-SNE transformation complete.")

# -------------------------------
# Step 4: Create a DataFrame with the 2D coordinates and node information
# -------------------------------
df_tsne = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
df_tsne['node_id'] = node_ids
df_tsne['node_type'] = node_types
df_tsne['category'] = node_categories
df_tsne['author'] = node_authors
df_tsne['source'] = node_sources

# -------------------------------
# Step 5: Create an interactive scatter plot using Plotly
# -------------------------------
color_map = {
    "article": "blue",
    "author": "red",
    "source": "green",
    "category": "orange",
    "chunk": "purple"
}

# Map each node id to its 2D position once, so each edge endpoint is an O(1)
# dictionary lookup instead of a full boolean scan of df_tsne (the previous
# code scanned the frame four times per edge).
node_xy = {
    node_id: (x, y)
    for node_id, x, y in zip(df_tsne['node_id'], df_tsne['x'], df_tsne['y'])
}

# Collect all edge segments into ONE pair of coordinate lists. A None entry
# breaks the line, so consecutive segments are not joined to each other; this
# lets a single trace draw every edge instead of one trace per edge.
edge_x = []
edge_y = []
for u, v in G.edges():
    # Skip edges whose endpoint was not plotted (node without an "embedding").
    if u not in node_xy or v not in node_xy:
        continue
    x0, y0 = node_xy[u]
    x1, y1 = node_xy[v]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Kept as a list so the add-to-figure loop below still works when there are
# no drawable edges (empty list -> nothing added).
edge_traces = []
if edge_x:
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        line=dict(color='gray', width=1),
        hoverinfo='none',
        name='edges'
    )
    edge_traces.append(edge_trace)

fig = px.scatter(
    df_tsne,
    x='x',
    y='y',
    color='node_type',
    color_discrete_map=color_map,
    hover_data=['node_id', 'node_type', 'category', 'author', 'source'],
    title="t-SNE Visualization of Node Embeddings",
    width=1000,
    height=1000
)

# Add the edge trace(s) to the figure
for trace in edge_traces:
    fig.add_trace(trace)

fig.show()

Extracted embeddings from 2622 nodes.
Embeddings array shape: (2622, 1024)
t-SNE transformation complete.
