# **Visualization of the papers in the embedding space**

The idea here is to reduce the dimensionality of the paper embeddings (PCA followed by UMAP) down to 2D and 3D so that we can visualize them.

In [1]:
import numpy as np
import pandas as pd
import umap
import plotly.express as px
import ast
from sklearn.decomposition import PCA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the papers table; the path is relative to the notebook's working directory.
papers_csv = "../data/papers_with_embeddings.csv"
df = pd.read_csv(papers_csv)

In [3]:
# Turn every "embedding" cell into a Python object by parsing its text form.
# Casting to str first means already-parsed values are re-serialized and parsed
# again, so re-running this cell should not fail on non-string input.
df["embedding"] = (
    df["embedding"]
    .astype(str)
    .apply(ast.literal_eval)
)

In [4]:
# Stack the per-paper embeddings into a single 2D float32 matrix (one row per paper).
X = np.vstack(df["embedding"]).astype(np.float32)

# L2 normalize (good for cosine geometry).
# A row whose norm is 0 would be divided by zero and turn into NaN, and a single
# NaN row is enough for the PCA step below to reject the whole matrix. Use a
# divisor of 1 for such rows so they simply stay all-zero.
row_norms = np.linalg.norm(X, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
X = X / row_norms

# PCA first (speeds up UMAP + reduces noise)
X50 = PCA(n_components=50, random_state=0).fit_transform(X)

In [5]:
# Project the 50-component PCA output (X50) down to two UMAP dimensions.
umap_2d = umap.UMAP(
    metric="cosine",
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=0,  # fixed seed so the layout is repeatable between runs
)
X2 = umap_2d.fit_transform(X50)

# Store each output dimension as its own column of df for plotting.
for axis_idx, column in enumerate(("umap_x", "umap_y")):
    df[column] = X2[:, axis_idx]

  warn(


In [20]:
# Interactive 2D scatter of the UMAP coordinates, coloured by the journal-name column.
journal_col = "primary_location.source.display_name"

fig = px.scatter(
    df,
    x="umap_x",
    y="umap_y",
    color=journal_col,
    # Hover shows the paper title only; coordinates and the colour column are hidden.
    hover_data={
        "title": True,
        "umap_x": False,
        "umap_y": False,
        journal_col: False,
    },
    opacity=0.8,
    template="plotly_white",
)

# Markers with a thin, translucent dark outline.
marker_outline = {"width": 0.5, "color": "rgba(0,0,0,0.3)"}
fig.update_traces(marker={"size": 7, "line": marker_outline})

# Hide both axes completely (no grid, no zero line, no ticks or labels).
hidden_axis = {"showgrid": False, "zeroline": False, "visible": False}
fig.update_xaxes(**hidden_axis)
fig.update_yaxes(**hidden_axis)

# Title, legend, margins and figure size in a single layout update.
fig.update_layout(
    title={
        "text": "Abstract embeddings in a 2D space",
        "x": 0.5,
        "xanchor": "center",
    },
    legend_title_text="Journal",
    legend={
        "itemsizing": "constant",
        "bgcolor": "rgba(255,255,255,0.8)",
    },
    margin={"l": 20, "r": 20, "t": 60, "b": 20},
    width=800,
    height=450,
)

fig.show()

In [8]:
# Project the 50-component PCA output (X50) down to three UMAP dimensions.
umap_3d = umap.UMAP(
    metric="cosine",
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    random_state=0,  # fixed seed so the layout is repeatable between runs
)
X3 = umap_3d.fit_transform(X50)

# Store each output dimension as its own column of df for plotting.
for axis_idx, column in enumerate(("umap_x3", "umap_y3", "umap_z3")):
    df[column] = X3[:, axis_idx]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [21]:
# Interactive 3D scatter of the UMAP coordinates, coloured by the journal-name column.
journal_col = "primary_location.source.display_name"
coord_cols = ("umap_x3", "umap_y3", "umap_z3")

# Hover shows the paper title only; coordinates and the colour column are hidden.
hover_fields = {"title": True}
hover_fields.update({col: False for col in coord_cols})
hover_fields[journal_col] = False

fig3d = px.scatter_3d(
    df,
    x=coord_cols[0],
    y=coord_cols[1],
    z=coord_cols[2],
    color=journal_col,
    hover_data=hover_fields,
    opacity=0.85,
    template="plotly_white",
)

# Markers with a thin, translucent dark outline.
marker_outline = {"width": 0.3, "color": "rgba(0,0,0,0.3)"}
fig3d.update_traces(marker={"size": 4.5, "line": marker_outline})

# Every scene axis gets the same "fully hidden" settings; a fresh dict is built
# per axis so the three entries never share one object.
hidden_scene = {
    f"{axis}axis": {
        "showbackground": False,
        "showgrid": False,
        "zeroline": False,
        "visible": False,
    }
    for axis in "xyz"
}

# Title, legend, margins, scene and figure size in a single layout update.
fig3d.update_layout(
    title={
        "text": "Abstract embeddings in a 3D space",
        "x": 0.5,
        "xanchor": "center",
    },
    legend_title_text="Journal",
    legend={
        "bgcolor": "rgba(255,255,255,0.8)",
        "itemsizing": "constant",
    },
    margin={"l": 20, "r": 20, "t": 60, "b": 20},
    scene=hidden_scene,
    width=800,
    height=450,
)

fig3d.show()