# Exploring LLM-based Sentence Embeddings

## Setup Data & Scatter Plot Config

This setup is the same as in [2-Composing-Linking-Scatter-Plots.ipynb](2-Composing-Linking-Scatter-Plots.ipynb#Synchronizing-Views).

In [None]:
!curl -L -C - -o data/huffpost-embeddings.pq https://storage.googleapis.com/flekschas/jupyter-scatter-tutorial/huffpost-embeddings.pq

In [10]:
import pandas as pd
# Load the downloaded HuffPost embeddings parquet file into a DataFrame.
huffpost_embeddings = pd.read_parquet('data/huffpost-embeddings.pq')
# Preview the first three rows as the cell output.
huffpost_embeddings.head(3)

Unnamed: 0,link,headline,category,short_description,authors,date,year,month,season,sentiment,sentiment_score,x_all_MiniLM_L6_v2,y_all_MiniLM_L6_v2,x_all_mpnet_base_v2,y_all_mpnet_base_v2
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,2022,September,Fall,NEGATIVE,0.959589,3.873639,0.84671,4.985067,0.615112
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,2022,September,Fall,NEGATIVE,0.99498,3.672992,4.065663,3.480932,4.658141
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,2022,September,Fall,POSITIVE,0.998081,7.427926,5.442265,3.243571,5.680662


In [11]:
from jscatter import glasbey_light
# Map every distinct news category to its own color from `glasbey_light`,
# indexing the palette by the category's position in `unique()`.
category_cmap = {
    category_name: glasbey_light[color_idx]
    for color_idx, category_name in enumerate(huffpost_embeddings.category.unique())
}

# Keyword arguments shared by the scatter plots created in this notebook.
huffpost_scatter_config = {
    'x': 'x_all_MiniLM_L6_v2',
    'y': 'y_all_MiniLM_L6_v2',
    'axes': False,
    'background_color': '#111111',
}

In [12]:
from jscatter import Scatter
# Overview scatter over all articles: points are colored by news category using the
# fixed category color map and a legend is shown. The x/y columns, axes and
# background settings come from the shared scatter config.
overview_scatter = Scatter(
    data=huffpost_embeddings,
    color_by='category',
    color_map=category_cmap,
    height=720,
    legend=True,
    **huffpost_scatter_config
)

In [13]:
from ipywidgets import Dropdown

# Columns that can drive the color encoding, split by data type.
categorical_variables = ["category", "year", "month", "season", "sentiment"]
continuous_variables = ["sentiment_score"]

# One "<column>:<value>" label for every distinct value of each categorical column.
categories = [
    f"{column}:{value}"
    for column in categorical_variables
    for value in huffpost_embeddings[column].unique()
]

# Dropdowns controlling the color encoding, the active filter and the faceting.
# "-" stands for "no filter" / "no faceting".
select_color = Dropdown(
    description="Color by",
    options=categorical_variables + continuous_variables,
    value="category",
)
select_filter = Dropdown(
    description="Filter to",
    options=["-"] + categories,
    value="-",
)
select_facet = Dropdown(
    description="Facet by",
    options=["-"] + categorical_variables,
    value="-",
)

In [20]:
from ipywidgets import Box, Output

# Registry of the currently composed faceted scatters. Each entry is a
# (scatter, dataframe, index mapping, title) tuple.
faceted_scatters = {'scatters': []}


def get_all_scatter():
    """Return the overview scatter entry followed by all faceted scatter entries.

    Every entry is a ``(scatter, dataframe, index_mapping, title)`` tuple. The
    overview entry carries the full DataFrame, no index mapping (``None``) and
    the title ``"All"``.
    """
    overview_entry = (overview_scatter, huffpost_embeddings, None, "All")
    facet_entries = faceted_scatters['scatters']
    return [overview_entry] + facet_entries


# Output widget that shows the table of currently selected articles.
table = Output()


@table.capture(clear_output=True)
def on_selection_change(change):
    """Show the articles selected in whichever scatter emitted the change.

    This handler is attached to the overview scatter and to every faceted
    scatter. The indices in ``change.new`` are positional within the data of
    the emitting scatter, so they are resolved against that scatter's own
    DataFrame rather than always against the full DataFrame. If the emitting
    widget is not among the known scatters, the full DataFrame is used.

    Args:
        change: The change notification passed by ``observe``; ``change.new``
            holds the selected point indices and ``change.owner`` the widget
            that emitted the change.
    """
    source_widget = getattr(change, "owner", None)

    # Default to the full data (correct for the overview scatter).
    source_df = huffpost_embeddings
    for scatter, scatter_df, _, _ in get_all_scatter():
        if scatter.widget is source_widget:
            source_df = scatter_df
            break

    display(source_df.iloc[change.new][["category", "headline", "short_description", "date"]])


overview_scatter.widget.observe(on_selection_change, names=["selection"])


def on_color_change(change):
    """Recolor every scatter (overview and facets) by the newly chosen variable.

    The color map depends on the variable: the fixed category color map for
    ``"category"``, ``"auto"`` for the other categorical variables, and
    ``"coolwarm"`` for everything else.

    Args:
        change: The change notification; ``change.new`` is the variable name.
    """
    color_by = change.new

    if color_by == "category":
        color_map = category_cmap
    elif color_by in categorical_variables:
        color_map = "auto"
    else:
        color_map = "coolwarm"

    for plot, _df, _df_index, _title in get_all_scatter():
        plot.color(by=color_by, map=color_map)

# Recolor all scatters whenever the "Color by" dropdown value changes.
select_color.observe(on_color_change, names=["value"])


def on_filter_change(change):
    """Filter every scatter (overview and facets) to the chosen column value.

    ``change.new`` is either ``"-"`` (remove the filter) or a
    ``"<column>:<value>"`` label. The labels are built from the string form of
    each value, so rows are matched on the string form of the column. This
    keeps non-string columns such as ``year`` filterable and avoids building a
    query string from the value.

    Args:
        change: The change notification; ``change.new`` is the selected label.
    """
    if change.new == "-":
        cat, val = "", ""
    else:
        # Split on the first colon only so values containing ":" stay intact.
        cat, val = change.new.split(":", 1)

    for scatter, df, df_index, _ in get_all_scatter():
        if not cat:
            scatter.filter(None)
            continue

        idxs = df.loc[df[cat].astype(str) == val].index
        if df_index is not None:
            # Translate original row labels into positions within the facet's data.
            idxs = df_index.loc[idxs].i.values
        scatter.filter(idxs)

# Re-apply the filter to all scatters whenever the "Filter to" dropdown value changes.
select_filter.observe(on_filter_change, names=["value"])


def create_faceted_scatter():
    """Create one scatter per distinct value of the selected facet variable.

    Each scatter shows only the rows whose facet column equals that value and
    is colored by the variable currently chosen in the "Color by" dropdown.
    The color map is chosen the same way as in ``on_color_change``, so new
    facets are not given the category color map for an unrelated variable.

    Returns:
        A list of ``(scatter, df, df_index, value)`` tuples, sorted by facet
        value. ``df`` is the facet's subset of the data and ``df_index`` maps
        original row labels to positions within ``df`` (column ``i``).
    """
    import pandas as pd

    facet_by = select_facet.value
    color_by = select_color.value

    # Same color map rules as the "Color by" dropdown handler.
    if color_by == "category":
        color_map = category_cmap
    elif color_by in categorical_variables:
        color_map = "auto"
    else:
        color_map = "coolwarm"

    # Local list; named differently from the module-level `faceted_scatters` dict.
    facets = []

    for value in sorted(huffpost_embeddings[facet_by].unique()):
        df = huffpost_embeddings[huffpost_embeddings[facet_by] == value]

        # For mapping between the original index and the view index
        df_index = pd.DataFrame(range(len(df.index)), columns=['i'], index=df.index)

        scatter = Scatter(
            data=df,
            color_by=color_by,
            color_map=color_map,
            **huffpost_scatter_config
        )
        scatter.widget.observe(on_selection_change, names=["selection"])
        facets.append((scatter, df, df_index, value))

    return facets


def compose_faceted_scatters():
    """Build the faceted scatters, register them, and compose them into a grid.

    The grid uses ``ceil(sqrt(n))`` columns and as many rows as needed (at
    least one of each). A total height of 720 is divided evenly across the
    rows, and the composition is created with ``sync_view=True``.

    Returns:
        The result of ``jscatter.compose`` for the titled faceted scatters.
    """
    from jscatter import compose
    from math import ceil, sqrt

    facets = create_faceted_scatter()
    # Register the new facets so the color and filter handlers can reach them.
    faceted_scatters['scatters'] = facets

    n_facets = len(facets)
    cols = max(1, ceil(sqrt(n_facets)))
    rows = max(1, ceil(n_facets / cols))

    titled_scatters = [(plot, title) for plot, _df, _df_index, title in facets]
    return compose(
        titled_scatters,
        sync_view=True,
        cols=cols,
        rows=rows,
        row_height=720 // rows,
    )

def get_scatters():
    """Return the view to display for the current "Facet by" selection.

    With ``"-"`` selected (no faceting) this is the overview scatter's view.
    Otherwise it is a freshly composed grid of faceted scatters.
    """
    faceting_enabled = select_facet.value != "-"
    return compose_faceted_scatters() if faceting_enabled else overview_scatter.show()

# Output widget that holds the currently displayed scatter view.
scatters = Output()


@scatters.capture(clear_output=True)
def on_facet_change(change):
    """Replace the displayed scatter view after the "Facet by" dropdown changes.

    The capture decorator clears the `scatters` output before the new view is
    displayed into it. The ``change`` argument itself is not inspected.
    """
    current_view = get_scatters()
    display(current_view)


select_facet.observe(on_facet_change, names=["value"])

In [22]:
from ipywidgets import AppLayout, HBox, HTML, TwoByTwoLayout, VBox

# Initialize: render the current view into the `scatters` output. The output is
# cleared first so that re-running this cell replaces the previous view instead
# of appending a second copy below it.
scatters.clear_output()
with scatters:
    display(get_scatters())

# Layout: the dropdown controls and the table caption on top, the scatter view
# and the table of selected articles below.
VBox([
    AppLayout(center=HBox([select_color, select_facet, select_filter]), right_sidebar=HTML(value="Selected news articles:")),
    AppLayout(center=scatters, right_sidebar=table)
])

VBox(children=(AppLayout(children=(HTML(value='Selected news articles:', layout=Layout(grid_area='right-sideba…

---

## Next

Next up, we'll show you how to take the integration even further by using `jscatter` with a custom widget for the Fashion MNIST example we saw previously.

➡️ [Building a Bespoke Interface for Exploring Fashion MNIST](3-Fashion-MNIST.ipynb)