In [None]:
# You can run:
# `python -m pip install -r requirements.txt`
# if you need to install the dependencies. These are pretty standard packages
# but versions aren't pinned so you should run in a virtual environment to
# avoid messing up other installations.

import json

import numpy as np
import pandas as pd
import altair as alt

import dotenv
from openai import OpenAI

# To configure the OpenAI client, you'll need to provide your own OpenAI API
# key. You can do so by adding the line:
# `OPENAI_API_KEY = <put your API key here>`
# to a file called `.env`.

# Load variables from the local `.env` file. `override=True` asks dotenv to
# let values from the file replace ones already present in the environment.
dotenv.load_dotenv(".env", override=True)
# Shared module-level client used by `embed` below. No key is passed here, so
# it is presumably picked up from the `OPENAI_API_KEY` variable loaded above.
client = OpenAI()


def embed(text: str, model: str = "text-embedding-3-small") -> list[float]:
    """
    Return the embedding vector produced by an OpenAI embedding model.

    A single request is sent through the module-level `client`, and the
    embedding attached to the first item of the response is returned.

    See https://platform.openai.com/docs/guides/embeddings/embedding-models
    for the available models and how to use them.
    """
    return client.embeddings.create(input=text, model=model).data[0].embedding



: 

In [None]:
# Load the data from the local JSON file and convert it into a dataframe. For
# easy plotting we'll convert the position on the edge of the circumplex to (x,
# y) coordinates.

from math import sin, cos, radians

# JSON is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's locale default.
with open("data/circumplex.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# `data["data"]["figure"]` is a mapping whose keys become the `affect` column
# and whose values become the `degrees` column.
df = pd.DataFrame(list(data["data"]["figure"].items()), columns=["affect", "degrees"])

# Convert the angular position to a point on the unit circle:
# valence = cos(angle), arousal = sin(angle).
df["radians"] = df["degrees"].map(radians)
df["valence"] = df["radians"].map(cos)
df["arousal"] = df["radians"].map(sin)

: 

In [None]:
# Altair chart definitions to mirror Russell's original circumplex diagrams.

# Dashed vertical rule at x = 0, running from y = -1 to y = 1.
valence_line = (
    alt.Chart()
    .mark_rule(strokeDash=(8, 4))
    .encode(x=alt.datum(0), y=alt.datum(-1), y2=alt.datum(1))
)
# Dashed horizontal rule at y = 0, running from x = -1 to x = 1.
arousal_line = (
    alt.Chart()
    .mark_rule(strokeDash=(8, 4))
    .encode(y=alt.datum(0), x=alt.datum(-1), x2=alt.datum(1))
)

# Text labels placed at the four ends of the axes. The two horizontal-axis
# labels are rotated (90 / 270 degrees); every label is nudged by 10px via
# xOffset / yOffset.
positive_label = (
    alt.Chart()
    .mark_text(baseline="bottom", angle=90, xOffset=10)
    .encode(x=alt.datum(1), y=alt.datum(0), text=alt.datum("Positive"))
)
negative_label = (
    alt.Chart()
    .mark_text(baseline="bottom", angle=270, xOffset=-10)
    .encode(x=alt.datum(-1), y=alt.datum(0), text=alt.datum("Negative"))
)
active_label = (
    alt.Chart()
    .mark_text(baseline="bottom", yOffset=-10)
    .encode(x=alt.datum(0), y=alt.datum(1), text=alt.datum("Active"))
)
passive_label = (
    alt.Chart()
    .mark_text(baseline="top", yOffset=10)
    .encode(x=alt.datum(0), y=alt.datum(-1), text=alt.datum("Passive"))
)

# Combination charts defining the circumplex. For convenience, use the
# `add_circumplex` function to layer the circumplex diagram on top of a
# provided chart.

axis_lines = valence_line + arousal_line
axis_labels = positive_label + negative_label + active_label + passive_label


def add_circumplex(chart: alt.Chart) -> alt.Chart:
    """
    Overlay the circumplex axes and pole labels on the provided chart.

    The dashed axis rules and the four pole labels are layered after `chart`;
    the view border and the axis grid are switched off and the combined chart
    is made interactive before being returned.
    """
    layered = chart + axis_lines + axis_labels
    styled = layered.configure_view(stroke=None).configure_axis(grid=False)
    return styled.interactive()

def render_and_save(chart: alt.Chart, filename: str) -> alt.Chart:
    """
    Write the chart to `filename` and return the chart unchanged.

    Returning the chart lets the call be the last expression of a notebook
    cell, so the chart is displayed as well as saved.

    Args:
        chart: Object exposing a `save(filename)` method.
        filename: Destination path. Missing parent directories are created
            first, so a fresh checkout without the output folder (for example
            `figures/`) does not fail on save.

    Returns:
        The same `chart` object that was passed in.
    """
    import os
    from pathlib import Path

    # Only path-like targets have a parent directory to create; anything else
    # is handed to `chart.save` untouched.
    if isinstance(filename, (str, os.PathLike)):
        Path(filename).parent.mkdir(parents=True, exist_ok=True)

    chart.save(filename)
    return chart

: 

In [None]:
# Render the original circumplex. These are just the (x, y) coordinates we
# stored in the dataframe.

# Scatter plot of each affect at its (valence, arousal) position. Axes are
# hidden and both scales are fixed to [-1.2, 1.2], which leaves a margin
# around the [-1, 1] range covered by the circumplex rules and labels.
russells_affects = (
    alt.Chart(
        df[["affect", "valence", "arousal", "degrees"]], title="Russell's Circumplex"
    )
    .mark_point()
    .encode(
        alt.X("valence").axis(None).scale(domain=[-1.2, 1.2]),
        alt.Y("arousal").axis(None).scale(domain=[-1.2, 1.2]),
        # Hovering a point shows its name, angle and coordinates, with the
        # numeric fields formatted using the "0.2f" format string.
        tooltip=[
            alt.Tooltip("affect"),
            alt.Tooltip("degrees", format="0.2f"),
            alt.Tooltip("valence", format="0.2f"),
            alt.Tooltip("arousal", format="0.2f"),
        ],
    )
)

# Layer the circumplex on top, write the chart to disk, and (as the last
# expression of the cell) display the returned chart.
render_and_save(add_circumplex(russells_affects), "figures/russells.json")

: 

In [None]:
# Generate embeddings for all the affects. This might take a few seconds!

# `embed` is applied to each affect individually (one client request per row);
# each cell of the new column holds the returned list of floats.
df["embedding"] = df["affect"].map(embed)

: 

In [None]:
# Generate a 2D projection of the word embeddings using t-SNE.

from sklearn.manifold import TSNE

# Parameters chosen solely because they make the graph look nice. The online
# documentation gives some advice on how to choose.
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
#
# t-SNE is stochastic, so `random_state` is pinned: without it every
# Restart & Run All produces a different layout and a different saved figure.
tsne = TSNE(perplexity=5, learning_rate=100, random_state=0).fit_transform(
    np.array(df["embedding"].tolist())
)

# Project the (n_affects, 2) array back into columns in the dataframe. The
# array columns are assigned positionally, so the result does not depend on the
# dataframe's index labels lining up with a fresh `pd.Series`.
df["x_tsne"] = tsne[:, 0]
df["y_tsne"] = tsne[:, 1]

: 

In [None]:
# Generate quadrant labels for each affect. We'll color the t-SNE embeddings by
# which quadrant they're from.


def classify_quadrant(degrees: float) -> str:
    """
    Map a circumplex angle to a description of the quadrant it falls in.

    The angle is first wrapped into [0, 360), so negative angles and angles of
    a full turn or more are classified by their equivalent position (for
    example -45 is treated as 315, and 360 as 0).

    Args:
        degrees: Position on the circumplex in degrees, measured from the
            positive-valence axis.

    Returns:
        One of "Positive and Active" (0-90), "Negative and Active" (90-180],
        "Negative and Passive" (180-270], or "Positive and Passive" otherwise.
        Quadrant boundaries belong to the lower quadrant, as before.
    """
    # Python's `%` returns a non-negative result for a positive modulus, so
    # this handles negative inputs as well as inputs >= 360.
    wrapped = degrees % 360

    if wrapped <= 90:
        return "Positive and Active"
    elif wrapped <= 180:
        return "Negative and Active"
    elif wrapped <= 270:
        return "Negative and Passive"
    else:
        return "Positive and Passive"


# Label every affect with the quadrant its circumplex angle falls in.
df["quadrant"] = df["degrees"].map(classify_quadrant)

# Graph the t-SNE projection. The selection lets us filter arrows by clicking
# the quadrants in the legend.

# Point selection keyed on the `quadrant` field and bound to the legend.
selection = alt.selection_point(fields=["quadrant"], bind="legend")

tsne_affects = (
    alt.Chart(
        df[["affect", "quadrant", "degrees", "x_tsne", "y_tsne"]],
        title="t-SNE Embeddings with Circumplex Angles",
    )
    # Each affect is drawn as a filled arrow so that it can carry an angle.
    .mark_point(shape="arrow", filled=True, size=300)
    .encode(
        alt.X("x_tsne").axis(None),
        alt.Y("y_tsne").axis(None),
        alt.Tooltip("affect"),
        alt.Color("quadrant").title("Circumplex Quadrant"),
        # NOTE(review): the offset, descending domain [-270, -630] presumably
        # converts circumplex degrees into the rotation convention used for
        # the arrow mark -- confirm visually against the rendered chart.
        alt.Angle("degrees").scale(domain=[-270, -270 + -360]),
        # Arrows matching the selection are fully opaque; the rest are dimmed.
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
    )
    .add_params(selection)
    .configure_view(stroke=None)
    .configure_axis(grid=False)
    .interactive()
)

# Write the chart to disk and display it as the cell's output.
render_and_save(tsne_affects, "figures/tsne.json")

: 

In [None]:
# As a sanity check, we'll try to fit a linear model that predicts valence and 
# arousal directly from the embeddings. Since we've got so many dimensions and 
# only 28 data points, this should massively overfit.

from sklearn.linear_model import LinearRegression

# Design matrix with one row per affect and one column per embedding
# dimension, plus the two regression targets.
E = np.array([row.embedding for row in df.itertuples()])
x = df["valence"].to_numpy()
y = df["arousal"].to_numpy()

valence_lr = LinearRegression().fit(E, x)
arousal_lr = LinearRegression().fit(E, y)

# These scores report R^2 -- a 1.0 is a perfect fit. R^2 is a floating-point
# quantity, so compare with a tolerance: an exact `== 1.0` can fail on
# round-off alone even when the model interpolates the training data.
assert np.isclose(valence_lr.score(E, x), 1.0), "valence model should overfit"
assert np.isclose(arousal_lr.score(E, y), 1.0), "arousal model should overfit"

: 

In [None]:
# Instead of using a direct linear regression, we'll regularize by
# simultaneously attempting to minimize the L2 norm of the parameters -- aka, a
# Ridge regression.

from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error
from math import sqrt

# The only parameter we need to consider is `alpha`, which controls the
# strength of the regularization. Set it to `0.0` and we end up with
# `LinearRegression` again.

# There are a couple of common ways to pick the value of `alpha`:
# 1. Pick the largest `alpha` that doesn't increase mean-squared error. This
# strikes a nice balance between bias and variance.
# 2. Cross-validation -- and since we don't have much data, we'll do
# leave-one-out. Every candidate `alpha` is scored on *all* leave-one-out
# folds and the scores are averaged, so candidates are compared on the same
# held-out points. (Pairing each candidate with a single fold would compare
# them on different points, which says more about the point than the alpha.)

alpha = 1.0
mse = float("inf")

# Candidate regularization strengths: 0.25, 0.50, ..., 7.00.
alphas = [0.25 + i * 0.25 for i in range(0, 28)]

for alpha_loo in alphas:
    fold_errors = []

    for train_index, test_index in LeaveOneOut().split(E):
        E_train, E_test = E[train_index], E[test_index]
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        valence_loo = Ridge(alpha=alpha_loo).fit(E_train, x_train)
        arousal_loo = Ridge(alpha=alpha_loo).fit(E_train, y_train)

        # Joint error over both targets for the single held-out affect.
        fold_errors.append(
            mean_squared_error(
                [x_test, y_test],
                [valence_loo.predict(E_test), arousal_loo.predict(E_test)],
            )
        )

    loo_mse = float(np.mean(fold_errors))

    # `<=` so that ties go to the larger (more regularized) alpha.
    if loo_mse <= mse:
        alpha, mse = alpha_loo, loo_mse

print("Alpha: {:0.2f}".format(alpha))

# Refit on the full data set with the selected regularization strength.
valence_ridge = Ridge(alpha=alpha).fit(E, x)
arousal_ridge = Ridge(alpha=alpha).fit(E, y)

# With regularization the models should no longer interpolate the data.
assert valence_ridge.score(E, x) != 1.0
assert arousal_ridge.score(E, y) != 1.0

print(
    "Valence R²: {:0.2f}, Arousal R²: {:0.2f}".format(
        valence_ridge.score(E, x), arousal_ridge.score(E, y)
    )
)

# Store what the two Ridge models predict for every affect, then measure the
# straight-line distance between each prediction and the affect's original
# circumplex position.

df["valence_ridge"] = [
    valence_ridge.predict(np.array(vector).reshape(1, -1))[0]
    for vector in df["embedding"]
]
df["arousal_ridge"] = [
    arousal_ridge.predict(np.array(vector).reshape(1, -1))[0]
    for vector in df["embedding"]
]
df["displacement"] = [
    sqrt((valence - valence_hat) ** 2 + (arousal - arousal_hat) ** 2)
    for valence, valence_hat, arousal, arousal_hat in zip(
        df["valence"], df["valence_ridge"], df["arousal"], df["arousal_ridge"]
    )
]

: 

In [None]:
# Point selection keyed on `affect`; it drives the opacity of all three layers
# defined below.
selection = alt.selection_point(fields=["affect"])

# Only the columns the chart needs (notably, not the embedding vectors).
trimmed_df = df[
    ["affect", "displacement", "valence_ridge", "arousal_ridge", "valence", "arousal"]
]

# Layer 1: the Ridge-predicted position of every affect, colored by how far
# the prediction landed from the original position. Points outside the
# selection are dimmed to 0.1 opacity.
base = (
    alt.Chart(trimmed_df, title="Circumplex Ridge Regression")
    .mark_point(filled=True)
    .encode(
        alt.X("valence_ridge").axis(None).scale(domain=[-1.2, 1.2]),
        alt.Y("arousal_ridge").axis(None).scale(domain=[-1.2, 1.2]),
        alt.Color("displacement:Q").legend(None).scale(scheme="turbo"),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
        tooltip=[
            alt.Tooltip("affect"),
            alt.Tooltip("valence_ridge", title="valence", format="0.2f"),
            alt.Tooltip("arousal_ridge", title="arousal", format="0.2f"),
            alt.Tooltip("displacement:Q", format="0.2f"),
        ],
    )
    .add_params(selection)
)

# Layer 2: the original circumplex position, drawn as an unfilled point.
# `empty=False` together with the 0.0 fallback opacity keeps it invisible
# unless its affect is selected.
ghost = (
    alt.Chart(trimmed_df)
    .mark_point()
    .encode(
        alt.X("valence").axis(None),
        alt.Y("arousal").axis(None),
        alt.Color("displacement:Q").legend(None).scale(scheme="turbo"),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.0), empty=False),
    )
)

# Layer 3: a dotted rule joining the original position to the predicted one,
# visible under the same condition as the ghost point.
# NOTE(review): unlike the other two layers, the color encoding here has no
# explicit `:Q` type and no `.legend(None)` -- confirm whether that difference
# is intentional.
trails = (
    alt.Chart(trimmed_df)
    .mark_rule(strokeDash=(2, 4), strokeWidth=2, strokeCap="round")
    .encode(
        alt.X("valence"),
        alt.X2("valence_ridge"),
        alt.Y("arousal"),
        alt.Y2("arousal_ridge"),
        alt.Color("displacement").scale(scheme="turbo"),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.0), empty=False),
    )
)

# Stack the three layers, add the circumplex, save to disk and display.
render_and_save(add_circumplex(base + ghost + trails), "figures/ridge.json")

: 

In [None]:
# JSON is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's locale default.
with open("data/hoffman.json", "r", encoding="utf-8") as f:
    hoffman_raw = json.load(f)

# Flatten the nested {"categories": [{"category": ..., "entries": [...]}]}
# structure into one (category, affect) pair per entry.
hoffman_affects = [
    (category["category"], entry)
    for category in hoffman_raw["categories"]
    for entry in category["entries"]
]

hoffman = pd.DataFrame(hoffman_affects, columns=["category", "affect"])

# Embed each affect, then place it on the circumplex using the Ridge models
# fitted earlier in the notebook.
hoffman["embedding"] = hoffman["affect"].map(embed)
hoffman["valence"] = hoffman["embedding"].map(
    lambda e: valence_ridge.predict(np.array(e).reshape(1, -1))[0]
)
hoffman["arousal"] = hoffman["embedding"].map(
    lambda e: arousal_ridge.predict(np.array(e).reshape(1, -1))[0]
)

: 

In [None]:
# Point selection keyed on `category` and bound to the legend.
selection = alt.selection_point(fields=["category"], bind="legend")

# Scatter plot of the Ridge-predicted (valence, arousal) position for every
# affect in the `hoffman` dataframe, colored by its category. Points outside
# the selection are dimmed to 0.1 opacity.
base = (
    alt.Chart(
        hoffman[["affect", "valence", "arousal", "category"]],
        title="Circumplex Embedding",
    )
    .mark_point()
    .encode(
        alt.X("valence:Q").axis(None).scale(domain=[-1.2, 1.2]),
        alt.Y("arousal:Q").axis(None).scale(domain=[-1.2, 1.2]),
        alt.Color("category").title("H.I. Category"),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
        tooltip=[
            alt.Tooltip("affect"),
            alt.Tooltip("valence:Q", format="0.2f"),
            alt.Tooltip("arousal:Q", format="0.2f"),
        ],
    )
    .add_params(selection)
)

# Add the circumplex, save to disk and display as the cell's output.
render_and_save(add_circumplex(base), "figures/hoffman.json")

: 