## Setup

In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer

%load_ext autoreload
%autoreload 2
from utils import utils
from utils import graphs

In [2]:
# Load the sentiment table and keep only the rows that have a sentiment
# label, renumbering the index from 0 afterwards.
texts = (
    pd.read_feather("../../06_explorer/explorer/data/sentiments.arrow")
    .dropna(subset=["sentiment"])
    .reset_index(drop=True)
)

# When enabled, overwrite the stored sentiment label for every row whose
# larger "positive"/"negative" value exceeds 1/3.
spoof = True
if spoof:
    # Rows where the larger of the two columns clears the 1/3 threshold.
    exceeds_threshold = texts[["positive", "negative"]].max(axis="columns") > 1 / 3
    # Label taken from whichever column is strictly greater; ties fall to
    # "negative" because the comparison is strict.
    dominant_label = np.where(
        texts["positive"] > texts["negative"], "positive", "negative"
    )
    # Rows below the threshold keep their existing label.
    texts["sentiment"] = np.where(
        exceeds_threshold, dominant_label, texts["sentiment"]
    )

### SBERT embeddings

In [3]:
# Reuse the embeddings cached in "tmp.npy" when available so that the texts
# do not have to be re-encoded on every run.
embeddings = np.load("tmp.npy") if os.path.isfile("tmp.npy") else None

# A cache written for a different version of `texts` would silently pair rows
# with the wrong vectors, so it is only trusted when it has one row per text;
# otherwise the embeddings are recomputed and the cache file is overwritten.
if embeddings is None or len(embeddings) != len(texts):
    os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid strange warnings
    model = SentenceTransformer("all-mpnet-base-v2")
    embeddings = model.encode(texts["text"].to_list())
    np.save("tmp.npy", embeddings)

## Plotting

### Embedding/clustering

#### Clustering based on the t-SNE embedding (of the SBERT embeddings)

In [4]:
# Build the t-SNE frame from the texts and their SBERT embeddings, then pass
# it straight to add_clustering with no further arguments.
df = utils.add_clustering(
    utils.get_tsne_embedding(texts, embeddings, "eaib", "")
)

In [5]:
# Plot the current `df`; as the last expression of the cell, whatever the
# call returns is rendered as the cell output.
graphs.embedding_scatter_plot(df)

#### Clustering based on the SBERT embeddings

In [6]:
# Build the t-SNE frame again, but this time also hand the raw SBERT
# embeddings and the texts to add_clustering (per the section heading, the
# clustering is meant to be based on the SBERT embeddings).
df = utils.add_clustering(
    utils.get_tsne_embedding(texts, embeddings, "eaib", ""),
    embeddings,
    texts,
)

In [7]:
# Plot the re-clustered `df`; as the last expression of the cell, whatever
# the call returns is rendered as the cell output.
graphs.embedding_scatter_plot(df)