In [1]:
from openai.embeddings_utils import cosine_similarity
from evalio.semantics import visualize_embeddings, get_embedding, get_nearest_neighbors
from evalio.util import convert_string_to_numpy_array
import pandas as pd

from sklearn.metrics import classification_report, PrecisionRecallDisplay

In [None]:
# Column names used by the analysis cells in this notebook.
SCORE_COLUMN_NAME = 'Score'
EMBEDDING_COLUMN_NAME = 'embedding'

# Load the food-review sample; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='data/fine_food_reviews_with_embeddings_1k.csv')

# Visualize embeddings

In [None]:
# Plot the review embeddings, handing the helper the score column as well.
visualize_embeddings(
    df,
    embeddings_column=EMBEDDING_COLUMN_NAME,
    score_column=SCORE_COLUMN_NAME,
)

# 2-class Precision-Recall curve (review sentiment)

In [None]:
# Run the embedding column through the project's string -> numpy converter
# (presumably the CSV stores each vector as text -- confirm against the helper).
df[EMBEDDING_COLUMN_NAME] = convert_string_to_numpy_array(df[EMBEDDING_COLUMN_NAME])

# Convert the 5-star rating to a binary sentiment: drop 3-star reviews, then
# label 1-2 stars "negative" and 4-5 stars "positive".
# .copy() makes the filtered frame independent of the original, so adding the
# "sentiment" column below does not trigger pandas' SettingWithCopyWarning.
df = df[df[SCORE_COLUMN_NAME] != 3].copy()
df["sentiment"] = df[SCORE_COLUMN_NAME].replace(
    {1: "negative", 2: "negative", 4: "positive", 5: "positive"}
)

In [None]:
def evaluate_embeddings_approach(labels=('negative', 'positive'), data=None):
    """Evaluate zero-shot sentiment classification based on label embeddings.

    Each review is scored as its cosine similarity to the second label's
    embedding minus its cosine similarity to the first label's embedding.
    A score above zero is predicted as 'positive', anything else as
    'negative'. The classification report is printed and a precision-recall
    curve is drawn from the raw scores.

    Args:
        labels: Exactly two label texts, ordered (negative, positive).
        data: DataFrame with an "embedding" column and a "sentiment" column.
            Defaults to the notebook-level ``df`` so existing calls keep
            working unchanged.

    Raises:
        ValueError: If ``labels`` does not contain exactly two entries.
    """
    labels = list(labels)
    if len(labels) != 2:
        # Only two label embeddings are ever used; fail loudly instead of
        # silently ignoring extras or raising a bare IndexError.
        raise ValueError(
            f"labels must contain exactly two entries (negative, positive); got {len(labels)}"
        )

    # Fall back to the notebook's global frame only when none is supplied.
    reviews = df if data is None else data

    negative_embedding, positive_embedding = (get_embedding(label) for label in labels)

    def label_score(review_embedding):
        # Positive when the review is closer to the positive label.
        return cosine_similarity(review_embedding, positive_embedding) - cosine_similarity(
            review_embedding, negative_embedding
        )

    scores = reviews["embedding"].apply(label_score)
    preds = scores.apply(lambda score: 'positive' if score > 0 else 'negative')

    report = classification_report(reviews["sentiment"], preds)
    print(report)

    # Named pr_display so the local does not shadow IPython's display().
    pr_display = PrecisionRecallDisplay.from_predictions(reviews["sentiment"], scores, pos_label='positive')
    _ = pr_display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Baseline run: the bare class names serve as the label texts.
evaluate_embeddings_approach(
    labels=['negative', 'positive'],
)

In [None]:
# Same evaluation, this time with full-sentence label descriptions.
evaluate_embeddings_approach(
    labels=[
        'An Amazon review with a negative sentiment.',
        'An Amazon review with a positive sentiment.',
    ],
)

# Clustering

In [None]:
# create_clusters_from_embeddings stays imported although this cell no longer
# references it; the commented-out alternative call that used it was removed.
from evalio.semantics import create_clusters_from_embeddings, create_clusters_from_column

CLUSTER_COLUMN_NAME = 'Cluster'
N_CLUSTERS = 4

# Store one cluster assignment per review, using the configured column names
# rather than repeating the string literals.
df[CLUSTER_COLUMN_NAME] = create_clusters_from_column(df[EMBEDDING_COLUMN_NAME], n_clusters=N_CLUSTERS)

In [None]:
from evalio.semantics import visualize_embedding_clusters

# Plot the clustered embeddings; column names come from the notebook constants.
visualize_embedding_clusters(
    df,
    embeddings_column=EMBEDDING_COLUMN_NAME,
    target_cluster_column=CLUSTER_COLUMN_NAME,
    score_column=SCORE_COLUMN_NAME,
    n_clusters=N_CLUSTERS,
)

In [None]:
from evalio.semantics import classify_clusters_with_llm

# Hand the clustered reviews to the LLM-based cluster classifier.
classify_clusters_with_llm(
    df,
    cluster_column=CLUSTER_COLUMN_NAME,
    n_clusters=N_CLUSTERS,
)

# Semantic search

In [None]:
from evalio.semantics import search_reviews

# Query the reviews with free text, asking the helper for top_n=3 results.
results = search_reviews(
    df,
    "delicious beans",
    embeddings_column=EMBEDDING_COLUMN_NAME,
    top_n=3,
)

In [None]:
news_df = pd.read_csv(filepath_or_buffer='data/AG_news_samples.csv')

# Preview the first three rows: a blank line, then title, description and label.
for _idx, row in news_df.head(3).iterrows():
    print(
        "",
        f"Title: {row['title']}",
        f"Description: {row['description']}",
        f"Label: {row['label']}",
        sep="\n",
    )

In [None]:
# Plain Python list of every value in the "description" column.
news_descriptions = news_df["description"].tolist()

# Show only a short preview; echoing the whole list floods the cell output.
news_descriptions[:5]

In [None]:
# Sanity check: the first description is the same whether read from the
# underlying array, from the Series, or from the extracted list.
# .iloc[0] is positional, so the check does not depend on the index having a
# label 0 (plain [0] is a label lookup).
news_df["description"].values[0] == news_df["description"].iloc[0] == news_descriptions[0]

In [None]:
# Embed the first news description.
first_news_description_embedding = get_embedding(news_descriptions[0])

# Preview the leading components instead of echoing the full vector
# (assumes the embedding is a sliceable sequence/array -- confirm in evalio).
first_news_description_embedding[:5]

In [None]:
# Nearest-neighbour lookup over the description list, called with index 0 and
# k_nearest_neighbors=5. Presumably this returns the 5 descriptions closest in
# embedding space to the first one -- TODO confirm against evalio.semantics.
get_nearest_neighbors(news_descriptions, 0, k_nearest_neighbors=5)

In [None]:
# Inspect get_embedding's cache statistics. NOTE(review): cache_info() is the
# accessor exposed by functools.lru_cache-style wrappers -- confirm that
# get_embedding is decorated that way in evalio.semantics.
get_embedding.cache_info()