In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from openai import OpenAI


  from tqdm.autonotebook import tqdm, trange


In [2]:
# Toy corpus: one short free-text opinion per row.
comments = pd.DataFrame(
    {
        "comments": [
            "I like apples and oranges.",
            "I like meat.",
            "I like cars.",
            "Bananas are my favorite fruit.",
            "I enjoy watching soccer games.",
            "Electric cars are the future of transportation.",
            "Pineapples are great on pizza.",
            "Basketball is intense.",
            "I think motorcycles are cool.",
            "Mangoes make delicious smoothies.",
            "I love running",
            "Trucks are very useful for moving heavy loads.",
            "Grapes are a perfect snack for summer.",
        ]
    }
)
# Every row starts with the same constant sentiment class of 0.
comments["class_sentiment"] = 0

# Topic labels (KPIs) stored as a single-column frame.
kpis = pd.DataFrame({"kpis": ["fruits", "sports", "engine"]})

In [3]:
class EmbeddingModel:
    """
    Local text embedder backed by a SentenceTransformer checkpoint
    (intended for "sentence-transformers/all-MiniLM-L6-v2").

    The sentence-embedding model and a tokenizer are both loaded from
    the same ``model_path`` when the object is constructed.
    """

    def __init__(self, model_path='MiniLM'):
        """
        :param model_path: location/identifier passed to both the
            SentenceTransformer and the AutoTokenizer loaders.
        """
        self.model_path = model_path
        self.model = SentenceTransformer(model_path)
        # Kept as a public attribute; not used by ``encode`` itself.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def encode(self, text):
        """
        Embed ``text`` by delegating to the wrapped model.

        :param text: input text to transform into an embedding.
        :return: the value returned by the wrapped model's ``encode``.
        """
        embedding = self.model.encode(text)
        return embedding

In [4]:
def cosine_similarity(vec_a, vec_b):
    """
    Calculate the cosine similarity between two vectors.

    Inputs may be NumPy arrays or plain sequences of numbers; they are
    converted with ``np.asarray`` without forcing a dtype, so the result
    keeps the precision of the inputs.

    :param vec_a: First vector.
    :param vec_b: Second vector.
    :return: Cosine similarity, or 0.0 when either vector has zero norm
        (the similarity is undefined there; returning 0.0 avoids a
        division by zero that would otherwise produce NaN).
    """
    vec_a = np.asarray(vec_a)
    vec_b = np.asarray(vec_b)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # A zero-length vector has no direction, so treat it as "not similar".
        return 0.0

    return np.dot(vec_a, vec_b) / denominator

In [5]:
# Load the local checkpoint, then embed the first column of `comments` row by row.
embedding_model = EmbeddingModel(model_path="models/MiniLM")
comments["embedding"] = comments.iloc[:, 0].apply(embedding_model.encode)

# Relevance threshold value; it is only defined here and is not read by this cell.
minimal_relevance = 0.35

In [6]:
# No key is written into the notebook: when `api_key` is omitted, the client
# reads it from the OPENAI_API_KEY environment variable and raises if it is unset.
client = OpenAI()

model = "text-embedding-3-large"

# Embed all comments with a single batched request instead of one request per row.
embedding_response = client.embeddings.create(
    input=comments.iloc[:, 0].tolist(), model=model
)
# Each returned item reports the position of its input; order by it before
# aligning the vectors with the frame's rows.
ordered_items = sorted(embedding_response.data, key=lambda item: item.index)
comments["embedding_gpt"] = pd.Series(
    [item.embedding for item in ordered_items], index=comments.index
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Score every comment against every KPI with both embedding models.
# Intended reading of the scores: a comment is relevant to a KPI when its
# {kpi}_relevance score is > 0.35 (local model) or its {kpi}_relevance_gpt
# score is > 0.25 (gpt model). The thresholds are not applied in this cell.

for kpi in kpis["kpis"]:
    # Embed the KPI label once per model.
    local_kpi_vector = embedding_model.encode(kpi)
    gpt_kpi_response = client.embeddings.create(input = [kpi], model=model)
    gpt_kpi_vector = gpt_kpi_response.data[0].embedding

    # Cosine similarity of each stored comment embedding to the KPI embedding.
    local_scores = comments["embedding"].apply(
        lambda comment_vector: cosine_similarity(comment_vector, local_kpi_vector)
    )
    comments[f"{kpi}_relevance"] = local_scores

    gpt_scores = comments["embedding_gpt"].apply(
        lambda comment_vector: cosine_similarity(comment_vector, gpt_kpi_vector)
    )
    comments[f"{kpi}_relevance_gpt"] = gpt_scores
    


In [11]:
# Preview the frame: at most its first 20 rows.
comments.head(n=20)

Unnamed: 0,comments,class_sentiment,embedding,embedding_gpt,fruits_relevance,fruits_relevance_gpt,sports_relevance,sports_relevance_gpt,engine_relevance,engine_relevance_gpt
0,I like apples and oranges.,0,"[-0.0047736308, -0.033712782, -0.029867908, 0....","[-0.02607107348740101, 0.01751878671348095, -0...",0.492001,0.372873,0.183973,0.111833,0.08706,0.056628
1,I like meat.,0,"[-0.034070875, -0.0248448, 0.00999197, 0.05328...","[-0.00010533343447605148, 0.009422408416867256...",0.280889,0.215225,0.208115,0.131746,0.243062,0.062176
2,I like cars.,0,"[-0.022753023, 0.01635055, 0.06078946, 0.02422...","[-0.013270105235278606, 0.022298555821180344, ...",0.198592,0.166178,0.311859,0.212242,0.383792,0.237702
3,Bananas are my favorite fruit.,0,"[-0.02827728, -0.054333355, 0.029286304, 0.034...","[0.01907418482005596, -0.010422163642942905, -...",0.64477,0.426732,0.10573,0.092829,0.084614,0.027123
4,I enjoy watching soccer games.,0,"[0.02653475, -0.05407322, -0.03101991, -0.0086...","[0.014220135286450386, 0.041234537959098816, -...",0.099971,0.073012,0.505962,0.314044,0.085004,0.052593
5,Electric cars are the future of transportation.,0,"[-0.04241642, 0.07156061, 0.046378553, 0.01690...","[-0.0064928061328828335, 0.0387466736137867, -...",0.039102,0.127313,0.141261,0.081846,0.278879,0.168049
6,Pineapples are great on pizza.,0,"[-0.03054588, -0.019156106, 0.03501304, 0.1002...","[0.007351840380579233, 0.03830129653215408, -0...",0.431365,0.291731,0.006918,0.084082,-0.003197,0.050985
7,Basketball is intense.,0,"[0.08103625, -0.03981275, 0.023760607, -0.0146...","[0.0413094237446785, -0.013469276949763298, -0...",0.12874,0.08909,0.498443,0.323362,0.10812,0.060419
8,I think motorcycles are cool.,0,"[-0.041573863, 0.03360039, 0.014504659, -0.038...","[0.0402224026620388, -0.003146461443975568, -0...",0.038827,0.090154,0.144001,0.142037,0.246448,0.143612
9,Mangoes make delicious smoothies.,0,"[-0.053552773, -0.060681816, 0.022325274, 0.06...","[0.022401113063097, 0.012005576863884926, -0.0...",0.476705,0.355605,-0.01248,0.073249,0.151816,0.01343
