In [2]:
%load_ext autoreload
%autoreload 2

 
import sys
sys.path.append("..")
import pandas as pd

# Importing the LLM models and the necessary modules

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

openai_api_key = "##"

openai_llm_model = ChatOpenAI(
    api_key = openai_api_key,
    model="o3-mini",
    #temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

openai_embeddings_model = OpenAIEmbeddings(
    api_key = openai_api_key ,
    model="text-embedding-3-large",
)


In [5]:
from atom.utils import LangchainOutputParser

lg = LangchainOutputParser(llm_model=openai_llm_model, embeddings_model=openai_embeddings_model)

# Loading the datasets

In [8]:
import pandas as pd

df_result = pd.read_excel("similar_entities.xlsx")

In [9]:
df_result["label_embeddings"] = list(await lg.calculate_embeddings(text=list(df_result['label'])))
df_result["name_embeddings"] = list(await lg.calculate_embeddings(text=list(df_result['name'])))
df_result["label2_embeddings"] = list(await lg.calculate_embeddings(text=list(df_result['label2'])))
df_result["name2_embeddings"] = list(await lg.calculate_embeddings(text=list(df_result['name2'])))

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_rowwise_similarity(df, lambda_, beta):
    """
    Compute cosine similarity for each row using the formula:
    lambda * name_embeddings + beta * label_embeddings.
    
    The similarity is computed between:
    (lambda * name_embeddings + beta * label_embeddings) and 
    (lambda * name2_embeddings + beta * label2_embeddings).

    Parameters:
    df (pd.DataFrame): Input dataframe with embedding columns.
    lambda_ (float): Weight for name embeddings.
    beta (float): Weight for label embeddings.

    Returns:
    pd.DataFrame: The input dataframe with an additional column 'cosine_similarity'.
    """

    def compute_similarity(row):
        # Extract embeddings for the current row
        emb1 = lambda_ * np.array(row['name_embeddings']) + beta * np.array(row['label_embeddings'])
        emb2 = lambda_ * np.array(row['name2_embeddings']) + beta * np.array(row['label2_embeddings'])
        
        # Compute cosine similarity
        similarity = cosine_similarity([emb1], [emb2])[0][0]
        return similarity

    # Apply function to each row and store the result
    df['cosine_similarity'] = df.apply(compute_similarity, axis=1)
    
    return df["cosine_similarity"].mean()

# Example usage:
# df = compute_rowwise_similarity(df, lambda_=0.5, beta=0.5)

In [24]:
def grid_search_lambda(df, delta=0.1):
    """
    Perform grid search over lambda values to find the best lambda that maximizes cosine similarity.
    
    Parameters:
    df (pd.DataFrame): The input dataframe.
    delta (float): Step size for lambda search (default 0.1).
    
    Returns:
    tuple: Best lambda value and the corresponding max cosine similarity.
    """
    best_lambda = None
    max_similarity = -1

    # Search over lambda values from 0 to 1 with step size delta
    lambda_values = np.arange(0, 1.1, delta)

    results = []
    
    for lambda_ in lambda_values:
        beta = 1 - lambda_
        avg_similarity = compute_rowwise_similarity(df, lambda_, beta)
        results.append((lambda_, avg_similarity))
        
        if avg_similarity > max_similarity:
            max_similarity = avg_similarity
            best_lambda = lambda_

    return best_lambda, max_similarity, results


In [26]:
grid_search_lambda(df=df_result, delta=0.05)

(0.8,
 0.7299654380635093,
 [(0.0, 0.4221890359101122),
  (0.05, 0.4404743611295795),
  (0.1, 0.46084377738337806),
  (0.15000000000000002, 0.4832447048951166),
  (0.2, 0.5075028826554425),
  (0.25, 0.5332945762841813),
  (0.30000000000000004, 0.5601289623452876),
  (0.35000000000000003, 0.5873495373517935),
  (0.4, 0.6141624231650046),
  (0.45, 0.6396947374186861),
  (0.5, 0.6630779630964563),
  (0.55, 0.6835420188460816),
  (0.6000000000000001, 0.7004998042446285),
  (0.65, 0.7136030360717724),
  (0.7000000000000001, 0.722758475944278),
  (0.75, 0.7281054430509765),
  (0.8, 0.7299654380635093),
  (0.8500000000000001, 0.7287791515671717),
  (0.9, 0.7250448586993008),
  (0.9500000000000001, 0.7192674908976456),
  (1.0, 0.7119223179046161),
  (1.05, 0.7034330257978003)])

In [None]:
def compute_similarity(row, lambda_=0.8):
        # Extract embeddings for the current row
        emb1 = lambda_ * np.array(row['name_embeddings']) + (1-lambda_) * np.array(row['label_embeddings'])
        emb2 = lambda_ * np.array(row['name2_embeddings']) + (1-lambda_) * np.array(row['label2_embeddings'])
        
        # Compute cosine similarity
        similarity = cosine_similarity([emb1], [emb2])[0][0]
        return similarity