In [2]:
%load_ext autoreload
%autoreload 2

 
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the local `atom` package importable from this notebook — TODO confirm.
sys.path.append("..")
import pandas as pd

# Importing the LLM models and the necessary modules

In [3]:
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Never commit a real key to the notebook: take it from the environment.
# The "##" placeholder is kept only as the fallback so the cell behaves as
# before when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "##")

# Chat model shared by every component below; temperature=0 is requested for
# every call, and up to two retries are allowed per request.
openai_llm_model = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embedding model paired with the chat model above.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)


In [4]:
from atom.utils import LangchainOutputParser

# Output parser wired to the chat and embedding models configured earlier.
lg = LangchainOutputParser(
    llm_model=openai_llm_model,
    embeddings_model=openai_embeddings_model,
)

# Loading the datasets

In [5]:
import pandas as pd
import ast

def _parse_list_cell(value, col):
    """Parse one cell of a column that holds stringified Python literals.

    Strings are evaluated with ``ast.literal_eval``; any non-string value
    (already-parsed objects, missing-value markers) is returned untouched.
    A string that cannot be evaluated is reported on stdout and replaced
    by ``None``.
    """
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    # literal_eval reports malformed input with more than SyntaxError/ValueError:
    # "{[1]: 2}" raises TypeError (unhashable key) and very deeply nested input
    # raises RecursionError. All of them mean "not a usable literal" here.
    except (SyntaxError, ValueError, TypeError, RecursionError) as e:
        print(f"Skipping invalid value in column '{col}': {value} -> {e}")
        return None


def recover_list_dfs(df):
    """Turn stringified list columns of ``df`` back into real Python objects.

    A text column is treated as a "list column" when its first non-missing
    value is a string that, once stripped, starts with ``[`` and ends with
    ``]``. Every string in such a column is then parsed with
    ``ast.literal_eval``; unparsable strings are logged and become ``None``.

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        The same ``df`` object, with the detected columns replaced.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Text may be stored as plain `object` or as a pandas StringDtype
        # (e.g. when string inference is enabled); accept both.
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue

        first_valid_index = df[col].first_valid_index()
        if first_valid_index is None:
            continue  # column is entirely missing

        sample = df[col].loc[first_valid_index]
        if not isinstance(sample, str):
            continue

        sample = sample.strip()
        # Only the first valid value is inspected to decide whether the whole
        # column is list-formatted.
        if sample.startswith("[") and sample.endswith("]"):
            df[col] = df[col].apply(_parse_list_cell, args=(col,))

    return df
    

# Read both evaluation workbooks and revive their stringified list columns.
df_news = recover_list_dfs(
    pd.read_excel("../datasets/news/df_news_all_llms_eval.xlsx")
)
df_abstracts = recover_list_dfs(
    pd.read_excel("../datasets/abstracts/df_abstracts_all_llms_eval.xlsx")
)

In [6]:
import ast
from itertools import chain

# Flatten the per-row factoid lists into one list.
# `recover_list_dfs` may already have parsed this column into real lists, and
# `ast.literal_eval` raises ValueError on anything that is not a string, so only
# cells that are still strings are parsed. Missing cells (None/NaN) are skipped.
factoids = list(
    chain.from_iterable(
        ast.literal_eval(cell) if isinstance(cell, str) else cell
        for cell in df_news["factoids_ground_truth"]
        if isinstance(cell, (str, list, tuple))
    )
)

In [6]:
# Total number of ground-truth factoids pooled from the news dataset.
len(factoids)

387

# Evaluating the latency

In [7]:
from atom import Atom

# Graph builder under test, sharing the chat and embedding models set up earlier.
atom = Atom(
    llm_model=openai_llm_model,
    embeddings_model=openai_embeddings_model,
)

In [18]:
from datetime import datetime
import time

async def calculate_latency(
    factoids: list[str],
    step: int = 20,
    output_path: str = "scalability_atom.xlsx",
) -> pd.DataFrame:
    """Time ``atom.build_graph`` on growing prefixes of ``factoids``.

    Prefix sizes are ``1, 1 + step, 1 + 2*step, ...`` and stay strictly below
    ``len(factoids)``. Each size is attempted once and, on failure, retried
    once; if the retry also fails the row records ``None`` as the time and the
    retry's error message. Results gathered so far are rewritten to
    ``output_path`` after every size, so an interrupted run keeps its progress.

    Args:
        factoids: Atomic facts to feed to the graph builder.
        step: Increment between consecutive prefix sizes.
        output_path: Excel file that receives the running results.

    Returns:
        DataFrame with one row per prefix size, holding the columns
        "number of factoids", "atom's execution time" and, for failed sizes,
        "error".
    """
    current_date = str(datetime.now().date())

    async def _timed_build(batch: list[str]) -> float:
        """Build a graph from ``batch`` and return the elapsed seconds."""
        start = time.perf_counter()
        await atom.build_graph(atomic_facts=batch, obs_timestamp=current_date)
        return time.perf_counter() - start

    results = []
    for i in range(1, len(factoids), step):
        batch = factoids[:i]
        try:
            try:
                elapsed_time = await _timed_build(batch)
            except Exception as e:
                # Retry once after error; only the retry's duration is recorded.
                print(f"Error processing {i} factoids on first attempt: {e}. Retrying...")
                elapsed_time = await _timed_build(batch)
            results.append({"number of factoids": i, "atom's execution time": elapsed_time})
        except Exception as e_retry:
            # If retry fails, log the error and continue with the next size.
            print(f"Error processing {i} factoids on retry: {e_retry}. Skipping...")
            results.append({"number of factoids": i, "atom's execution time": None, "error": str(e_retry)})
        finally:
            # Save progress after each iteration
            pd.DataFrame(results).to_excel(output_path, index=False)
    return pd.DataFrame(results)

In [None]:
await calculate_latency(factoids=factoids)