In [3]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

import os

# Read the key from the environment so no credential is ever saved in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Chat model handed to iText2KG further down in the notebook.
openai_llm_model = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embedding model handed to iText2KG alongside the chat model.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)

# Loading the datasets

In [4]:
import pandas as pd
import ast

def recover_list_dfs(df):
    for col in df.columns:
        if df[col].dtype == object:  # Ensure we only process object (string) columns
            first_valid_index = df[col].first_valid_index()
            if first_valid_index is not None and isinstance(df[col].loc[first_valid_index], str):
                first_value = df[col].loc[first_valid_index].strip()  # Strip any whitespace
                if first_value.startswith("[") and first_value.endswith("]"):  # Ensure it's a list format
                    def safe_eval(x):
                        try:
                            return ast.literal_eval(x) if isinstance(x, str) else x
                        except (SyntaxError, ValueError) as e:
                            print(f"Skipping invalid value in column '{col}': {x} -> {e}")
                            return None  # Return None for problematic values
                    df[col] = df[col].apply(safe_eval)
                    
    return df
    

# Read each evaluation workbook and decode its stringified-list columns.
df_news = pd.read_excel(
    "../datasets/news/df_news_all_llms_eval.xlsx"
).pipe(recover_list_dfs)
df_abstracts = pd.read_excel(
    "../datasets/abstracts/df_abstracts_all_llms_eval.xlsx"
).pipe(recover_list_dfs)

In [5]:
import ast
from itertools import chain

# Flatten the per-row factoid lists into one list. `recover_list_dfs` may
# already have decoded this column into Python lists (leaving None for cells it
# could not parse), and ast.literal_eval raises ValueError on non-string input,
# so only cells that are still strings are parsed and missing cells are skipped.
factoids = list(chain(*[
    ast.literal_eval(fact) if isinstance(fact, str) else fact
    for fact in df_news["factoids_ground_truth"]
    if isinstance(fact, (str, list))
]))

In [2]:
from itext2kg import iText2KG

  warn(


In [6]:
# Knowledge-graph builder wired to the OpenAI chat and embedding models above.
itext2kg = iText2KG(
    llm_model=openai_llm_model,
    embeddings_model=openai_embeddings_model,
)

In [8]:
from datetime import datetime
import time
import pandas as pd

def calculate_latency(
    factoids: list[str],
    step: int = 20,
    output_path="scalability_itext2kg.xlsx",
    build_graph=None,
):
    """Time graph construction on growing prefixes of ``factoids``.

    For ``i = 1, 1 + step, 1 + 2*step, ...`` (while ``i < len(factoids)``) the
    first ``i`` factoids are passed to the graph builder and the wall-clock
    duration is recorded. A failing build is retried once; if the retry also
    fails, the row is recorded with a ``None`` time and the error message.
    Because ``i`` stays strictly below ``len(factoids)``, the complete list is
    never measured.

    Args:
        factoids: Text sections to feed to the builder.
        step: Increment between successive prefix sizes (must be >= 1).
        output_path: Excel file rewritten after every iteration so partial
            results survive an interruption; pass ``None`` to skip saving.
        build_graph: Callable invoked as ``build_graph(sections=prefix)``.
            Defaults to the notebook-level ``itext2kg.build_graph``.

    Returns:
        pandas.DataFrame with the columns ``"number of factoids"`` and
        ``"itext2kg's execution time"``, plus ``"error"`` when any size failed
        on both attempts.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    def _timed_build(sections):
        # Resolve the default builder lazily so a missing global surfaces as a
        # per-iteration error (and is recorded) instead of failing up front.
        builder = itext2kg.build_graph if build_graph is None else build_graph
        start = time.perf_counter()
        builder(sections=sections)
        return time.perf_counter() - start

    results = []
    for i in range(1, len(factoids), step):
        prefix = factoids[:i]
        try:
            try:
                elapsed_time = _timed_build(prefix)
            except Exception as e:
                # Retry once after error
                print(f"Error processing {i} factoids on first attempt: {e}. Retrying...")
                elapsed_time = _timed_build(prefix)
            results.append({"number of factoids": i, "itext2kg's execution time": elapsed_time})
        except Exception as e_retry:
            # If retry fails, log the error and continue
            print(f"Error processing {i} factoids on retry: {e_retry}. Skipping...")
            results.append({"number of factoids": i, "itext2kg's execution time": None, "error": str(e_retry)})
        finally:
            # Save progress after each iteration (also runs on KeyboardInterrupt).
            if output_path is not None:
                pd.DataFrame(results).to_excel(output_path, index=False)
    return pd.DataFrame(results)

In [9]:
# Keep the benchmark table in a named variable: the run is slow and calls the
# OpenAI API, so it should not live only in the cell's Out[...] history.
latency_results = calculate_latency(factoids=factoids)
latency_results

[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'name': 'Copenhagen', 'label': 'Location'}, {'name': 'International Climate Summit', 'label': 'Event'}, {'name': 'World Leaders', 'label': 'Group'}, {'name': 'November 2, 2025', 'label': 'Date'}]}
[INFO] ------- Extracting Relations from the Document 1
{'relationships': [{'startNode': {'name': 'world leaders', 'label': 'Group'}, 'endNode': {'name': 'international climate summit', 'label': 'Event'}, 'name': 'attended'}, {'startNode': {'name': 'international climate summit', 'label': 'Event'}, 'endNode': {'name': 'copenhagen', 'label': 'Location'}, 'name': 'held in'}, {'startNode': {'name': 'international climate summit', 'label': 'Event'}, 'endNode': {'name': 'november 2, 2025', 'label': 'Date'}, 'name': 'occurred on'}]}
[INFO] Verification of invented entities
[INFO] ------- Extracting Entities from the Document 1
{'entities': [{'name': 'Copenhagen', 'label': 'Location'}, {'name': 'International Climate Summit', 'lab

KeyboardInterrupt: 