In [12]:
import os, sys
# Absolute path of the parent of the kernel's current working directory.
ROOT_DIR = os.path.abspath("..")
# Make modules under ROOT_DIR importable. The membership check keeps the cell
# idempotent: re-running it no longer piles up duplicate sys.path entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [13]:
import neo4j
import pandas as pd
from random_qa.config import InferenceConfig, EvaluationConfig
from random_qa.cli import init_env, run_inference, run_evaluation

In [14]:
# NOTE(review): init_env is imported from random_qa.cli; presumably it prepares
# the environment (e.g. settings/credentials) used by the cells below — confirm.
init_env()

In [23]:
# Hand-written sample: a natural-language question plus its ground-truth Cypher query.
question = "How many tiles does a cell cover on average?"
query = """
MATCH (c:Cell)<-[:COVERED_BY]-(t:Tile)
WITH c, COUNT(t) AS count
RETURN avg(count) AS meanTileCount
"""

# Run the ground-truth query against the local Neo4j instance; both the driver
# and the session are closed when the combined `with` block exits.
with neo4j.GraphDatabase.driver("bolt://localhost:7687") as driver, driver.session() as session:
    query_result = session.run(query).data()

# The expected answer is the single aggregated value, stored as a string.
answer = str(query_result[0]["meanTileCount"])

# One-row frame of ground-truth samples, indexed by sample id.
sample_columns = ("sample_id", "question", "query", "query_result", "answer")
sample_row = ("9000_0", question, query, query_result, answer)
df_samples = pd.DataFrame([sample_row], columns=sample_columns).set_index("sample_id")

In [24]:
df_results = await run_inference(df_samples, InferenceConfig(scenario="open-book", query_llm="gpt-4o", answer_llm="gpt-4o"))

print("Generated query:")
print(df_results.iloc[0].query)
print(f"\nGenerated answer: {df_results.iloc[0].answer}")

  0%|          | 0/1 [00:00<?, ?it/s]

Generated query:
MATCH (t:Tile)-[:COVERED_BY]->(c:Cell)
WITH c, COUNT(t) AS tileCount
RETURN AVG(tileCount) AS averageTilesPerCell

Generated answer: 62.32 tiles


In [27]:
df_metrics = await run_evaluation(df_samples, df_results, EvaluationConfig(eval_llm="gpt-4o"))
df_metrics

print(f"Predicted query result: {df_results.iloc[0].query_result}")
print(f"Expected query result: {df_samples.iloc[0].query_result}")
print(f"Query recall: {df_metrics.iloc[0].query_recall}")
print(f"Query precision: {df_metrics.iloc[0].query_precision}")
print(f"Query F1-score: {df_metrics.iloc[0].query_f1}")

print(f"\nPredicted answer: {df_results.iloc[0].answer}")
print(f"Expected answer: {df_samples.iloc[0].answer}")
print(f"Answer EM: {df_metrics.iloc[0].answer_exact_match}")
print(f"Answer KWR: {df_metrics.iloc[0].answer_keyword_recall}")
print(f"Answer R_BERT: {df_metrics.iloc[0].answer_bert_score_recall}")
print(f"Answer P_BERT: {df_metrics.iloc[0].answer_bert_score_precision}")
print(f"Answer F_BERT: {df_metrics.iloc[0].answer_bert_score_f1}")
print(f"Answer CG: {df_metrics.iloc[0].answer_correctness['score']}")
print(f"Justification: {df_metrics.iloc[0].answer_correctness['justification']}")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicted query result: [{'averageTilesPerCell': 62.31545375471149}]
Expected query result: [{'meanTileCount': 62.31545375471149}]
Query recall: 1.0
Query precision: 1.0
Query F1-score: 1.0

Predicted answer: 62.32 tiles
Expected answer: 62.31545375471149
Answer EM: 0.0
Answer KWR: 0.0
Answer R_BERT: 0.13323578238487244
Answer P_BERT: 0.03035748191177845
Answer F_BERT: 0.08321505784988403
Answer CG: 4
Justification: The generated answer of 62.32 tiles is very close to the ground truth answer of 62.31545375471149, with a difference of only 0.00454624528851. This is a minor rounding difference and does not significantly alter the factual correctness.
