In [7]:
import os
import json
import logging
from pathlib import Path
from dotenv import load_dotenv
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
from llm.openai_endpoints import query_openai_model
from llm.annotator_helpers import get_model_annotations
from utils.parsing_helpers import parse_yago_uri
from llm.ollama_endpoints import query_ollama_model

from llm.prompts import gen_qa_prompt, generate_question_evaluation_prompt, generate_answer_evaluation_prompt

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential



# Configure logging once for the whole notebook: INFO and above, bare message text only.
logging.basicConfig(format='%(message)s', level=logging.INFO)
# Logger shared by the cells that follow.
logger = logging.getLogger(__name__)

In [9]:
# Connectivity check: send a trivial prompt to the Ollama endpoint before running the pipeline.
# The helper returns a 2-tuple whose first item is the model's text reply (see the printed output).
resp = query_ollama_model("Are you up? reply in json")
print(resp)

('{"status": "up", "message": "I\'m ready to assist you!"}', None)


In [5]:
# Load the `preetam7/dynamic_kgqa` dataset through `datasets.load_dataset`.
logger.info("Loading dataset...")
dataset = load_dataset("preetam7/dynamic_kgqa")

# Materialise the test split as a plain list so records can be indexed directly.
test_data = [record for record in dataset['test']]
# Accumulator for the evaluated QA rows appended by the generation/evaluation cell.
results = list()

## Getting Started with the QA Generation and Evaluation Pipeline

This notebook serves as a **starter template** for our project. It demonstrates the core components of the pipeline and helps you get familiar with how everything fits together.

### Overview

- **Subgraph Processing**  
  The pipeline begins by taking a subgraph as input.

- **QA Pair Generation**  
  It uses placeholder models (GPT-4o via API, or a local Ollama model as in the generation cell of this notebook) to generate question-answer pairs along with supporting paths.

- **Evaluation**  
  These QA pairs are then evaluated using annotator models. Currently, we use:
  - Phi-3
  - Two versions of GPT (API-based)

### Model Flexibility

> **Note:** All models used here are placeholders.  
> We plan to replace them with relevant models from [Hugging Face](https://huggingface.co/) to support easier integration and reproducibility.

You are encouraged to start with the models mentioned in the paper, but the setup is flexible and can accommodate others as needed.

### Infrastructure Support

If you need more compute power, I can spin up a **larger VM with better GPUs** — just let me know.

### What You Should Do

- Explore the notebook and run through the workflow
- Get familiar with how each component fits into the overall pipeline
- Feel free to modify and test with different models
- Reach out with questions or suggestions!

---


In [18]:
# --- Settings for this cell ---------------------------------------------------
# Number of generated QA pairs to evaluate. Kept at 1 to match the previous
# hard-coded behaviour; set to None to evaluate every pair the model returns.
MAX_QA_PAIRS = 1
# The row schema has columns for exactly this many annotator ("judge") models.
NUM_JUDGES = 3
# Keys read from each judge's question-evaluation response.
QUESTION_EVAL_FIELDS = (
    'logical_structure_flag',
    'logical_structure_reasoning',
    'redundancy_flag',
    'redundancy_reasoning',
)
# Keys read from each judge's answer-evaluation response.
ANSWER_EVAL_FIELDS = (
    'answer_support_flag',
    'answer_support_reasoning',
    'answer_adequacy_flag',
    'answer_adequacy_reasoning',
)


def _judge_columns(judge_responses, field_names):
    """Flatten per-judge evaluation responses into `<field>_llm_judge<N>` columns.

    Args:
        judge_responses: Indexable collection with one mapping per judge, in
            judge order (index 0 is judge 1).
        field_names: Keys to copy out of every judge's mapping.

    Returns:
        dict: One entry per (judge, field), ordered judge by judge and, within
        a judge, in the order given by ``field_names``.

    Raises:
        IndexError: If fewer than ``NUM_JUDGES`` responses are available.
        KeyError: If a judge's response lacks one of ``field_names``.
    """
    return {
        f'{field}_llm_judge{judge_no}': judge_responses[judge_no - 1][field]
        for judge_no in range(1, NUM_JUDGES + 1)
        for field in field_names
    }


# Build the generation prompt from the first test record's subgraph and show it.
prompt = gen_qa_prompt(test_data[0]['subgraph'])
print(prompt)

# NOTE: rows are appended to the notebook-level `results` list, so re-running
# this cell without resetting `results` accumulates duplicate ids.
try:
    res, _ = query_ollama_model(prompt)
    # `res` is rebound to the parsed dict; later inspection cells read it.
    res = json.loads(res)
    qa_pairs = res['qa_pairs']
    if not qa_pairs:
        logger.warning("Model returned no QA pairs; nothing to evaluate.")

    for i, qa_pair in enumerate(qa_pairs[:MAX_QA_PAIRS]):
        question = qa_pair['question']
        answer = qa_pair['answer']
        supporting_facts = qa_pair['supporting_path']

        # Ask the annotator models to judge the question on its own.
        question_eval_prompt = generate_question_evaluation_prompt(question)
        question_eval_res = get_model_annotations(question_eval_prompt)
        print(question_eval_res)

        # Ask the annotator models to judge the answer against its supporting path.
        answer_eval_prompt = generate_answer_evaluation_prompt(question, answer, supporting_facts)
        answer_eval_res = get_model_annotations(answer_eval_prompt)
        print(answer_eval_res)

        # Create a row with the same leading fields as test_data records.
        answer_raw, answer_readable = parse_yago_uri(answer)
        row = {
            'id': i,
            'question': question,
            'answer': answer_raw,
            'answer_readable': answer_readable,
            # Answer string exactly as returned by the model; it is only a URI
            # when the model emitted one.
            'answer_uri': answer,
            'supporting_facts': json.dumps(supporting_facts),
            'supporting_facts_uri': '',  # Would need to extract from supporting facts if available
            # Question evaluation fields (judge 1..NUM_JUDGES)
            **_judge_columns(question_eval_res, QUESTION_EVAL_FIELDS),
            # Answer evaluation fields (judge 1..NUM_JUDGES)
            **_judge_columns(answer_eval_res, ANSWER_EVAL_FIELDS),
        }
        results.append(row)
except Exception:
    # Best-effort cell: keep the notebook running, but record the full
    # traceback instead of only the exception message.
    logger.exception("QA generation/evaluation failed")


You are an AI assistant tasked with generating question-answer pairs from knowledge graph triples. Your goal is to create natural, human-like questions and their corresponding answers based on the provided graph data.

Task Overview:
Generate **multi-hop, complex Q&A pairs** where the questions appear simple and natural but require reasoning across multiple connected relationships within the graph to infer the answer.

Guidelines for Generating Q&A Pairs:
1. **Question Design**:
- Questions should utilize multiple connected relationships in the graph, requiring multi-hop reasoning.
- Avoid single-hop or trivial questions directly derived from a single triple.
- The answer should be an entity or node in the graph.

2. **Multi-Hop Reasoning**:
- Use paths connecting entities indirectly through multiple relationships to infer answers.
- Questions should reflect meaningful and interesting connections within the graph.
- Aim for question with at least 4 hops or higher whenever possible.

3.

In [26]:
# Inspect the first generated QA pair; `res` is the parsed response dict
# produced by the generation cell.
res['qa_pairs'][0]



{'question': 'What is the parent taxon of Camelidae?',
 'answer': 'Even-toed ungulate',
 'supporting_path': [['http://yago-knowledge.org/resource/Camelidae',
   'http://schema.org/parentTaxon',
   'http://yago-knowledge.org/resource/Even-toed_ungulate']]}

In [21]:
# Inspect the first generated QA pair from the parsed response dict `res`.
# NOTE(review): this cell repeats the same inspection as a neighbouring cell — consider deleting it.
res['qa_pairs'][0]



{'question': 'What is the parent taxon of Camelidae?',
 'answer': 'Even-toed ungulate',
 'supporting_path': [['http://yago-knowledge.org/resource/Camelidae',
   'http://schema.org/parentTaxon',
   'http://yago-knowledge.org/resource/Even-toed_ungulate']]}

In [22]:
# Inspect the first generated QA pair from the parsed response dict `res`.
# NOTE(review): this cell repeats the same inspection as a neighbouring cell — consider deleting it.
res['qa_pairs'][0]



{'question': 'What is the parent taxon of Camelidae?',
 'answer': 'Even-toed ungulate',
 'supporting_path': [['http://yago-knowledge.org/resource/Camelidae',
   'http://schema.org/parentTaxon',
   'http://yago-knowledge.org/resource/Even-toed_ungulate']]}

In [23]:
# Display the rows accumulated in `results` by the generation/evaluation cell.
results

[{'id': 0,
  'question': 'What is the parent taxon of Camelidae that is also a parent taxon of Suina?',
  'answer': 'Even-toed ungulate',
  'answer_readable': 'Even-toed ungulate',
  'answer_uri': 'Even-toed ungulate',
  'supporting_facts': '[["http://yago-knowledge.org/resource/Camelidae", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Even-toed_ungulate"], ["http://yago-knowledge.org/resource/Even-toed_ungulate", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Suina"]]',
  'supporting_facts_uri': '',
  'logical_structure_flag_llm_judge1': True,
  'logical_structure_reasoning_llm_judge1': 'The question is grammatically correct and follows proper syntax, making it clear and understandable.',
  'redundancy_flag_llm_judge1': False,
  'redundancy_reasoning_llm_judge1': 'The question does not contain its own answer and does not provide overly obvious phrasing that would lead to an easy guess of the answer.',
  'logical_structure_flag_llm_judge2': 

In [43]:
# Show only the question text of the first generated QA pair in `res`.
res['qa_pairs'][0]['question']

'Which taxon is the parent of the alpaca?'

In [41]:
# Show the first record of the test split — the record whose `subgraph` feeds the generation prompt.
test_data[0]

{'id': 0,
 'question': 'What family does the alpaca belong to, considering its evolutionary lineage through its parent taxa?',
 'answer': 'Camelidae',
 'answer_readable': 'Camelidae',
 'answer_uri': 'http://yago-knowledge.org/resource/Camelidae',
 'supporting_facts': '[{"subject": "Alpaca", "predicate": "parentTaxon", "object": "Vicugna"}, {"subject": "Vicugna", "predicate": "parentTaxon", "object": "Camelidae"}]',
 'supporting_facts_uri': '[["http://yago-knowledge.org/resource/Alpaca", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Vicugna"], ["http://yago-knowledge.org/resource/Vicugna", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Camelidae"]]',
 'subgraph': '[["http://yago-knowledge.org/resource/Camelidae", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Even-toed_ungulate"], ["http://yago-knowledge.org/resource/Even-toed_ungulate", "http://schema.org/parentTaxon", "http://yago-knowledge.org/resource/Ungulate"], ["h