In [1]:
from dataclasses import dataclass
import os
from dotenv import load_dotenv
import dspy


# Load variables from a local .env file (if one exists) into the process environment
# before the configuration below reads APIKEY from os.environ.
load_dotenv()

@dataclass
class Config:
    # model configuration
    # base_model: str = "ollama_chat/qwen2.5:32b-instruct-q8_0"
    #base_model: str = "ollama_chat/gemma2:9b-instruct-q8_0"
    # base_model: str = "ollama_chat/llama3.2:1b-instruct-q8_0"
    # base_model: str = "ollama_chat/exaone3.5:2.4b-instruct-q8_0"
    # base_model: str = "ollama_chat/granite3.1-dense:2b-instruct-q8_0"
    # base_model: str = "ollama_chat/granite3.1-moe:3b-instruct-q8_0"
    # base_model:str = "ollama_chat/llama3.2:3b-instruct-q8_0"
    #base_model:str = "ollama_chat/gemma2:27b-instruct-q8_0"
    #base_model: str = "ollama_chat/phi4:14b-q8_0"
    #base_model: str = "ollama_chat/command-r:35b-08-2024-q8_0"
    #base_model: str = "ollama_chat/mistral-small:24b-instruct-2501-q8_0"
    #base_model: str = "ollama_chat/phi4-mini:3.8b-q8_0"
    temperature: float = 0.7
    #base_model: str = "openrouter/deepseek/deepseek-chat"
    #base_model: str = "openrouter/deepseek/deepseek-r1"  # good, slow
    #base_model: str = "openrouter/google/gemini-2.0-flash-001" # good, fast
    #base_model: str = "openrouter/deepseek/deepseek-r1-distill-qwen-32b"
    #base_model: str = "openrouter/perplexity/sonar-reasoning"
    #base_model: str = "openrouter/deepseek/deepseek-r1-distill-llama-70b"
    #base_model: str = "openrouter/openai/gpt-4o-mini"
    base_model: str = "openrouter/meta-llama/llama-3.3-70b-instruct"

    # dataset
    dataset: str = "scifact"
    delta_threshold = 0.1

    # APIKEY (if using api for teacher)
    api_key: str | None = None


# APIKEY is only needed for hosted models, so it is read without failing when absent;
# a local Ollama model therefore still works on a machine with no key configured.
config = Config(
    api_key=os.environ.get("APIKEY")
)

if config.base_model.startswith("ollama"):
    # small, locally hosted base model
    lm = dspy.LM(config.base_model, api_base='http://localhost:11434', api_key='', temperature=config.temperature, cache=False)
else:
    # Hosted model: fail early with an actionable message rather than at the first request.
    if config.api_key is None:
        raise RuntimeError(
            f"The APIKEY environment variable must be set to use the hosted model {config.base_model!r}"
        )
    lm = dspy.LM(config.base_model, api_key=config.api_key, temperature=config.temperature, cache=False)
# Register the model as the default LM for all dspy modules created afterwards.
dspy.configure(lm=lm)

In [2]:
# Prompt text assigned to ``MindReader.__doc__`` in the signature class defined later
# in this notebook. It describes the expected inputs and the three-part structure of
# the first-person narrative the model should produce.
# NOTE(review): the text contains two typos ("thinking***" closes the bold marker with
# three asterisks, and "The user is search a BM25 index"). They are left untouched here
# because editing the prompt changes what is sent to the model — fix deliberately.
MIND_READER = """
# Query Expansion Narrative Generation

## Task Overview
You are tasked with creating a first-person narrative that recreates the thought process behind expanding a search query.
This narrative should explain how you would move from the original query to the provided augmented query using the specific technique indicated in the instruction.

## Input Format
You will receive three components:

1. **Expansion Technique**: The original instruction of the technique to use (e.g., "expand", "graph", "headline", etc.)

2. **Structured Data in Markdown Format**:
   ```markdown
   ## Query
   [original query text]
   
   ## Existing Reasoning
   [brief summary of reasoning]
   
   ## Task Outcomes
   ### Query Goals
   - [goal 1]
   - [goal 2]
   
   ### [Other Fields Based on Technique]
   - [item 1]
   - [item 2]
   
   ## Augmented Query
   [final expanded query]
   ```

3. **Technique Description**: A separate description of the specific technique to apply (provided externally)

## Format

1. **Initiate the thinking***: Start each thought process as brainstorming about the intent for what the user is looking for in the query. For example, something like:
"The user is search a BM25 index for '<original query>'. I need to uncover optimal search terms to use with that algorithm by
determining the intent of their search. The user seems to be looking for..."

2. **Playing the Mind Game**: First, state and very briefly explain the mind game you are playing (expansion technique). Then,
walk through steps in the mind game as though you are uncovering search terms as you think more deeply about the query.

3. **Augmented query terms**: After a detailed first person role play of the mind game technique, conclude with something like:
"Therefore, the optimal BM25 search terms are <search terms from the augmented query>"

These augmented query terms should include all important words from the augmented query, but written as list of terms without
unnecessary things like punctuation or stopwords, OR/AND operators etc. Remember, this is a BM25 index and those terms have a
high document frequency or get removed in the tokenization process anyway. However, be sure to include ALL **important** terms
from the augmented query and nothing more.

Remember, ensure that your thought process exemplifies the detailed steps of the expansion technique, as though you were
using each step in the technique to uncover hidden and more rich search terms.
"""

In [3]:
import dspy


# dspy signature for turning a finished query-expansion record into a first-person
# narrative. It declares two string inputs and one string output.
class MindReader(dspy.Signature):
    # The prompt is assigned to __doc__ explicitly (instead of a docstring literal) so the
    # text lives in the MIND_READER constant; presumably dspy uses it as the task instructions.
    __doc__ = MIND_READER
    # Markdown rendering of the record; process_benchmark passes the output of
    # format_json_to_markdown here.
    structured_data: str = dspy.InputField(desc="Query augmentation process data")
    # Description of the expansion technique; process_benchmark passes the entry looked
    # up in the `instructions` mapping here.
    expansion_technique: str = dspy.InputField(desc="Mind game used as technique for query expansion")
    # Generated narrative; process_benchmark stores it under the "thought_process" key.
    thought_process: str = dspy.OutputField(desc="First person role-playing")


In [4]:
import json

# Mapping looked up by each record's "instruction" value in process_benchmark; the
# matching entry is passed to the model as the `expansion_technique` input.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform's
# locale-dependent default encoding.
with open("generator_docstrings.json", "r", encoding="utf-8") as fh:
    instructions = json.load(fh)

In [5]:
#!/usr/bin/env python3
import json
import sys

def format_json_to_markdown(json_data):
    """Render one query-expansion record as markdown.

    The record's query, reasoning and augmented query get their own sections;
    metric and bookkeeping keys are dropped; every other key becomes a
    ``###`` subsection under "Task Outcomes" (lists are rendered as bullets).
    The input mapping itself is not modified.

    Returns:
        A ``(markdown_text, instruction)`` tuple, where ``instruction`` is the
        record's "instruction" value or a placeholder string when it is missing.
    """
    # Work on a shallow copy so the caller's record keeps all of its keys.
    remaining = json_data.copy()

    # Pull out the fields that get dedicated sections (or are returned separately).
    query_text = remaining.pop('query', 'No query provided')
    reasoning_text = remaining.pop('reasoning', 'No reasoning provided')
    final_query = remaining.pop('augmented_query', 'No augmented query provided')
    technique = remaining.pop('instruction', 'No instruction provided')

    # Metric fields and dataset/generator bookkeeping never appear in the markdown.
    excluded = (
        'ndcg', 'map', 'recall', 'precision', 'delta',
        'dataset', 'generator_model', 'temperature', 'query_id',
    )
    for name in excluded:
        remaining.pop(name, None)

    lines = [
        "## Query",
        f"{query_text}",
        "",
        "## Existing Reasoning",
        f"{reasoning_text}",
        "",
        "## Task Outcomes",
    ]

    # Whatever is left is technique-specific output: one subsection per key.
    for name, payload in remaining.items():
        heading = name.replace('_', ' ').title()
        lines.append(f"### {heading}")
        if isinstance(payload, list):
            lines.extend(f"- {entry}" for entry in payload)
        else:
            lines.append(f"{payload}")
        lines.append("")

    lines += ["## Augmented Query", f"{final_query}"]

    return "\n".join(lines), technique


In [6]:
import json
import os
import glob
from pathlib import Path


def process_benchmark(input_dir, output_dir, reprocess=True):
    """Augment every JSON record in ``input_dir`` with a generated thought process.

    Each ``*.json`` file is rendered to markdown with ``format_json_to_markdown``
    and passed to a ``dspy.Predict(MindReader)`` module together with the technique
    description looked up in the module-level ``instructions`` mapping. The record
    is then written to ``output_dir`` under the same file name with two extra keys:
    ``thought_process`` and ``though_generator`` (the model name from ``config``).

    Files whose instruction is missing from ``instructions``, and files for which
    generation or saving fails, are reported and counted as skipped.

    Args:
        input_dir: Directory containing the source ``*.json`` records.
        output_dir: Directory for the augmented records; created if missing.
        reprocess: When False, files already present in ``output_dir`` are
            skipped; when True (default) they are regenerated and overwritten.

    Returns:
        None. Progress and a final summary are printed.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort so every run visits the
    # files in the same order.
    json_files = sorted(glob.glob(os.path.join(input_dir, "*.json")))

    # Track statistics
    total_files = len(json_files)
    processed_files = 0
    skipped_files = 0

    # Created on first use inside the per-file try block (so a construction error is
    # still reported per file) and then reused instead of being rebuilt for every file.
    predictor = None

    for file_path in json_files:
        file_name = os.path.basename(file_path)
        output_path = os.path.join(output_dir, file_name)

        # Check if file already exists and skip if reprocess is False
        if os.path.exists(output_path) and not reprocess:
            print(f"Skipping {file_path} (already exists in output directory)")
            skipped_files += 1
            continue

        print(f"Processing {file_path}...")

        # Decode explicitly as UTF-8 rather than with the locale-dependent default.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Convert to markdown and get the instruction
        markdown_content, instruction = format_json_to_markdown(json_data)

        # Without a technique description there is nothing to prompt with.
        if instruction not in instructions:
            print(f"Warning: Instruction '{instruction}' not found in instructions dictionary. Skipping {file_path}")
            skipped_files += 1
            continue

        # Generate thought process using dspy
        try:
            if predictor is None:
                predictor = dspy.Predict(MindReader)
            pred = predictor(
                structured_data=markdown_content,
                expansion_technique=instructions[instruction]
            )

            # Add thought process to the original JSON data
            print(pred)
            json_data["thought_process"] = pred.thought_process
            # NOTE(review): the key is spelled "though_generator" (sic); it is kept
            # unchanged because readers of these files may rely on it.
            json_data["though_generator"] = config.base_model

            # Serialize before opening the output file so that a serialization error
            # cannot leave a truncated file behind, which a later reprocess=False
            # run would mistake for a finished result.
            serialized = json.dumps(json_data, indent=2)
            with open(output_path, 'w', encoding='utf-8') as outfile:
                outfile.write(serialized)

            print(f"Saved augmented JSON to {output_path}")
            processed_files += 1

        except Exception as e:
            # Best effort: one failing file must not abort the whole batch.
            print(f"Error processing {file_path}: {str(e)}")
            skipped_files += 1

    print("Processing complete:")
    print(f"- Total files: {total_files}")
    print(f"- Processed files: {processed_files}")
    print(f"- Skipped files: {skipped_files}")


In [None]:
# Take the dataset name from Config.dataset so the benchmark is selected in one place
# (the config cell) instead of being hard-coded a second time here.
beir = config.dataset
INPUT_FOLDER = f"data/{beir}"
OUTPUT_FOLDER = f"data/thinking/{beir}"

# Uses the default reprocess=True, so existing outputs are regenerated and overwritten.
process_benchmark(INPUT_FOLDER, OUTPUT_FOLDER)

Processing data/scifact/1213.json...
Prediction(
    thought_process='The user is searching a BM25 index for \'The deregulated and prolonged activation of monocytes has deleterious effects in inflammatory diseases\'. I need to uncover optimal search terms to use with that algorithm by determining the intent of their search. The user seems to be looking for information on how prolonged monocyte activation affects inflammatory diseases and potentially seeking treatment strategies or consequences of this activation.\n\nTo expand this query, I\'ll be playing a mind game called "Query Expansion via User Intent and Journey Mapping". This technique involves analyzing how different users, with varying levels of expertise, would seek information about prolonged monocyte activation in inflammatory diseases throughout their learning journey. The goal is to extract intent-revealing terms and vocabulary that reflect how real users describe their needs at different stages.\n\nFirst, I\'ll create use

In [None]:

# with open("data/scifact/1014.json", 'r') as file:
#     json_data = json.load(file)

# # Convert to markdown and print
# markdown_content, instruction = format_json_to_markdown(json_data)


# gen = dspy.Predict(MindReader)
# pred = gen(
#     structured_data = markdown_content,
#     expansion_technique = instructions[instruction]
# )

# print(pred.thought_process)