### 📦 Step 1: Install required packages (run once)

In [50]:
%pip install pandas openai azure-search-documents

Note: you may need to restart the kernel to use updated packages.


### 🔐 Step 2: Credentials

In [2]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if one exists) into the process environment.
load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"

AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_EXAM_INDEX = os.getenv("AZURE_EXAM_INDEX")

# Fail fast: os.getenv returns None for unset variables, which otherwise only
# surfaces later as an opaque error inside the Azure / OpenAI clients.
_required_settings = {
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
    "AZURE_OPENAI_KEY": AZURE_OPENAI_KEY,
    "AZURE_SEARCH_ENDPOINT": AZURE_SEARCH_ENDPOINT,
    "AZURE_SEARCH_KEY": AZURE_SEARCH_KEY,
    "AZURE_EXAM_INDEX": AZURE_EXAM_INDEX,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

### 📦 Step 3: Upgrade azure-search-documents package

In [52]:
%pip install --upgrade azure-search-documents

Note: you may need to restart the kernel to use updated packages.


### 🔄 Step 4a: Updated Index Schema (with Quality Scoring fields)
This schema extends the original with fields needed for the quality scoring feedback loop.

In [3]:
# Updated index schema with quality scoring + review/state management fields
# WARNING: This will DELETE and RECREATE the existing index

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)
from azure.core.credentials import AzureKeyCredential

# Client for index-level operations (list / delete / create), authenticated
# with the search service key.
search_key_credential = AzureKeyCredential(AZURE_SEARCH_KEY)
index_client = SearchIndexClient(
    credential=search_key_credential,
    endpoint=AZURE_SEARCH_ENDPOINT,
)

# Dimensionality declared for the vector field below.
embedding_dimensions = 1536

# Short aliases for the field data types used in the schema.
_STRING = SearchFieldDataType.String
_DOUBLE = SearchFieldDataType.Double
_INT32 = SearchFieldDataType.Int32
_BOOLEAN = SearchFieldDataType.Boolean
_TIMESTAMP = SearchFieldDataType.DateTimeOffset

# === CORE FIELDS (existing) ===
core_fields = [
    SearchField(name="id", type=_STRING, key=True, searchable=False),
    SearchField(name="domain", type=_STRING, searchable=True, filterable=True),
    SearchField(name="topic", type=_STRING, searchable=True, filterable=True),
    SearchField(name="evidence", type=_STRING, searchable=True),
    SearchField(name="question_text", type=_STRING, searchable=True),
    SearchField(name="options_raw", type=_STRING, searchable=True),
    SearchField(name="correct_answer", type=_STRING, filterable=True),
    SearchField(name="rationale", type=_STRING, searchable=True),
    SearchField(name="full_text", type=_STRING, searchable=True),
]

# === Structured item components (all plain searchable strings) ===
item_component_fields = [
    SearchField(name=component_name, type=_STRING, searchable=True)
    for component_name in ("stimulus", "stem", "option_a", "option_b", "option_c", "option_d")
]

# === Quality scoring fields ===
quality_fields = [
    SearchField(name="quality_score", type=_DOUBLE, filterable=True, sortable=True),
    SearchField(name="quality_tier", type=_STRING, filterable=True, facetable=True),
    SearchField(name="quality_summary", type=_STRING, searchable=True),
    # JSON blob
    SearchField(name="quality_scores_json", type=_STRING, searchable=False),
    SearchField(
        name="improvement_suggestions",
        type=SearchFieldDataType.Collection(_STRING),
        searchable=True,
    ),
]

# === REVIEW & STATE MANAGEMENT (Human-in-the-Loop) ===
review_fields = [
    # Values: "gold_standard" | "pending_review" | "approved" | "approved_with_edits" | "rejected"
    SearchField(name="review_status", type=_STRING, filterable=True, facetable=True),
    SearchField(name="reviewed_at", type=_TIMESTAMP, filterable=True, sortable=True),
    SearchField(name="reviewed_by", type=_STRING, filterable=True),
    # "upvote" | "downvote"
    SearchField(name="review_decision", type=_STRING, filterable=True),
    # Why rejected/edited
    SearchField(name="review_explanation", type=_STRING, searchable=True),
]

# === EDIT TRACKING ===
edit_tracking_fields = [
    SearchField(name="was_edited", type=_BOOLEAN, filterable=True),
    # JSON snapshot before edits
    SearchField(name="original_version_json", type=_STRING, searchable=False),
    SearchField(name="edit_summary", type=_STRING, searchable=True),
]

# === GENERATION METADATA (for Agent Framework + Analytics) ===
generation_fields = [
    # Group items from same batch
    SearchField(name="generation_batch_id", type=_STRING, filterable=True),
    # Which retry generated this
    SearchField(name="generation_attempt", type=_INT32, filterable=True),
    # Max similarity when generated
    SearchField(name="similarity_at_generation", type=_DOUBLE, filterable=True),
    # Full generation context
    SearchField(name="generation_metadata_json", type=_STRING, searchable=False),
]

# === Metadata/tracking fields ===
tracking_fields = [
    # "original" | "generated_v2"
    SearchField(name="source", type=_STRING, filterable=True, facetable=True),
    SearchField(name="is_generated", type=_BOOLEAN, filterable=True),
    SearchField(name="created_at", type=_TIMESTAMP, filterable=True, sortable=True),
    SearchField(name="scored_at", type=_TIMESTAMP, filterable=True, sortable=True),
]

# === VECTOR FIELD (existing) ===
vector_field = SearchField(
    name="content_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=embedding_dimensions,
    vector_search_profile_name="myHnswProfile",
)

# HNSW algorithm configuration plus the profile the vector field refers to by name.
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw", kind="hnsw")
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)

index_schema = SearchIndex(
    name=AZURE_EXAM_INDEX,  # Using existing index name
    fields=[
        *core_fields,
        *item_component_fields,
        *quality_fields,
        *review_fields,
        *edit_tracking_fields,
        *generation_fields,
        *tracking_fields,
        vector_field,
    ],
    vector_search=VectorSearch(
        algorithms=[hnsw_algorithm],
        profiles=[hnsw_profile],
    ),
)

# Drop any index already registered under this name, then build it again
# from the schema defined above. Existing documents in that index are lost.
existing_index_names = set(index_client.list_index_names())
index_already_exists = AZURE_EXAM_INDEX in existing_index_names

if index_already_exists:
    index_client.delete_index(AZURE_EXAM_INDEX)
    print(f"üóëÔ∏è Deleted existing index: {AZURE_EXAM_INDEX}")

index_client.create_index(index_schema)
print(f"‚úÖ Created index with quality + review/state management fields: {AZURE_EXAM_INDEX}")

üóëÔ∏è Deleted existing index: jdn-exam-sept2025-items
‚úÖ Created index with quality + review/state management fields: jdn-exam-sept2025-items
‚úÖ Created index with quality + review/state management fields: jdn-exam-sept2025-items


### 🏆 Step 4b: Migrate Existing High-Quality Exams
Your existing exams from 09/2025 are already high-quality (human-authored, gold standard). 
We'll mark them as **gold tier** with a perfect score to establish the baseline.

In [4]:
# Updated document converter that includes quality fields for EXISTING high-quality exams
# These are your gold-standard items from 09/2025

import uuid
import json
import pandas as pd
from datetime import datetime, timezone

# Read the item-bank export into a DataFrame, decoding the file as Latin-1.
csv_path = "./dataset/jdn-items-and-metadata_09-17-25_2.csv"
df = pd.read_csv(csv_path, encoding="latin1")
loaded_row_count = len(df)
print(f"üìÑ Loaded {loaded_row_count} rows from CSV")

def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing cells (None / NaN / NA) to ""."""
    if value is None:
        return ""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def row_to_doc_v2(row):
    """
    Convert one CSV row into an index document with quality metadata.

    Existing items are marked as GOLD tier since they're human-authored gold standards.

    Args:
        row: Mapping-like row (e.g. a pandas Series) supporting ``row.get(name)``
            and ``row["Question"]``. Columns read: "Evidence Statement #1..#3",
            "Option A".."Option E", "Question", "Rationale", "Domain", "Topic",
            "Answer".

    Returns:
        dict keyed by the index field names (without ``content_vector``).
        Missing/blank cells become "" in text fields so no NaN reaches the
        upload payload.

    Raises:
        KeyError: if the row has no "Question" column.
    """
    # Keep only evidence statements that are actually present, so absent or
    # blank columns do not contribute empty lines.
    evidence_parts = [
        _text_or_empty(row.get(f"Evidence Statement #{number}")) for number in (1, 2, 3)
    ]
    evidence = "\n".join(part for part in evidence_parts if part)

    # Build options dict and raw string (option E is kept in options_raw only;
    # the document has dedicated fields for A-D).
    options = {}
    for letter in ("A", "B", "C", "D", "E"):
        option_text = _text_or_empty(row.get(f"Option {letter}"))
        if option_text:
            options[letter] = option_text
    options_raw = "\n".join(f"{letter}. {text}" for letter, text in options.items())

    question = _text_or_empty(row["Question"])
    rationale = _text_or_empty(row.get("Rationale"))

    # Full text for embedding
    full_text = (
        f"Evidence:\n{evidence}\n\nQuestion:\n{question}"
        f"\n\nOptions:\n{options_raw}\n\nRationale:\n{rationale}"
    )

    # Quality scores for EXISTING high-quality items (gold standard baseline).
    # Every scoring dimension gets the same top score and justification.
    scoring_dimensions = (
        "stimulus", "stem", "key", "distractors",
        "alignment", "language", "style", "fairness",
    )
    gold_quality_scores = {
        dimension: {"score": 5, "justification": "Human-authored gold standard", "issues": []}
        for dimension in scoring_dimensions
    }

    # Single timestamp so created_at and scored_at agree for a given document.
    timestamp = datetime.now(timezone.utc).isoformat()

    return {
        # Core fields
        "id": str(uuid.uuid4()),
        "domain": _text_or_empty(row.get("Domain")),
        "topic": _text_or_empty(row.get("Topic")),
        "evidence": evidence,
        "question_text": question,
        "options_raw": options_raw,
        "correct_answer": _text_or_empty(row.get("Answer")),
        "rationale": rationale,
        "full_text": full_text,

        # Structured components (if available, otherwise derive from question)
        "stimulus": "",  # Original CSV may not have separate stimulus
        "stem": question,  # Use question as stem
        "option_a": options.get("A", ""),
        "option_b": options.get("B", ""),
        "option_c": options.get("C", ""),
        "option_d": options.get("D", ""),

        # Quality fields - GOLD TIER for existing high-quality items
        "quality_score": 5.0,  # Perfect score for gold standard
        "quality_tier": "gold",  # Highest tier
        "quality_summary": "Human-authored gold standard item from JD-Next item bank (09/2025)",
        "quality_scores_json": json.dumps(gold_quality_scores),
        "improvement_suggestions": [],  # No improvements needed for gold standard

        # Review & State Management - GOLD STANDARD (no review needed)
        "review_status": "gold_standard",  # Skips human review workflow
        "reviewed_at": None,
        "reviewed_by": None,
        "review_decision": None,
        "review_explanation": None,

        # Edit Tracking
        "was_edited": False,
        "original_version_json": None,
        "edit_summary": None,

        # Generation Metadata (N/A for original items)
        "generation_batch_id": None,
        "generation_attempt": None,
        "similarity_at_generation": None,
        "generation_metadata_json": None,

        # Metadata
        "source": "original",  # Distinguishes from generated items
        "is_generated": False,  # Human-authored
        "created_at": timestamp,
        "scored_at": timestamp,
    }

# Turn every DataFrame row into an index document, then echo a few fields
# of the first document as a spot check.
docs_v2 = [row_to_doc_v2(record) for _, record in df.iterrows()]
print(f"‚úÖ Converted {len(docs_v2)} documents with quality metadata")

print("\nSample document quality fields:")
sample_doc = docs_v2[0]
for field_name in ("quality_score", "quality_tier", "review_status", "source", "is_generated"):
    print(f"  - {field_name}: {sample_doc[field_name]}")

üìÑ Loaded 24 rows from CSV
‚úÖ Converted 24 documents with quality metadata

Sample document quality fields:
  - quality_score: 5.0
  - quality_tier: gold
  - review_status: gold_standard
  - source: original
  - is_generated: False


### 🧬 Step 4c: Generate embeddings and upload to new index

In [5]:
# Generate embeddings for the v2 documents
from openai import AzureOpenAI

# Azure OpenAI client used for embedding requests; the API version is pinned here.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version="2024-02-01",
)

def embed(text):
    """Request an embedding for ``text`` and return the first result's vector.

    Uses the module-level ``client`` and ``AZURE_OPENAI_EMBEDDING_MODEL``.
    """
    embedding_response = client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_EMBEDDING_MODEL,
    )
    first_result = embedding_response.data[0]
    return first_result.embedding

# Attach an embedding of each document's full_text, reporting progress
# after every 50 documents.
print("Generating embeddings...")
total_docs = len(docs_v2)
for position, doc in enumerate(docs_v2, start=1):
    doc["content_vector"] = embed(doc["full_text"])
    if position % 50 == 0:
        print(f"  Embedded {position}/{total_docs} documents...")

print(f"‚úÖ Embedded {len(docs_v2)} documents")

Generating embeddings...
‚úÖ Embedded 24 documents
‚úÖ Embedded 24 documents


In [6]:
# Upload to the index with quality fields
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# Document-level client bound to the exam index.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=AZURE_EXAM_INDEX,
    credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)

# Upload in fixed-size batches and keep track of per-document outcomes so the
# final message reflects what was actually indexed.
batch_size = 100
total_succeeded = 0
failed_uploads = []  # (document key, error message) for every rejected document
for i in range(0, len(docs_v2), batch_size):
    batch = docs_v2[i:i+batch_size]
    result = search_client.upload_documents(documents=batch)
    succeeded = sum(1 for r in result if r.succeeded)
    total_succeeded += succeeded
    failed_uploads.extend((r.key, r.error_message) for r in result if not r.succeeded)
    print(f"‚úÖ Uploaded batch {i//batch_size + 1}: {succeeded}/{len(batch)} documents")

if failed_uploads:
    # Do not claim success when the service rejected some documents.
    print(f"\nMigration incomplete: {total_succeeded}/{len(docs_v2)} documents uploaded to '{AZURE_EXAM_INDEX}'")
    for failed_key, failure_message in failed_uploads:
        print(f"  - failed id={failed_key}: {failure_message}")
else:
    print(f"\nüéâ Migration complete! {len(docs_v2)} gold-standard items now in '{AZURE_EXAM_INDEX}'")

‚úÖ Uploaded batch 1: 24/24 documents

üéâ Migration complete! 24 gold-standard items now in 'jdn-exam-sept2025-items'


### ✅ Step 4d: Verify the migration
Check that quality fields are properly set and queryable.

In [7]:
# Verify the migration - check quality distribution
# Fetch up to 1000 items for the distribution checks, and ask the service for
# the true match count so the reported total is not capped by `top`.
results = search_client.search(
    search_text="*",
    select=["id", "topic", "quality_score", "quality_tier", "source", "is_generated", "review_status"],
    top=1000,
    include_total_count=True
)

items = list(results)
total_in_index = results.get_count()
if total_in_index is None:
    # Fall back to the number of retrieved items if no count was returned.
    total_in_index = len(items)
print(f"Total items in index: {total_in_index}")
if total_in_index > len(items):
    print(f"  (only the first {len(items)} items were retrieved; the distributions below are partial)")

# Count by quality tier, source, and review status.
# A field that is null comes back as a present key with value None, so the
# fallback label is applied explicitly; otherwise None would become a key and
# sorting mixed None/str keys would raise TypeError.
from collections import Counter


def _label_or(value, fallback):
    """Return ``value`` unless it is None, in which case return ``fallback``."""
    return fallback if value is None else value


tier_counts = Counter(_label_or(item.get("quality_tier"), "unscored") for item in items)
source_counts = Counter(_label_or(item.get("source"), "unknown") for item in items)
review_status_counts = Counter(_label_or(item.get("review_status"), "unknown") for item in items)

print(f"\nüìä Quality Distribution:")
for tier, count in sorted(tier_counts.items()):
    print(f"  {tier}: {count} items")

print(f"\nüì¶ Source Distribution:")
for source, count in sorted(source_counts.items()):
    print(f"  {source}: {count} items")

print(f"\nüîç Review Status Distribution:")
for status, count in sorted(review_status_counts.items()):
    print(f"  {status}: {count} items")

# Test filtering by quality.
# Only the three printed fields are selected, so the service does not return
# each full document (including its embedding vector) for this spot check.
gold_items = search_client.search(
    search_text="*",
    filter="quality_tier eq 'gold'",
    select=["topic", "quality_score", "review_status"],
    top=5
)
print(f"\nüèÜ Sample gold-tier items:")
for item in gold_items:
    print(f"  - {item['topic']}: score={item.get('quality_score', 'N/A')}, review_status={item.get('review_status', 'N/A')}")

Total items in index: 24

üìä Quality Distribution:
  gold: 24 items

üì¶ Source Distribution:
  original: 24 items

üîç Review Status Distribution:
  gold_standard: 24 items

üèÜ Sample gold-tier items:
  - TP.2: Legal Test for Consideration: score=5.0, review_status=gold_standard
  - TP.2: Legal Test for Consideration: score=5.0, review_status=gold_standard
  - TP.2: Legal Test for Consideration: score=5.0, review_status=gold_standard
  - TP.2: Legal Test for Consideration: score=5.0, review_status=gold_standard
  - TP.2: Legal Test for Consideration: score=5.0, review_status=gold_standard
