# JD‚ÄëNext Rubric Ingestion Pipeline

This notebook ingests rubric rules, item‚Äëwriting principles, examples, and construct definitions into the existing Azure AI Search index.

It uses the same schema as `exam_ingestion_pipeline.ipynb`, but only populates:

- `id`
- `domain = "rubric"`
- `topic`
- `full_text`
- `content_vector`

All other fields remain `None`.

This keeps rubric ingestion clean, modular, and fully compatible with the unified index.

### üîê Step 2: Credentials

In [22]:
import os
from dotenv import load_dotenv

load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"

AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_RUBRIC_INDEX = os.getenv("AZURE_RUBRIC_INDEX")

### üß± Step 4: Create the Azure AI Search index (run once)

In [23]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)
from azure.core.credentials import AzureKeyCredential

index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)

embedding_dimensions = 1536

index_schema = SearchIndex(
    name=AZURE_RUBRIC_INDEX,
    fields=[
        SearchField(name="id", type=SearchFieldDataType.String, key=True, searchable=False),
        SearchField(name="category", type=SearchFieldDataType.String, searchable=True, filterable=True),
        SearchField(name="subsection", type=SearchFieldDataType.String, searchable=True, filterable=True),
        SearchField(name="type", type=SearchFieldDataType.String, searchable=True, filterable=True),
        SearchField(name="content", type=SearchFieldDataType.String, searchable=True), 
        SearchField(name="order", type=SearchFieldDataType.Int32, filterable=True, sortable=True), 
        SearchField(name="full_text", type=SearchFieldDataType.String, searchable=True),

        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=embedding_dimensions,
            vector_search_profile_name="rubricHnswProfile"
        )
    ],
    vector_search=VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="rubricHnsw",
                kind="hnsw"
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="rubricHnswProfile",
                algorithm_configuration_name="rubricHnsw"
            )
        ]
    )
)

if AZURE_RUBRIC_INDEX not in index_client.list_index_names():
    index_client.create_index(index_schema)
    print(f"‚úÖ Created index: {AZURE_RUBRIC_INDEX}")
else:
    print(f"‚ÑπÔ∏è Index '{AZURE_RUBRIC_INDEX}' already exists.")

‚úÖ Created index: jdn-rubric-index


### üìÑ Load your CSV

In [24]:
import json

RUBRIC_JSONL_PATH = "./item-writing/JD-Next Item-Writing rubric.jsonl"

rubric_chunks = []

with open(RUBRIC_JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            rubric_chunks.append(json.loads(line))

print(f"Loaded {len(rubric_chunks)} rubric chunks.")
rubric_chunks[:2]

Loaded 166 rubric chunks.


[{'order': 1,
  'category': 'CONSTRUCT',
  'subsection': 'Construct',
  'type': 'HEADER',
  'content': 'Construct',
  'section': 'Construct',
  'source': 'Construct.txt'},
 {'order': 2,
  'category': 'CONSTRUCT',
  'subsection': 'Construct',
  'type': 'DEFINITION',
  'content': 'A construct is the Knowledge, Skills, and Abilities (KSAs) we intend to measure in an assessment.',
  'section': 'Construct',
  'source': 'Construct.txt'}]

## üß† Normalize chunks into a document

In [25]:
import uuid

def normalize_rubric_chunk(chunk):
    """
    Convert a JSONL rubric chunk into an Azure Search document.
    """
    doc = {
        "id": str(uuid.uuid4()),  # unique ID
        "category": chunk.get("category", ""),
        "subsection": chunk.get("subsection", ""),
        "type": chunk.get("type", ""),
        "content": chunk.get("content", ""),
        "order": chunk.get("order", 0),
        "full_text": chunk.get("content", ""),  # can expand later
        "content_vector": None  # filled in next cell
    }
    return doc

rubric_docs = [normalize_rubric_chunk(c) for c in rubric_chunks]

print(f"Normalized {len(rubric_docs)} documents.")
rubric_docs[0]


Normalized 166 documents.


{'id': 'ee7471f5-eaa6-4ce7-8b7a-cb45c71dfc41',
 'category': 'CONSTRUCT',
 'subsection': 'Construct',
 'type': 'HEADER',
 'content': 'Construct',
 'order': 1,
 'full_text': 'Construct',
 'content_vector': None}

### üß¨ Generate embeddings for each document

In [26]:
import uuid
from openai import AzureOpenAI

client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version="2024-02-01",
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

def embed(text):
    response = client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_MODEL,
        input=text
    )
    return response.data[0].embedding

for doc in rubric_docs:
    doc["content_vector"] = embed(doc["full_text"])

for i, doc in enumerate(rubric_docs[:3]):
    print(f"Doc {i} embedding (first 4 dims): {doc['content_vector'][:4]}")
print("... (embeddings for remaining documents not shown)")

print(f"Embedded {len(rubric_docs)} documents.")

Doc 0 embedding (first 4 dims): [0.01560912560671568, 0.007811433169990778, -0.0362197682261467, 0.027563298121094704]
Doc 1 embedding (first 4 dims): [0.033380910754203796, 0.011718349531292915, 0.012452865950763226, 0.006616291124373674]
Doc 2 embedding (first 4 dims): [0.036775168031454086, 0.019434520974755287, 0.004527701530605555, 0.017834030091762543]
... (embeddings for remaining documents not shown)
Embedded 166 documents.


### üöÄ Upload documents in batches

In [27]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=AZURE_RUBRIC_INDEX,
    credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)

batch_size = 100
for i in range(0, len(rubric_docs), batch_size):
    batch = rubric_docs[i:i+batch_size]
    result = search_client.upload_documents(documents=batch)
    print(f"‚úÖ Uploaded batch {i//batch_size + 1}: {len(batch)} documents")

‚úÖ Uploaded batch 1: 100 documents
‚úÖ Uploaded batch 2: 66 documents
