In [0]:
%pip install --upgrade mlflow databricks-sdk dspy openai
# Restart the Python process so the packages upgraded above are the ones the
# following cells import.
dbutils.library.restartPython()

#Model Switching, not as easy as swapping prompts

Below is a quickstart example that adapts a prompt to a different model. The prompt is deliberately simple (**classify this query**), so you will likely see larger improvements for more complex use cases. We will go from GPT-5 to Gemma 3/GPT-OSS-20B.

Ensure that you have access to the Databricks Foundation Model APIs to run this successfully. 

In [0]:
import mlflow
import openai
from mlflow.genai.optimize import GepaPromptOptimizer
from mlflow.genai.scorers import Correctness
from databricks.sdk import WorkspaceClient

# Databricks workspace client; used below to obtain an OpenAI-compatible client.
w = WorkspaceClient()

# Change the catalog and schema to your catalog and schema
catalog = "main"
schema = "default"
prompt_registry_name = "new_prompt_registry"
# Fully qualified prompt name: <catalog>.<schema>.<prompt name>.
prompt_location = f"{catalog}.{schema}.{prompt_registry_name}"

# OpenAI-compatible client for the workspace's serving endpoints; every predict
# function in this notebook sends its chat completions through it.
openai_client = w.serving_endpoints.get_open_ai_client()

# Register initial prompt
# `{{query}}` is the template variable that predict functions fill in with
# prompt.format(query=...).
# NOTE(review): later cells hard-code prompt versions 1, 2 and 3, which
# presumably assumes this name does not already exist in the registry - confirm
# before re-running against an existing prompt.
prompt = mlflow.genai.register_prompt(
    name=prompt_location,
    template="classify this: {{query}}",
)


# Define your prediction function
def predict_fn(query: str) -> str:
    """Classify ``query`` with GPT-5 using version 1 of the registered prompt.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    baseline_prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/1")
    # The template is rendered through PromptVersion.format().
    user_message = {"role": "user", "content": baseline_prompt.format(query=query)}
    response = openai_client.chat.completions.create(
        model="databricks-gpt-5",
        messages=[user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content




#Test your Function

Observe how accurately the model can classify the input with a bare-bones prompt. While accurate, the output is not aligned to the task or use case we are targeting.

In [0]:
from IPython.display import Markdown

# Classify one sample sentence with the prediction function defined above.
output = predict_fn("The emergence of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments.")

# Last expression of the cell: render the model's reply as Markdown.
Markdown(output)

#Optimizing against Data

Now we will provide some data with expected responses and facts. This will help optimize our model to behave and output in a way that fits our use cases. 

In this case, we just want the model to output one word from a choice of five words. It should only output that word without any further explanation. 

In [0]:
# Training data with inputs and expected outputs
# The one "expected fact" attached to the fact-only record of every sentence:
# the answer has to be one of the five allowed section labels.
_ALLOWED_LABELS_FACT = "Classification label must be 'CONCLUSIONS', 'RESULTS', 'METHODS', 'OBJECTIVE', 'BACKGROUND'"

# (sentence, gold label) pairs, in the order they appear in the dataset.
_LABELED_SENTENCES = [
    (
        "The emergence of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments.",
        "BACKGROUND",
    ),
    (
        "This paper describes the design and evaluation of Positive Outlook , an online program aiming to enhance the self-management skills of gay men living with HIV .",
        "BACKGROUND",
    ),
    (
        "This study is designed as a randomised controlled trial in which men living with HIV in Australia will be assigned to either an intervention group or usual care control group .",
        "METHODS",
    ),
    (
        "The intervention group will participate in the online group program ` Positive Outlook ' .",
        "METHODS",
    ),
    (
        "The program is based on self-efficacy theory and uses a self-management approach to enhance skills , confidence and abilities to manage the psychosocial issues associated with HIV in daily life .",
        "METHODS",
    ),
    (
        "Participants will access the program for a minimum of 90 minutes per week over seven weeks .",
        "METHODS",
    ),
]

# Each sentence contributes two consecutive records.
dataset = []
for _sentence, _label in _LABELED_SENTENCES:
    # Record 1: no reference output, only the allowed-label constraint.
    dataset.append(
        {
            "inputs": {"query": _sentence},
            "expectations": {"expected_facts": [_ALLOWED_LABELS_FACT]},
        }
    )
    # Record 2: the exact label, both as reference output and as expectation.
    dataset.append(
        {
            "inputs": {"query": _sentence},
            "outputs": {"response": _label},
            "expectations": {"expected_response": _label},
        }
    )

# Optimize the prompt
# Optimize the prompt
# Inputs: the GPT-5 prediction function, the training records above, and the
# URI of the prompt registered in the setup cell. GEPA is the optimizer and
# the Correctness scorer grades each prediction.
result = mlflow.genai.optimize_prompts(
    predict_fn=predict_fn,
    train_data=dataset,
    prompt_uris=[prompt.uri],
    optimizer=GepaPromptOptimizer(reflection_model="databricks:/databricks-gpt-5-2"),
    scorers=[Correctness(model="databricks:/databricks-gpt-5")],
)

# Use the optimized prompt
# Only one prompt URI was passed in, so the first entry is the one inspected.
optimized_prompt = result.optimized_prompts[0]
print(f"Optimized template: {optimized_prompt.template}")

In [0]:
# Compare the evaluation score before and after optimization for the GPT-5 run.
print(f"Initial Score: {result.initial_eval_score}\n") 
print(f"Final Score: {result.final_eval_score}") 

#Let's review the Changes

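Let's test how the optimized prompt (version 2) works now. The cell below calls GPT-5; switch to one of the commented-out `model` lines to try Gemma 3 or GPT-OSS instead.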

In [0]:
import mlflow

# Turn on MLflow's OpenAI autologging for the client calls made in the cells below.
mlflow.openai.autolog()

def predict_fn(query: str) -> str:
    """Classify ``query`` with GPT-5 using version 2 of the registered prompt.

    The rendered prompt is sent as the only message, in the ``system`` role.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    prompt_v2 = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/2")
    # The template is rendered through PromptVersion.format().
    system_message = {"role": "system", "content": prompt_v2.format(query=query)}
    # To compare models, swap in "databricks-gemma-3-12b" or "databricks-gpt-oss-20b".
    response = openai_client.chat.completions.create(
        model="databricks-gpt-5",
        messages=[system_message],
    )
    return response.choices[0].message.content

In [0]:
# Classify a sentence that is not in the training data.
output = predict_fn(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")

In [0]:
# Right answer: RESULTS
output

#Let's try it with Gemma

How well does it do with this prompt? 

In [0]:
from IPython.display import Markdown
# Load version 2 of the prompt so its template can be inspected.
# NOTE: this rebinds the notebook-level `prompt`, which until now held the
# object returned by register_prompt in the setup cell; anything that reads
# `prompt.uri` after this cell therefore sees this loaded version instead.
prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/2")

# Last expression of the cell: render the template as Markdown.
Markdown(prompt.template)

In [0]:
def predict_fn_gemma(query: str) -> str:
    """Classify ``query`` with Gemma 3 12B using version 2 of the prompt.

    The rendered prompt goes in the ``system`` message and the raw query is
    repeated as the ``user`` message.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    prompt_v2 = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/2")
    # The template is rendered through PromptVersion.format().
    conversation = [
        {"role": "system", "content": prompt_v2.format(query=query)},
        {"role": "user", "content": query},
    ]
    # Other endpoints used in this notebook: "databricks-gpt-oss-20b", "databricks-gpt-5".
    response = openai_client.chat.completions.create(
        model="databricks-gemma-3-12b",
        messages=conversation,
    )
    return response.choices[0].message.content

In [0]:
# Same held-out sentence as before, now sent to Gemma with the version 2 prompt.
output = predict_fn_gemma(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")

In [0]:
# Right answer: RESULTS
output

#Not correct, Not Surprising

While we did switch to a smaller model, we can see it's not as simple as giving it an optimized prompt and expecting the same performance improvements.

We should re-optimize the original prompt for the new model.

Let's do it below. I'll set up a new function to hit the gemma 3 model. 

In [0]:
def predict_fn_gemma(query: str) -> str:
    """Classify ``query`` with Gemma 3 12B using version 1 of the prompt.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    baseline_prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/1")
    # The template is rendered through PromptVersion.format().
    user_message = {"role": "user", "content": baseline_prompt.format(query=query)}
    response = openai_client.chat.completions.create(
        model="databricks-gemma-3-12b",
        messages=[user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [0]:
# Re-optimize the ORIGINAL prompt (version 1) for Gemma.
# The URI is spelled out instead of using `prompt.uri` because `prompt` was
# rebound to version 2 in an earlier cell, whereas predict_fn_gemma loads
# version 1; the prompt being optimized must be the one predict_fn formats.
result_gemma_oss = mlflow.genai.optimize_prompts(
    predict_fn=predict_fn_gemma,
    train_data=dataset,
    prompt_uris=[f"prompts:/{prompt_location}/1"],
    optimizer=GepaPromptOptimizer(reflection_model="databricks:/databricks-gpt-5-2"),
    scorers=[Correctness(model="databricks:/databricks-gpt-5")],
)
# Use the optimized prompt from THIS run; `result` still holds the earlier
# GPT-5 optimization and would print the wrong template.
optimized_prompt = result_gemma_oss.optimized_prompts[0]
print(f"Optimized template: {optimized_prompt.template}")

#We should already have decent scores

In [0]:
# Compare the evaluation score before and after optimization for the Gemma run.
print(f"Initial Score: {result_gemma_oss.initial_eval_score}\n") 
print(f"Final Score: {result_gemma_oss.final_eval_score}") 

In [0]:
from IPython.display import Markdown
# Load version 3 of the prompt so its template can be inspected.
# NOTE(review): presumably version 3 is the one produced by the Gemma
# optimization run above - confirm the version number in the prompt registry.
prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/3")

# Last expression of the cell: render the template as Markdown.
Markdown(prompt.template)

#Let's check again

In [0]:
def predict_fn_gemma_updated(query: str) -> str:
    """Classify ``query`` with Gemma 3 12B using version 3 of the prompt.

    The rendered prompt goes in the ``system`` message and the raw query is
    repeated as the ``user`` message.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    prompt_v3 = mlflow.genai.load_prompt(f"prompts:/{prompt_location}/3")
    # The template is rendered through PromptVersion.format().
    conversation = [
        {"role": "system", "content": prompt_v3.format(query=query)},
        {"role": "user", "content": query},
    ]
    # Other endpoints used in this notebook: "databricks-gpt-oss-20b", "databricks-gpt-5".
    response = openai_client.chat.completions.create(
        model="databricks-gemma-3-12b",
        messages=conversation,
    )
    return response.choices[0].message.content

In [0]:
# Same held-out sentence, now sent to Gemma with the version 3 prompt.
output = predict_fn_gemma_updated(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")

In [0]:
# Right answer: RESULTS
output

#Is it now correct? 

We can't assume an existing prompt will work, or perform well, for every model. It only takes a few minutes to re-optimize the prompt!

Let's go back to our experiment to add some aliases.

Now we can use mlflow prompt registry to load the right prompts depending on the model we want to use

In [0]:
#GPT-OSS 20B

def predict_fn(query: str) -> str:
    """Classify ``query`` with GPT-OSS 20B using the ``gpt_oss_20b`` prompt alias.

    The rendered prompt is sent as the only message, in the ``system`` role.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
        NOTE(review): the calling cell indexes the result like a list of
        content blocks, so for this endpoint the content is presumably not a
        plain string despite the annotation - confirm.
    """
    aliased_prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}@gpt_oss_20b")
    # The template is rendered through PromptVersion.format().
    system_message = {"role": "system", "content": aliased_prompt.format(query=query)}
    # Other endpoints used in this notebook: "databricks-gemma-3-12b", "databricks-gpt-5".
    response = openai_client.chat.completions.create(
        model="databricks-gpt-oss-20b",
        messages=[system_message],
    )
    return response.choices[0].message.content

# Classify the held-out sentence with GPT-OSS and its aliased prompt.
output = predict_fn(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")
# Assumes the returned content is a list of blocks whose second element
# carries the answer under the 'text' key - TODO confirm against the endpoint's
# response format.
output[1]['text']

In [0]:
#GPT-5

def predict_fn(query: str) -> str:
    """Classify ``query`` with GPT-5 using the ``gpt_oss_20b`` prompt alias.

    The rendered prompt is sent as the only message, in the ``system`` role.

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    # Alternative alias to try here: f"prompts:/{prompt_location}@gpt_5"
    aliased_prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}@gpt_oss_20b")
    # The template is rendered through PromptVersion.format().
    system_message = {"role": "system", "content": aliased_prompt.format(query=query)}
    # Other endpoints used in this notebook: "databricks-gemma-3-12b", "databricks-gpt-oss-20b".
    response = openai_client.chat.completions.create(
        model="databricks-gpt-5",
        messages=[system_message],
    )
    return response.choices[0].message.content

# Classify the held-out sentence with the predict_fn defined in this cell.
output = predict_fn(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")
# Last expression of the cell: display the reply.
output

In [0]:
#GPT5

def predict_fn_gemma(query: str) -> str:
    """Classify ``query`` with Gemma 3 12B using the ``gpt5`` prompt alias.

    The rendered prompt goes in the ``system`` message and the raw query is
    repeated as the ``user`` message.

    NOTE(review): confirm that the ``gpt5`` alias exists in the registry; the
    other alias used in this notebook is spelled with underscores
    (``gpt_oss_20b``).

    Args:
        query: Text substituted into the prompt's ``query`` template variable.

    Returns:
        The message content of the first completion choice.
    """
    aliased_prompt = mlflow.genai.load_prompt(f"prompts:/{prompt_location}@gpt5")
    # The template is rendered through PromptVersion.format().
    conversation = [
        {"role": "system", "content": aliased_prompt.format(query=query)},
        {"role": "user", "content": query},
    ]
    # Other endpoints used in this notebook: "databricks-gpt-oss-20b", "databricks-gpt-5".
    response = openai_client.chat.completions.create(
        model="databricks-gemma-3-12b",
        messages=conversation,
    )
    return response.choices[0].message.content

# Classify the held-out sentence with the predict_fn_gemma defined in this cell.
output = predict_fn_gemma(query="Orbital steroid injection for thyroid-related ophthalmopathy is effective and safe .")
# Last expression of the cell: display the reply.
output