In [10]:
import litellm
# Switch on LiteLLM's debug logging so each request is echoed to the output
# (this produces the "LiteLLM:DEBUG" lines seen when the cells below run).
litellm._turn_on_debug()


def query_litellm(model_name: str, prompt: str) -> str:
    """
    Send a single-turn user prompt to a model through LiteLLM and return its reply.

    Args:
        model_name (str): LiteLLM model identifier
            (e.g. 'ollama/qwen2.5' or 'huggingface/your-model').
        prompt (str): Text sent as the only message, with role 'user'.

    Returns:
        str: The 'content' field of the first choice's message in the response.
    """
    # The whole conversation is one user message; no system prompt or history.
    conversation = [{"role": "user", "content": prompt}]
    reply = litellm.completion(model=model_name, messages=conversation)

    # Only the first returned choice is used.
    first_choice = reply['choices'][0]
    return first_choice['message']['content']

def call_models(
    prompt: str,
    model1_name: str = 'ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF',
    model2_name: str = 'ollama/phi3',
) -> tuple:
    """
    Send the same prompt to two models and return both replies.

    Args:
        prompt (str): The prompt forwarded unchanged to each model.
        model1_name (str): LiteLLM identifier of the first model.
        model2_name (str): LiteLLM identifier of the second model.

    Returns:
        tuple: (first model's response, second model's response).
    """
    # The models are queried sequentially, first model first, so the result
    # order matches the parameter order.
    return tuple(
        query_litellm(model_name=name, prompt=prompt)
        for name in (model1_name, model2_name)
    )


In [11]:
# --- Step 1: Task list ---
# Natural-language coding tasks; each entry is turned into a prompt and sent
# to the models being compared (see the loop that iterates quant_tasks).
quant_tasks = [
    "Write Python code to calculate the Sharpe Ratio for a stock given a list of daily returns and a risk-free rate.",
    "Generate Python code to backtest a moving average crossover strategy using two different windows.",
    "Create Python code to compute Value at Risk (VaR) using historical simulation for a portfolio of returns.",
    "Write Python code to price a European call option using the Black-Scholes formula.",
    "Generate code to simulate a geometric Brownian motion for an asset price.",
    "Write Python code to calculate the duration and convexity of a bond given its cash flows and yield.",
    "Generate Monte Carlo simulation code to price a European call option.",
    "Write Python code to calculate Delta and Gamma for a European call option using the Black-Scholes model."
]

# --- Step 2: Prompt format ---
def generate_code_prompt(task: str) -> str:
    """
    Build the code-generation prompt for a single task.

    Args:
        task (str): Description of the problem the model should solve.

    Returns:
        str: The prompt text. It starts with a newline, every non-blank line
        is indented by four spaces, and it ends with a four-space line.
    """
    pad = " " * 4
    rules = (
        "- Explain your approach in <think> tags.",
        "- Include all Python code within <code> tags.",
        "- Use comments to clarify logic and assumptions.",
        "- Ensure the code runs correctly and handles potential edge cases.",
        "- Prefer standard libraries like NumPy, pandas, and SciPy where applicable.",
        "- Avoid excessive complexity; prioritize clarity and correctness.",
    )
    lines = [
        "",  # the prompt opens with a newline
        pad + "You are a quantitative financial developer. Write clean, documented Python code to solve the following problem:",
        "",
        f"{pad}{task}",
        "",
        pad + "Instructions:",
        *(pad + rule for rule in rules),
        pad,  # trailing indentation-only line closes the prompt
    ]
    return "\n".join(lines)

def generate_comparison_validation_prompt(task: str, fin_output: str, qwen_output: str) -> str:
    return f"""
You are a senior quant engineer evaluating code generated by two models for the following task:

Task:
"{task}"

Model A Output:
{fin_output}

Model B Output:
{qwen_output}

Instructions:
1. Compare both outputs in terms of correctness, clarity, and use of financial logic.
2. Identify bugs, missing features, and best practices.
3. Score each out of 10.
4. Justify why one model is better.

Respond with:
Model A Score: <score>/10
Justification A: <reasoning>

Model B Score: <score>/10
Justification B: <reasoning>

Preferred Model: Model A or Model B
"""


In [None]:
import sqlite3
# --- Step 3: Open the results database ---
# conn and cursor are module-level; call_models_and_compare() below uses them
# to insert rows and commit.
conn = sqlite3.connect("quant_code_results.db")
cursor = conn.cursor()

# --- Step 4: Create the results table if it does not exist yet ---
# One row per task: both model outputs, the judge prompt, and the judge reply.
cursor.execute("""
CREATE TABLE IF NOT EXISTS code_comparison (
    task TEXT,
    fin_r1_output TEXT,
    qwen2_output TEXT,
    comparison_prompt TEXT,
    judge_response TEXT
)
""")

conn.commit()

# --- Step 5: Generate, compare, and store results ---
def call_models_and_compare(task: str, judge_model: str = "ollama/qwen2.5") -> None:
    """
    Run one task through both models, have a judge compare them, and store the result.

    Args:
        task (str): The coding task description sent to both models.
        judge_model (str): LiteLLM model identifier used to judge the two
            outputs. Defaults to 'ollama/qwen2.5', the value that used to be
            hard-coded, so existing single-argument calls behave as before.

    Returns:
        None. The judge's response is printed, and one row is inserted into
        the code_comparison table and committed via the module-level
        cursor/conn.
    """
    fin_output, qwen_output = call_models(generate_code_prompt(task))
    # Create validation prompt comparing both
    comparison_prompt = generate_comparison_validation_prompt(task, fin_output, qwen_output)
    judge_response = query_litellm(judge_model, comparison_prompt)
    print(judge_response)
    # Store in database; placeholders keep model-generated text out of the SQL itself.
    cursor.execute("""
        INSERT INTO code_comparison (
            task, fin_r1_output, qwen2_output, comparison_prompt, judge_response
        ) VALUES (?, ?, ?, ?, ?)""",
        (task, fin_output, qwen_output, comparison_prompt, judge_response))
    # Commit per task so completed comparisons survive a failure on a later task.
    conn.commit()

# --- Step 6: Run for all tasks ---
# Close the connection even if a model call or insert raises part-way through;
# previously an exception skipped conn.close() and left the database handle open.
try:
    for task in quant_tasks:
        call_models_and_compare(task)
finally:
    conn.close()


[92m00:32:16 - LiteLLM:DEBUG[0m: utils.py:311 - 

[92m00:32:16 - LiteLLM:DEBUG[0m: utils.py:311 - [92mRequest to litellm:[0m
[92m00:32:16 - LiteLLM:DEBUG[0m: utils.py:311 - [92mlitellm.completion(model='ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', messages=[{'role': 'user', 'content': '\n    You are a quantitative financial developer. Write clean, documented Python code to solve the following problem:\n\n    Write Python code to calculate the Sharpe Ratio for a stock given a list of daily returns and a risk-free rate.\n\n    Instructions:\n    - Explain your approach in <think> tags.\n    - Include all Python code within <code> tags.\n    - Use comments to clarify logic and assumptions.\n    - Ensure the code runs correctly and handles potential edge cases.\n    - Prefer standard libraries like NumPy, pandas, and SciPy where applicable.\n    - Avoid excessive complexity; prioritize clarity and correctness.\n    '}])[0m
[92m00:32:16 - LiteLLM:DEBUG[0m: utils.py:311 - 

[92m0