In [2]:
import litellm
# Turn on LiteLLM's debug output for every completion call made below.
# NOTE(review): presumably only wanted while experimenting — confirm before reuse.
litellm._turn_on_debug()


def query_litellm(model_name: str, prompt: str) -> str:
    """
    Send one user message to a model through LiteLLM and return its reply.

    Args:
        model_name (str): Model identifier handed to ``litellm.completion``
            (e.g. 'ollama/qwen2.5' or 'huggingface/your-model').
        prompt (str): Text sent as the content of a single "user" message.

    Returns:
        str: The message content of the first choice in the response.
    """
    # Single-turn conversation: exactly one user message, no system prompt.
    user_message = {"role": "user", "content": prompt}
    completion = litellm.completion(model=model_name, messages=[user_message])

    # Only the first choice is read; any further choices are ignored.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

def call_models(
    prompt: str,
    model1_name: str = 'ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF',
    model2_name: str = 'ollama/phi3',
) -> tuple:
    """
    Send the same prompt to two models and return both replies.

    Args:
        prompt (str): Prompt forwarded unchanged to each model.
        model1_name (str): Identifier of the first model queried.
        model2_name (str): Identifier of the second model queried.

    Returns:
        tuple: ``(model1_response, model2_response)``, in that order.
    """
    # The models are queried one after the other, first model first.
    return tuple(
        query_litellm(model_name=name, prompt=prompt)
        for name in (model1_name, model2_name)
    )


In [1]:
# --- Step 1: Task descriptions ---
# Plain-English quantitative-finance coding tasks. Each entry is a complete
# sentence that is embedded verbatim into the prompts built from it.
quant_tasks = [
    "Write Python code to calculate the Sharpe Ratio for a stock given a list of daily returns and a risk-free rate.",
    "Generate Python code to backtest a moving average crossover strategy using two different windows.",
    "Create Python code to compute Value at Risk (VaR) using historical simulation for a portfolio of returns.",
    "Write Python code to price a European call option using the Black-Scholes formula.",
    "Generate code to simulate a geometric Brownian motion for an asset price.",
    "Write Python code to calculate the duration and convexity of a bond given its cash flows and yield.",
    "Generate Monte Carlo simulation code to price a European call option.",
    "Write Python code to calculate Delta and Gamma for a European call option using the Black-Scholes model."
]

# --- Step 2: Prompt format ---
def generate_code_prompt(task: str) -> str:
    """
    Build the code-generation prompt for a single task description.

    Args:
        task (str): Task description inserted verbatim into the prompt.

    Returns:
        str: The prompt text, including the leading newline and the
        four-space indentation carried by the template literal.
    """
    # The template is kept as one literal so the whitespace sent to the model
    # stays exactly as written here; only the task slot is substituted.
    template = """
    You are a quantitative financial developer. Write clean, documented Python code to solve the following problem:

    {task}

    Instructions:
    - Explain your approach in <think> tags.
    - Include all Python code within <code> tags.
    - Use comments to clarify logic and assumptions.
    - Ensure the code runs correctly and handles potential edge cases.
    - Prefer standard libraries like NumPy, pandas, and SciPy where applicable.
    - Avoid excessive complexity; prioritize clarity and correctness.
    """
    return template.format(task=task)

def generate_validation_prompt(task: str, code_output: str) -> str:
    """
    Build the review prompt asking a model to assess generated code.

    Args:
        task (str): Task description the code was written for; it is placed
            inside double quotes in the prompt.
        code_output (str): The model output under review, inserted verbatim.

    Returns:
        str: The prompt text, including the leading newline and the
        four-space indentation carried by the template literal.
    """
    # Both slots are filled by plain substitution; braces inside the inserted
    # values are not interpreted.
    template = """
    You are a senior quantitative engineer reviewing Python code for accuracy.
    The code was written to address the following financial task:

    "{task}"

    Here is the model's output:
    {code_output}

    Instructions:
    1. Verify that the code correctly solves the stated task.
    2. Identify any bugs, logical flaws, or poor practices.
    3. Comment on use of financial logic, edge case handling, and numerical stability.
    4. Score the solution out of 10 for correctness and clarity.

    Respond with:
    - Score: <X>/10
    - Critique: <brief assessment>
    - Recommendations: <concrete fixes or improvements>
    """
    return template.format(task=task, code_output=code_output)


In [None]:
# --- Set up SQLite storage for the generated code and its validation ---
# sqlite3 is imported here because no earlier cell in this file imports it.
import sqlite3

# Opens (or creates) the database file in the current working directory.
conn = sqlite3.connect("quant_code_results.db")
cursor = conn.cursor()

# One row per (task, model) run; every column holds free text.
# IF NOT EXISTS makes re-running this cell safe against an existing database.
cursor.execute("""
CREATE TABLE IF NOT EXISTS code_evaluation (
    task TEXT,
    model_name TEXT,
    code_prompt TEXT,
    code_output TEXT,
    validation_prompt TEXT,
    validation_response TEXT
)
""")

conn.commit()

# --- Step 5: Generate code with a model and run validation ---
# Short labels mapped to LiteLLM model identifiers. A name that is not listed
# here is passed to LiteLLM unchanged.
# NOTE(review): the identifiers are the ones that appear elsewhere in this file
# (the call_models default and the query_litellm docstring example) — confirm
# they match the models actually installed.
_MODEL_ALIASES = {
    "Fin-R1": "ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF",
    "Qwen2.5": "ollama/qwen2.5",
}


def call_model_and_validate(model_name: str, task: str):
    """
    Generate code for a task, have it reviewed, and store the whole exchange.

    The previous body read ``response`` and ``validation`` without ever
    assigning them, so every call raised NameError; both prompts are now
    actually sent through ``query_litellm``.

    Args:
        model_name (str): Label stored in the ``model_name`` column. It is
            resolved through ``_MODEL_ALIASES`` to obtain the identifier
            used for the LiteLLM calls.
        task (str): Task description used to build both prompts.

    Returns:
        None. One row is inserted into ``code_evaluation`` and committed.

    Raises:
        Whatever ``query_litellm`` or the SQLite insert raises; nothing is
        caught here, and no row is written if either model call fails.
    """
    litellm_model = _MODEL_ALIASES.get(model_name, model_name)

    # Generate the code
    code_prompt = generate_code_prompt(task)
    code_output = query_litellm(litellm_model, code_prompt).strip()

    # Validate the code.
    # NOTE(review): the same model reviews its own output because the table
    # has no column for a separate validator — confirm this is intended.
    validation_prompt = generate_validation_prompt(task, code_output)
    validation_response = query_litellm(litellm_model, validation_prompt).strip()

    # Store everything in the database (parameterized, so model text is
    # never interpolated into the SQL).
    cursor.execute("""
        INSERT INTO code_evaluation (
            task, model_name, code_prompt, code_output, validation_prompt, validation_response
        ) VALUES (?, ?, ?, ?, ?, ?)
    """, (task, model_name, code_prompt, code_output, validation_prompt, validation_response))
    conn.commit()

# --- Step 6: Run for each model and task ---
# For every task, Fin-R1 is run first and Qwen2.5 second.
try:
    for task in quant_tasks:
        for model_label in ("Fin-R1", "Qwen2.5"):
            call_model_and_validate(model_label, task)
finally:
    # Close the connection even when a model call or an insert raises, so the
    # database handle is not left open after a failed run.
    conn.close()
