In [1]:
import litellm
# Switch on LiteLLM's debug logging for the whole session; this is what emits
# the "LiteLLM:DEBUG" / "LiteLLM:INFO" lines interleaved with the cell output.
litellm._turn_on_debug()


def query_litellm(model_name: str, prompt: str) -> str:
    """
    Send one user message to a model through LiteLLM and return the reply text.

    Args:
        model_name (str): LiteLLM model identifier (e.g. 'ollama/qwen2.5').
        prompt (str): Text sent as the only message, with role "user".

    Returns:
        str: The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = litellm.completion(model=model_name, messages=[user_message])

    # Only the first choice is read; any further choices are ignored.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

def call_models(
    prompt: str,
    model1_name: str = 'ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF',
    model2_name: str = 'ollama/phi3',
) -> tuple:
    """
    Send the same prompt to two models and return both replies.

    Args:
        prompt (str): Prompt text passed unchanged to each model.
        model1_name (str): LiteLLM identifier of the first model.
        model2_name (str): LiteLLM identifier of the second model.

    Returns:
        tuple: (model1_response, model2_response), in that order.
    """
    # The models are queried one after the other, model 1 first.
    return tuple(
        query_litellm(model_name=name, prompt=prompt)
        for name in (model1_name, model2_name)
    )


In [2]:

# Company-specific risk scenarios; each one is sent to both models and then
# to the judge. Adjacent string literals are concatenated into one scenario.
risk_scenarios = [
    (
        "An investor is heavily invested in Nvidia. "
        "A new AI paradigm called DeepSeek emerges out of China, "
        "dramatically reducing the need for GPUs. "
        "Assess the risk to Nvidia's stock."
    ),
    (
        "An investor holds a concentrated position in Tesla. "
        "BYD announces a new solid-state battery that doubles range, halves cost, "
        "and enables near-instant charging, removing the need for most public "
        "charging infrastructure. "
        "Analyze the potential impact on Tesla."
    ),
]

# --- Step 2: Shared prompt for both models ---
def generate_risk_prompt(scenario: str) -> str:
    """
    Build the analyst prompt that is sent to each model for one scenario.

    Args:
        scenario (str): Scenario description; embedded in double quotes.

    Returns:
        str: Prompt asking for reasoning in <think> tags and a summary in
            <answer> tags. It starts and ends with a newline.
    """
    prompt_lines = [
        "",  # leading newline
        "You are a financial analyst evaluating the impact of a company-specific market scenario.",
        "Explain how this scenario might affect the investor's portfolio using Chain-of-Thought (CoT) reasoning.",
        "Wrap your reasoning in <think> tags and provide a summary inside <answer> tags.",
        "",
        "Scenario:",
        f'"{scenario}"',
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)




In [3]:
import sqlite3

def judge_risk_outputs(scenario, output_a, output_b):
    """
    Build the prompt asking a judge model to compare two model responses.

    This function only formats text; it does not call any model itself.

    Args:
        scenario: Scenario description, embedded in double quotes.
        output_a: Response shown to the judge as "Model A".
        output_b: Response shown to the judge as "Model B".

    Returns:
        str: Prompt containing the scenario, both responses, the three
            evaluation criteria and the required answer format.
    """
    # Every section of the prompt is separated by a "---" rule with blank lines around it.
    divider = "\n\n---\n\n"

    intro = (
        "\n"
        "You are a senior equity analyst comparing two CoT reasoning outputs for a company-specific risk scenario.\n"
        "Evaluate which response is more insightful, realistic, and helpful for an investor.\n"
        "\n"
        "Scenario:\n"
        f'"{scenario}"'
    )
    candidate_a = f"Model A:\n{output_a}"
    candidate_b = f"Model B:\n{output_b}"
    rubric = (
        "Evaluate each model on:\n"
        "1. Accuracy of market and company-specific reasoning\n"
        "2. Clarity and insight of explanation\n"
        "3. Proper structure using <think> and <answer> tags\n"
        "\n"
        "Respond in this format:\n"
        "\n"
        "Model A Score: <score>/10\n"
        "Justification A: <reasoning>\n"
        "\n"
        "Model B Score: <score>/10\n"
        "Justification B: <reasoning>\n"
        "\n"
        "Preferred Model: Model A or Model B\n"
    )
    return divider.join([intro, candidate_a, candidate_b, rubric])

# --- Step 5: Setup SQLite storage ---
# Results accumulate in a local SQLite file: the table is created only if it
# is missing and rows are only ever inserted, so re-running this cell appends.
conn = sqlite3.connect("risk_scenario_results.db")
try:
    cursor = conn.cursor()

    cursor.execute("""
CREATE TABLE IF NOT EXISTS risk_results (
    scenario TEXT,
    fin_r1_output TEXT,
    phi3_output TEXT,
    judge_request TEXT,
    judge_result TEXT
)
""")

    # --- Step 6: Run analysis and store ---
    for scenario in risk_scenarios:
        # call_models is used with its defaults: Fin-R1 first, phi3 second.
        fin_r1_output, phi3_output = call_models(generate_risk_prompt(scenario))
        print("\n--- Fin-R1 Output ---\n", fin_r1_output)
        # The second response comes from phi3; qwen2.5 is only the judge below.
        print("\n--- Phi3 Output ---\n", phi3_output)
        judge_request = judge_risk_outputs(scenario, fin_r1_output, phi3_output)
        judge_result = query_litellm(model_name='ollama/qwen2.5', prompt=judge_request)
        print("\n>> Judge Evaluation:\n", judge_result)
        cursor.execute("""
    INSERT INTO risk_results (scenario, fin_r1_output, phi3_output, judge_request, judge_result)
    VALUES (?, ?, ?, ?, ?)""",
        (scenario, fin_r1_output, phi3_output, judge_request, judge_result))
        # Commit per scenario so finished results survive a failure on a later one.
        conn.commit()
finally:
    # Always release the database handle, even if a model call or insert raises.
    conn.close()


[92m20:07:32 - LiteLLM:DEBUG[0m: utils.py:311 - 

[92m20:07:32 - LiteLLM:DEBUG[0m: utils.py:311 - [92mRequest to litellm:[0m
[92m20:07:32 - LiteLLM:DEBUG[0m: utils.py:311 - [92mlitellm.completion(model='ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', messages=[{'role': 'user', 'content': '\nYou are a financial analyst evaluating the impact of a company-specific market scenario.\nExplain how this scenario might affect the investor\'s portfolio using Chain-of-Thought (CoT) reasoning.\nWrap your reasoning in <think> tags and provide a summary inside <answer> tags.\n\nScenario:\n"An investor is heavily invested in Nvidia. A new AI paradigm called DeepSeek emerges out of China, dramatically reducing the need for GPUs. Assess the risk to Nvidia\'s stock."\n'}])[0m
[92m20:07:32 - LiteLLM:DEBUG[0m: utils.py:311 - 

[92m20:07:32 - LiteLLM:DEBUG[0m: litellm_logging.py:388 - self.optional_params: {}
[92m20:07:32 - LiteLLM:DEBUG[0m: utils.py:311 - SYNC kwargs[caching]: False; litellm.c


--- Fin-R1 Output ---
 <think>
Okay, so I need to analyze how the emergence of a new AI paradigm called DeepSeek from China that reduces the demand for GPUs would impact an investor who is heavily invested in Nvidia. Let me start by breaking down the scenario and thinking through the potential effects step by step.

First, let's understand what the DeepSeek technology does. It's described as a new AI approach that significantly cuts down on the need for GPUs. GPUs, or Graphics Processing Units, are crucial components in many computing tasks, especially those related to artificial intelligence, machine learning, and deep learning. They are used extensively by companies like Nvidia because their graphics cards (like the RTX series) are designed specifically for these workloads.

If DeepSeek eliminates the need for GPUs, that could mean a drastic reduction in demand for Nvidia's products. For example, if a new technology allows AI models to be trained and run without relying on specializ

[92m20:10:04 - LiteLLM:INFO[0m: utils.py:1165 - Wrapper: Completed Call, calling success_handler
[92m20:10:04 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/qwen2.5
[92m20:10:04 - LiteLLM:DEBUG[0m: litellm_logging.py:1089 - Logging Details LiteLLM-Success Call: Cache_hit=None
[92m20:10:04 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost: {'split_model': 'qwen2.5', 'combined_model_name': 'ollama/qwen2.5', 'stripped_model_name': 'qwen2.5', 'combined_stripped_model_name': 'ollama/qwen2.5', 'custom_llm_provider': 'ollama'}
[92m20:10:04 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/qwen2.5
[92m20:10:04 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost: {'split_model': 'qwen2.5', 'combined_model_name': 'ollama/qwen2.5', 'stripped_model_name': 'qwen2.5', 'combined_stripped_model_name': 'ollama/qwen2.5', 'custom_


>> Judge Evaluation:
 Model A Score: 8/10
Justification A: 
- **Accuracy of Market and Company-Specific Reasoning**: The response provides a detailed analysis of the impact of DeepSeek on Nvidia's business, considering both immediate financial effects (reduced demand for GPUs) and long-term strategic implications. It accurately identifies key factors such as revenue streams, diversification potential, and geopolitical considerations.
- **Clarity and Insight of Explanation**: While the explanation is clear, it could benefit from being more concise without sacrificing depth. The detailed step-by-step analysis makes it insightful but slightly longer than necessary.
- **Proper Structure Using <think> and <answer> Tags**: The response uses these tags effectively to separate the thought process from the conclusion.

Model B Score: 5/10
Justification B:
- **Accuracy of Market and Company-Specific Reasoning**: This model is less detailed and lacks a comprehensive analysis. It mentions the pot

[92m20:10:50 - LiteLLM:INFO[0m: utils.py:1165 - Wrapper: Completed Call, calling success_handler
[92m20:10:50 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF
[92m20:10:50 - LiteLLM:DEBUG[0m: litellm_logging.py:1089 - Logging Details LiteLLM-Success Call: Cache_hit=None
[92m20:10:50 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost: {'split_model': 'hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', 'combined_model_name': 'ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', 'stripped_model_name': 'hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', 'combined_stripped_model_name': 'ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF', 'custom_llm_provider': 'ollama'}
[92m20:10:50 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/hf.co/ernanhughes/Fin-R1-Q8_0-GGUF
[92m20:10:50 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost


--- Fin-R1 Output ---
 <think>
Okay, let's try to figure out how this scenario might affect an investor holding a concentrated position in Tesla. The key event here is that BYD, a Chinese automaker, has announced a new solid-state battery technology that significantly improves performance and reduces costs compared to existing batteries.

First, I need to understand what the new technology does: it doubles the range of electric vehicles (EVs), halves production costs, and allows for near-instant charging. These are massive improvements over current battery tech. The immediate thought is that this could make BYD's EVs more competitive in the market, especially since cost and range anxiety are big factors for consumers.

But wait, the investor here has a large stake in Tesla. So how does this impact them? Let's break it down step by step. 

1. **Competitive Advantage**: If BYD's new battery technology is as revolutionary as described, it could give BYD an edge over other EV manufacturer

[92m20:11:11 - LiteLLM:INFO[0m: utils.py:1165 - Wrapper: Completed Call, calling success_handler
[92m20:11:11 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/qwen2.5
[92m20:11:11 - LiteLLM:DEBUG[0m: litellm_logging.py:1089 - Logging Details LiteLLM-Success Call: Cache_hit=None
[92m20:11:11 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost: {'split_model': 'qwen2.5', 'combined_model_name': 'ollama/qwen2.5', 'stripped_model_name': 'qwen2.5', 'combined_stripped_model_name': 'ollama/qwen2.5', 'custom_llm_provider': 'ollama'}
[92m20:11:11 - LiteLLM:INFO[0m: cost_calculator.py:588 - selected model name for cost calculation: ollama/qwen2.5
[92m20:11:11 - LiteLLM:DEBUG[0m: utils.py:4300 - checking potential_model_names in litellm.model_cost: {'split_model': 'qwen2.5', 'combined_model_name': 'ollama/qwen2.5', 'stripped_model_name': 'qwen2.5', 'combined_stripped_model_name': 'ollama/qwen2.5', 'custom_


>> Judge Evaluation:
 Model A Score: 8/10
Justification A: 
- **Accuracy of Market and Company-Specific Reasoning**: The analysis is thorough, providing a detailed breakdown of the potential impacts on Tesla from BYD's technological advancement. It accurately identifies key factors such as cost reduction, range improvement, and charging infrastructure needs.
- **Clarity and Insight of Explanation**: While the explanation is clear, it could be more concise without losing critical insights. The step-by-step analysis offers valuable depth but can sometimes be overly verbose.
- **Proper Structure Using <think> and <answer> Tags**: The structure is well-organized with a logical flow from the initial scenario to specific points of impact.

Model B Score: 6/10
Justification B: 
- **Accuracy of Market and Company-Specific Reasoning**: The analysis is somewhat lacking in depth. It mentions some key impacts but lacks detailed reasoning or step-by-step breakdowns that provide a clearer understan