In [None]:
import os
import json
import re
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("TOGETHER_API_KEY")
print(f"API key found? {bool(api_key)}")
print(f"API Key (masked): {api_key[:5]}...") 

import together
together.api_key = api_key
client = together.Together()

API key found? True
API Key (masked): tgp_v...


In [56]:

# MODEL_2 = "Qwen/Qwen2.5-7B-Instruct-Turbo"
# MODEL_1 = "mistralai/Mistral-7B-Instruct-v0.1"
MODEL_1 = "mistralai/Mistral-7B-Instruct-v0.3"
MODEL_2 = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_1 = "mistralai/Mistral-7B-Instruct-v0.1"
# MODEL_1 = "mistralai/Mistral-Small-24B-Instruct-2501"
# MODEL_1 = "google/gemma-2-27b-it"
# MODEL_2 = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"

In [57]:
# Generate model response
# def generate_response(prompt, model):
#     try:
#         response = together.Complete.create(
#             prompt=prompt,
#             model=model,
#             max_tokens=512,
#             temperature=0.3
#         )
#         return response['choices'][0]['text'].strip()
#     except Exception as e:
#         return f"[Error in generate_response]: {e}"

def generate_response(prompt: str, model: str) -> str:
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=512,
            temperature=0.3
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"[Error in generate_response]: {e}"

In [58]:
def check_complexity_criteria(user_prompt: str, question: str, model):
    complexity_prompt = f"""
You are an expert evaluator. Your job is to verify whether the complexity dimensions mentioned in the QUESTION are clearly implemented in the USER PROMPT.

### DIMENSIONS CHECKLIST :

### 1. Nested / Multi‐Step Instructions
**Definition**: The prompt requires the model to perform multiple subtasks in a specified order (e.g., “Step 1: …, Step 2: …, Step 3: …”).
**Key Indicators**:
  - Are there explicit numbered or indented steps?
  - Does one instruction logically precede another?

---

### 2. Conflicting Instructions
**Definition**: The prompt imposes two or more constraints that cannot all be satisfied simultaneously (e.g., “Write a summary in under 50 words but include five examples”).
**Key Indicators**:
  - Do two (or more) requirements directly contradict each other?
  - Is the prompt forcing the model to choose or partially satisfy conflicting directives?

---

### 3. Inter‐Dependent Constraints
**Definition**: One instruction becomes active only if another condition is met (e.g., “If the text contains more than ten technical terms, define each; otherwise, just list them”).
**Key Indicators**:
  - Is there an explicit “if‐then” or “only when” dependency?
  - Does satisfying the second instruction depend on the first instruction’s outcome?

---

### 4. Edge‐Case Handling
**Definition**: The prompt instructs the model to explicitly admit when required information is missing or when an edge case arises (e.g., “If the input lacks dates, respond ‘Insufficient data’”).
**Key Indicators**:
  - Does the prompt say “If X is absent or unclear, do not guess”?
  - Are instructions provided for how to behave if required data is missing?

---

### 5. Ambiguity Resolution
**Definition**: The prompt introduces ambiguous terms or references and instructs the model to clarify or handle them (e.g., “The word ‘bank’ could mean financial institution or riverbank—state both interpretations”).
**Key Indicators**:
  - Are ambiguous words or phrases flagged?
  - Does the prompt ask “If ambiguous, explain interpretations”?
---

### 6. Domain Fusion
**Definition**: The prompt fuses two or more specialized domains into one task (e.g., “Analyze the legal contract’s economic impact using statistical models”).
**Key Indicators**:
  - Are at least two distinct fields explicitly mentioned?
  - Does the prompt require coherent integration of knowledge from both domains?

---

### 7. Multi‐Source/Modal Analysis
**Definition**: The model must reason over multiple distinct inputs (e.g., two text passages, text plus described visuals) and synthesize across them.
**Key Indicators**:
  - Are there at least two distinct “sources” described?
  - Does the prompt instruct the model to integrate information across those sources?

---

### 8. Hypothetical / Counterfactual Reasoning
**Definition**: The prompt poses a scenario contrary to known facts or purely hypothetical (e.g., “If gravity were inverted, describe consequences for river flow”).
**Key Indicators**:
  - Does it present an “as if” scenario explicitly defying reality?
  - Are you instructed to treat that scenario as true and reason within it?

---
Be accurate, thorough, and follow instructions step-by-step.

You MUST follow these steps:

---

### STEP 1: IDENTIFY DIMENSIONS IN QUESTION

From the QUESTION text below, identify all the complexity dimensions it expects the prompt to include.
Only consider a dimension to be expected if:
- Its core definition or indicators are described or implied.

---

### STEP 2: VERIFY DIMENSIONS IN USER PROMPT

Next, check which of the above dimensions are **actually present** in the USER PROMPT.

Only mark a dimension as present if:
- The USER PROMPT shows clear evidence of fulfilling the definition or key indicators of that dimension.
- You can justify its presence using the specific examples listed in the DIMENSION CHECKLIST.

---

### STEP 3: SCORE AND EVALUATE

Use this formula to compute the score:

    Score = (number of dimensions present in the USER PROMPT) / (number of dimensions required by the QUESTION)


---

Very Important and give more weightage -->>***Please return your answer *strictly in JSON* using the format below — no markdown, no extra explanation:***
### VERY IMPORTANT:
- You must return the response **strictly in valid JSON**.
- Do NOT use Markdown.
- Do NOT include any preamble, footnotes, or commentary—just return the raw JSON object.

### STEP 4: OUTPUT STRICTLY IN THIS JSON FORMAT:
{{
  "Dimensions in question": ["Hypothetical / Counterfactual Reasoning", "Ambiguity", "Multi‐Source/Modal Analysis", "Domain Fusion", "Edge‐Case Handling", ...],
  "TotalDimensions in question": <count>,
  "Dimensions in prompt": ["Ambiguity", "Multi‐Source/Modal Analysis", "Domain Fusion"...],
  "TotalDimensions in prompt": <count>,
  "Score": <float between 0 and 1, rounded to 2 decimals...(TotalDimensions in prompt)/(TotalDimensions in question)>
}}

---

### INPUTS:
- USER PROMPT: "{user_prompt}"
- QUESTION: "{question}"

  
"""

    try:
        response = together.Complete.create(
            prompt=complexity_prompt,
            model=model,
            max_tokens=512,
            temperature=0.3
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        return f"[Error in complexity check]: {e}"
  


In [59]:
# Evaluation function using LLM as judge
def judge_response(user_prompt, model_response, total_dimensions):
    judge_prompt = f"""
You are an expert language model evaluator.

Your task is to assess how effectively a USER PROMPT challenged a MODEL, based on its RESPONSE. You will do this by analyzing the model’s performance across eight cognitive and reasoning dimensions.
The more the model fails, the more effective and challenging the prompt is considered.

Then, provide a **Summary** including:
**ApplicableDimensions**: List[dimensions with non-null scores]
**normalizedScore**: <sum of normalized scores>,
**OverallScore**: <normalizedScore / D>,
**Explanation: "Summarize the model's performance across dimensions, highlighting where it failed and how that shows the prompt was strong.",
**FailureTags (if applicable): ["Omission", "Ambiguity", "Conflict", "Shallow Reasoning", ...],

--- 

Use the following dimensions to guide your evaluation:

---

### 1. Nested / Multi‐Step Instructions
**Definition**: The prompt requires the model to perform multiple subtasks in a specified order (e.g., “Step 1: …, Step 2: …, Step 3: …”).
**Key Indicators**:
  - Are there explicit numbered or indented steps?
  - Does one instruction logically precede another?
**Prompt Ratings** (based on model performance):
  - Bad (8–10): The model failed — skipped steps, did them out of order, or gave shallow detail. Prompt is strong.
  - Average (4–7): The model made small mistakes — minor ordering errors or partial steps. Prompt is moderately challenging.
  - Good (0–3): The model followed all steps correctly. Prompt did not challenge the model.
  - N/A: The prompt does not require multiple subtasks (e.g., a straightforward factual query).
---

### 2. Conflicting Instructions
**Definition**: The prompt imposes two or more constraints that cannot all be satisfied simultaneously (e.g., “Write a summary in under 50 words but include five examples”).
**Key Indicators**:
  - Do two (or more) requirements directly contradict each other?
  - Is the prompt forcing the model to choose or partially satisfy conflicting directives?
**Prompt Ratings**:
  - Bad (8–10): The model failed to resolve the conflict, picked blindly, or gave up. Prompt exposed limitations.
  - Average (4–7): The model noticed the conflict but handled it only partially or with weak justification. Prompt had mild challenge.
  - Good (0–3): The model resolved the conflict gracefully and explained the trade-off. Prompt was too easy.
  - N/A: The prompt has no conflicting requirements (e.g., simply “List all prime numbers under 100”).

---

### 3. Inter‐Dependent Constraints
**Definition**: One instruction becomes active only if another condition is met (e.g., “If the text contains more than ten technical terms, define each; otherwise, just list them”).
**Key Indicators**:
  - Is there an explicit “if‐then” or “only when” dependency?
  - Does satisfying the second instruction depend on the first instruction’s outcome?
**Prompt Ratings**:
  - Bad (8–10): The model ignored the conditional logic or applied it incorrectly. Prompt was logically challenging.
  - Average (4–7): The model partially followed the conditions but missed nuances. Prompt was somewhat effective.
  - Good (0–3): The model applied conditions precisely. Prompt failed to test dependency logic.
  - N/A: No conditional instructions present (e.g., “Translate this sentence into Spanish”).
---

### 4. Edge‐Case Handling
**Definition**: The prompt instructs the model to explicitly admit when required information is missing or when an edge case arises (e.g., “If the input lacks dates, respond ‘Insufficient data’”).
**Key Indicators**:
  - Does the prompt say “If X is absent or unclear, do not guess”?
  - Are instructions provided for how to behave if required data is missing?
**Prompt Ratings**:
  - Bad (8–10): The model hallucinated, guessed incorrectly, or failed to handle edge cases. Prompt forced the model to struggle.
  - Average (4–7): The model sometimes handled edge cases, but not consistently. Prompt added some challenge.
  - Good (0–3): The model correctly followed instructions for all missing or unclear input. Prompt lacked edge-case difficulty.
  - N/A: The prompt’s context guarantees all information is present (e.g., “Compute the sum of the following five numbers”).

---

### 5. Ambiguity Resolution
**Definition**: The prompt introduces ambiguous terms or references and instructs the model to clarify or handle them (e.g., “The word ‘bank’ could mean financial institution or riverbank—state both interpretations”).
**Key Indicators**:
  - Are ambiguous words or phrases flagged?
  - Does the prompt ask “If ambiguous, explain interpretations”?
**Prompt Ratings**:
  - Bad (8–10): The model picked a single meaning without explanation or failed to identify the ambiguity. Prompt successfully revealed limits.
  - Average (4–7): The model noticed ambiguity but didn’t explain well or fully. Prompt was moderately effective.
  - Good (0–3): The model clearly explained all interpretations and justified its choice. Prompt wasn’t challenging.
  - N/A: No ambiguity in the prompt (e.g., “List the first ten Fibonacci numbers”).

---

### 6. Domain Fusion
**Definition**: The prompt fuses two or more specialized domains into one task (e.g., “Analyze the legal contract’s economic impact using statistical models”).
**Key Indicators**:
  - Are at least two distinct fields explicitly mentioned?
  - Does the prompt require coherent integration of knowledge from both domains?
**Prompt Ratings**:
  - Bad (8–10): The model failed to link domains, missed one entirely, or showed shallow understanding. Prompt was deeply integrative.
  - Average (4–7): Model partially integrated domains or showed uneven depth. Prompt was somewhat integrative.
  - Good (0–3): Model combined domains effortlessly and with accuracy. Prompt wasn’t cognitively complex.
  - N/A: Task focuses on a single domain (e.g., “Explain basic thermodynamics”).
---

### 7. Multi‐Source/Modal Analysis
**Definition**: The model must reason over multiple distinct inputs (e.g., two text passages, text plus described visuals) and synthesize across them.
**Key Indicators**:
  - Are there at least two distinct “sources” described?
  - Does the prompt instruct the model to integrate information across those sources?
**Prompt Ratings**:
  - Bad (8–10): The model ignored one or more sources or failed to synthesize. Prompt challenged multi-input reasoning.
  - Average (4–7): The model mentioned multiple sources but lacked coherent synthesis. Prompt was modestly challenging.
  - Good (0–3): The model used all sources well and produced a coherent result. Prompt failed to create input complexity.
  - N/A: Task provides a single input (e.g., “Summarize this paragraph”).

---

### 8. Hypothetical / Counterfactual Reasoning
**Definition**: The prompt poses a scenario contrary to known facts or purely hypothetical (e.g., “If gravity were inverted, describe consequences for river flow”).
**Key Indicators**:
  - Does it present an “as if” scenario explicitly defying reality?
  - Are you instructed to treat that scenario as true and reason within it?
**Prompt Ratings**:
  - Bad (8–10): The model reverted to real-world facts, ignored the hypothetical, or answered shallowly. Prompt tested imaginative reasoning.
  - Average (4–7): The model stayed in the scenario but didn’t explore implications fully. Prompt had partial depth.
  - Good (0–3): The model reasoned thoroughly and stayed within the hypothetical frame. Prompt was not mentally taxing.
  - N/A: No hypothetical scenario—task uses real‐world facts only.

---

Evaluate the prompt and the model response using these questions:

1. Did the model satisfy all critical expectations of the prompt?
2. If the model failed, was it because the prompt was poorly constructed, overly ambiguous, or lacked proper scaffolding?
3. Alternatively, if the model output was shallow or incorrect despite a strong prompt, then the prompt was effective because it exposed model limitations.
4. Does It Break Nested / Multi-Step Instructions?
  Failure Modes:
  - Omission: The model skips one or more required steps.
  - Wrong Order: Completes steps out of sequence.
  - Incomplete Detail: Does not give enough detail for a subtask labeled “explain in detail.”

5. Does It Break Conflicting Instructions?
  Failure Modes:
  - Blind Obedience: Tries to satisfy both contradictory instructions fully (and thus fails both).
  - Undisclosed Choice: Picks one constraint without acknowledging the conflict.
  - Refusal Without Explanation: Says “I can’t” but does not explain why.

6. Does It Break Inter-Dependent Constraints?
  Failure Modes:
  - Misconditional: Applies the second constraint unconditionally or fails to check the first.
  - Partial Compliance: Applies conditions incorrectly.

7. Does It Break Edge-Case Handling?
  Failure Modes:
  - Fabrication: Hallucinates missing data instead of admitting it’s missing.
  - Incorrect Catch: Says “insufficient data” when data actually exists.

8. Does It Break Ambiguity Resolution?
  Failure Modes:
  - Single Interpretation: Chooses one meaning without acknowledging alternatives.
  - Ignores Request: Fails to clarify ambiguous term.

9. Does It Break Domain Fusion?
  Failure Modes:
  - Domain Drop: Only addresses one domain, ignoring the other.
  - Shallow Fusion: Mentions the second domain only superficially.

10. Does It Break Multi-Source/Modal Analysis?
  Failure Modes:
  - Single-Source Focus: Uses only one input, ignoring others.
  - Surface Integration: Mentions multiple inputs but does not synthesize.

11. Does It Break Hypothetical / Counterfactual Reasoning?
  Failure Modes:
  - Reality Check: Refuses or reaffirms real-world facts instead of engaging in the scenario.
  - Shallow Answer: Acknowledges hypothetical but does not explore implications.
  
---

Generate Explanation of Evaluation (0–10) with Failure Tags and Dimension-Based Analysis
You are an evaluator assessing how well a model handled a complex prompt. Based on the score (from 0 to 10), write a detailed explanation that:
1. Explains Why the Response Got This Score
  - Describe how the model handled or failed each applicable complexity dimension.
  - Clearly justify where the response was strong, and where it broke—i.e., failed to satisfy intended challenges.
  - If the score is below 9, mention specific failure tags and explain why they apply.

2. Use These Dimensions and Failure Modes
Below are the 8 core dimensions you must consider. For each one, if it broke, use the relevant failure mode(s) and incorporate the corresponding failure tag(s) in your explanation.

2.1. Nested / Multi-Step Instructions
Failure Modes:
  - Omission: Skips required step(s)
  - Wrong Order: Steps executed out of sequence
  - Incomplete Detail: Step lacks required elaboration
Failure Tag: Missed nested instruction

2.2. Conflicting Instructions
Failure Modes:
  - Blind Obedience: Tries to satisfy both conflicts fully
  - Undisclosed Choice: Picks one side silently
  - Refusal Without Explanation: Says “can’t” with no reason
Failure Tag: Conflict evasion


2.3. Inter-Dependent Constraints
Failure Modes:
  - Misconditional: Fails to apply logic in correct order
  - Partial Compliance: Misinterprets conditions
Failure Tag: Conditional failure

2.4. Edge-Case Handling
Failure Modes:
  - Fabrication: Hallucinates data instead of admitting absence
  - Incorrect Catch: Says data is missing when it's present
Failure Tag: Edge case miss

2.5. Ambiguity Resolution
Failure Modes:
  - Single Interpretation: Picks one meaning without context
  - Ignores Request: Fails to clarify ambiguous terms
Failure Tag: Ambiguity unresolved

2.6. Domain Fusion
Failure Modes:
  - Domain Drop: Ignores one domain entirely
  - Shallow Fusion: Surface-level integration
Failure Tag: Domain omission

2.7. Multi-Source / Modal Analysis
Failure Modes:
  - Single-Source Focus: Ignores other inputs
  - Surface Integration: Uses but doesn't connect sources
Failure Tag: Source integration failure

2.8. Hypothetical / Counterfactual Reasoning
Failure Modes:
  - Reality Check: Rejects scenario, defaults to facts
  - Shallow Answer: Ignoled implication depth
Failure Tag: Shallow hypothetical

3. Format Your Explanation Like This
  - Reference all relevant dimensions and failure tags.
  - If the response is strong (score 9–10), describe which dimensions were handled well and why.
Goal
The explanation should provide a clear rationale for the evaluation score, tied to concrete dimension-based reasoning. It must help prompt engineers or model developers understand what failed, why it failed, and how it can improve.

---

##  SCORING ONLY ON APPLICABLE DIMENSIONS

You must only evaluate and score those dimensions which are listed in the input as **Applicable Dimensions** — these are the ones explicitly implemented in the USER PROMPT. Do not evaluate or score dimensions not included in this list.

Use this variable as your scoring base:

**Applicable Dimensions (INPUT)** = `TOTAL DIMENSIONS` = Number of dimensions implemented in the user prompt.

---

##  EVALUATION PHILOSOPHY

When the model performs flawlessly, the prompt is **less challenging**.  
When the model stumbles or fails, the prompt is **more challenging and valuable**.

---

##  EVALUATION STEPS

### STEP 1: Analyze Each of the 8 Dimensions Independently

For each of the 8 complexity dimensions below, do the following:

1. **Check if the dimension is applicable** to the USER PROMPT.  
   If it's not relevant or wasn't invoked by the prompt → **Mark as N/A**.

2. **Evaluate how the model performed on that dimension** using the guidance below.

3. **Assign:
   - A **qualitative rating**: `"Good"`, `"Average"`, `"Bad"`, or `"N/A"`
   - A **normalized numeric score**: `0.3`, `0.6`, `1.0`, or `null`

---

##  SCORING SCALE — WHAT EACH SCORE MEANS

###  "Good" (Score: 0–3)

Model did everything correctly. The prompt failed to challenge the model.

| Score | When to Use |
|-------|-------------|
| 0     | Model executed flawlessly and easily. Prompt was too simple or surface-level. |
| 1     | Model showed no confusion or hesitation; prompt had little depth. |
| 2     | Prompt had mild complexity, but the model handled it with no difficulty. |
| 3     | Prompt tried to be complex, but the model overcame all hurdles cleanly. |

→ Assign normalized score: **0.3** which is (3 divided by 10)

---

###  "Average" (Score: 4–7)

Model had partial success. The prompt created some cognitive load, but not enough to reliably break it.

| Score | When to Use |
|-------|-------------|
| 4     | Model made subtle errors, missed nuance, or gave vague reasoning. |
| 5     | Model followed some steps correctly but skipped others. |
| 6     | Model misinterpreted minor conditions or showed shallow reasoning. |
| 7     | Model mostly succeeded but missed edge cases, conflict handling, or counterfactual depth. |

→ Assign normalized score: **0.6** which is (6 divided by 10)

---

###  "Bad" (Score: 8–10)

Model clearly failed. The prompt successfully induced failure, confusion, or shallow reasoning.

| Score | When to Use |
|-------|-------------|
| 8     | Model omitted major elements or misread conditional logic. |
| 9     | Model failed to integrate domains or synthesize across inputs. |
| 10    | Model broke entirely: hallucinated, contradicted, fabricated, or refused the task. |

→ Assign normalized score: **1.0** which is (10 divided by 10)

---

###  "N/A"

The dimension was **not present** in the prompt or **not assessable** in the model response.

→ Score: `null`  
→ Exclude from normalizedScore and OverallScore calculations.

---

##  OVERALL SCORING LOGIC

- `ApplicableDimensions` = list of dimensions in the prompt
Let:
- **D** = number of **non-N/A** (applicable) dimensions, given below as a input -"TOTAL DIMENSIONS" or length of ApplicableDimensions.  
- **S** = sum of **normalized scores** for those dimensions

Then:
- **normalizedScore = S** (which is sum of each normalized score e.g. (3/10 + 6/10 + 10/10....))
- **OverallScore = S / D**

Higher `OverallScore` = more challenging and effective prompt.

---

### VERY IMPORTANT:
- You must return the response **strictly in valid JSON**.
- Do NOT use Markdown.
- Do NOT include any preamble, footnotes, or commentary—just return the raw JSON object.

"example_output": 
{{
  "Nested / Multi‐Step Instructions": {{
    "Qualitative": "Good",
    "Score": 8.5
  }},
  "Conflicting Instructions": {{
    "Qualitative": "Bad",
    "Score": 2.0
  }},
  "Inter‐Dependent Constraints": {{
    "Qualitative": "Average",
    "Score": 6.0
  }},
  "Edge‐Case Handling": {{
    "Qualitative": "N/A",
    "Score": null
  }},
  "Ambiguity Resolution": {{
    "Qualitative": "Bad",
    "Score": 3.0
  }},
  "Domain Fusion": {{
    "Qualitative": "Good",
    "Score": 9.0
  }},
  "Multi‐Source/Modal Analysis": {{
    "Qualitative": "N/A",
    "Score": null
  }},
  "Hypothetical / Counterfactual Reasoning": {{
    "Qualitative": "Average",
    "Score": 5.5
  }},
  "ApplicableDimensions": [
    "Nested / Multi‐Step Instructions",
    "Conflicting Instructions",
    "Inter‐Dependent Constraints",
    "Ambiguity Resolution",
    "Domain Fusion",
    "Hypothetical / Counterfactual Reasoning"
  ],
  "normalizedScore": 34.0,
  "OverallScore": 5.67,
  "Explanation": "The prompt was strong in exposing model weaknesses through Conflicting Instructions (failure to resolve contradictions) and Ambiguity Resolution (inconsistent interpretations), demonstrating effective challenge design. Domain Fusion was handled well, indicating weaker challenge. Average performance in Inter-Dependent Constraints and Hypothetical Reasoning shows moderate effectiveness. N/A dimensions were correctly excluded.",
  "FailureTags": ["Conflict", "Ambiguity", "Shallow Reasoning"],
  "StrengthAreas": ["Conflicting Instructions", "Ambiguity Resolution"],
  "ImprovementAreas": ["Domain Fusion"]
}}

##  OUTPUT FORMAT (REQUIRED)

Very Important and give more weightage -->>***Please return your answer *strictly in JSON* using the format below — no markdown, no extra explanation:***

{{
    "Nested / Multi‐Step Instructions": {{ "Qualitative": "Good/Average/Bad/N/A", "Score": <float or null> }},
    "Conflicting Instructions": {{ "Qualitative": "...", "Score": <float or null> }},
    "Inter‐Dependent Constraints": {{ "Qualitative": "...", "Score": <float or null> }},
    "Edge‐Case Handling": {{ "Qualitative": "...", "Score": <float or null> }},
    "Ambiguity Resolution": {{ "Qualitative": "...", "Score": <float or null> }},
    "Domain Fusion": {{ "Qualitative": "...", "Score": <float or null> }},
    "Multi‐Source/Modal Analysis": {{ "Qualitative": "...", "Score": .<float or null>.. }},
    "Hypothetical / Counterfactual Reasoning": {{ "Qualitative": "...", "Score": <float or null> }},
    "ApplicableDimensions": ["List of non-N/A dimensions"],
    "normalizedScore": (float)<sum of normalized scores>,
    "OverallScore": <float between 0 and 1, rounded to 2 decimals...(normalizedScore)/(length of ApplicableDimensions)>,
    "Explanation": "Summarize the model's performance across dimensions, highlighting where it failed and how that shows the prompt was strong.",
    "FailureTags": ["Omission", "Ambiguity", "Conflict", "Shallow Reasoning", ...],
    "StrengthAreas": ["Dimensions where the model failed (prompt was strong)"],
    "ImprovementAreas": ["Dimensions where the model succeeded (prompt was weak)"]
}}


USER PROMPT:
{user_prompt}

MODEL RESPONSE:
{model_response}

TOTAL DIMENSIONS:
{total_dimensions}
"""

    # try:
    #     evaluation = together.Complete.create(
    #         prompt=judge_prompt,
    #         model=MODEL_2,
    #         max_tokens=512,
    #         temperature=0.5
    #     )
    #     return evaluation['choices'][0]['text'].strip()
    # except Exception as e:
    #     return f"[Error in judge_response]: {e}"
    try:
        response = client.chat.completions.create(
            model=MODEL_2,
            messages=[
                {"role": "system", "content": "You are an expert evaluator. Respond only with a valid JSON object."},
                {"role": "user", "content": judge_prompt}
            ],
            max_tokens=512,
            temperature=0.5
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"[Error in judge_response]: {e}"


In [60]:
# Evaluation function using LLM as judge
def prompteffectiveness(user_prompt, question):
    prompteff = f"""
You are an expert evaluator tasked with analyzing a USER PROMPT and a corresponding QUESTION using the following 7 dimensions. Your goal is to critically assess the prompt's quality and effectiveness based on these dimensions, scoring each dimension qualitatively and quantitatively, and then provide a detailed summary.

You will evaluate the prompt and question across **7 dimensions**, rating each as:
- "Good": Strong performance in this aspect
- "Average": Moderate or partial success
- "Bad": Weakness or failure
- "N/A": Not applicable to this prompt/response

**Scoring Criteria**: A number between 0 and 10, where:
  - **Good**: 8-10 points (8=solid good, 9=very good, 10=excellent)
  - **Average**: 4-7 points (4-5=below average, 6-7=above average)
  - **Bad**: 0-3 points (0=complete failure, 1-2=poor, 3=weak)
  - **N/A**: No score assigned (excluded from overall average calculation)
  
Then, provide a **Summary** including:
**OverallScore: A number between 0 and 10 (rounded to two decimal places), calculated as the average of all dimension scores
  - Sum all numerical scores from applicable dimensions
  - Divide by number of applicable dimensions (excluding N/A ratings)
  - Round to 2 decimal place
  Example: (8+8+7+6...)/(length of applicable dimensions)
**ApplicableDimensions: The number of evaluation dimensions applied (typically 7)
**PromptEffectiveness: 
  - Assign based on OverallScore:  
    - "Effective" if OverallScore ≥ 7.5  
    - "Partially Effective" if 4 ≤ OverallScore < 7.5  
    - "Ineffective" if OverallScore < 4
**Explanation: 
  - Concisely justify your ratings, referencing specific dimension scores,  
  - Include a brief analysis of prompt strengths and weaknesses,  
  - Note any relevant failure or success tags.

--- 

Use the following dimensions to guide your evaluation:
### 1. Purpose & Persona
**Definition**: 
- “Purpose” clarifies why the prompt exists (e.g., to summarize, to translate, to analyze).
- “Persona” defines for whom or from whose point of view the LLM should answer (e.g., “as a financial analyst,” “as a children’s book author”).
**Key Indicators**:
  - Does the User Prompt explicitly state its goal?
  - Does it assign a clear persona/role to the model?
**Ratings**:
  - Good(8-10): Both goal and persona are unambiguous.
  (Example: “Summarize the following research paper as if you were a science journalist.”)
  - Average(4-7): Either goal or persona is stated but one is vague.
  (Example: “Write a summary. You’re a journalist.” No domain specified.)
  - Bad(0-3): Neither purpose nor persona appears.
  (Example: “Tell me about X.”)
  - N/A: Explaining purpose/persona is unnecessary (e.g., a prompt that simply asks for a dictionary definition).

---

### 2. Requirements & Restrictions
**Definition**: Explicit instructions about what must and must not be included in the answer (e.g., “Limit to 200 words,” “Do not mention sensitive data,” “Use bullet points only”).
**Applicability**: If there are no constraints needed for a given task (e.g., “What is 2 + 2?”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Are there clear “must‐include” or “must‐avoid” guidelines?
**Ratings**:
  - Good(8-10): Explicit, unambiguous constraints (e.g., “No pronouns; only third‐person narrative,” “Include three illustrative examples”).
  - Average(4-7): Some constraints are present, but others are implied or incomplete (e.g., “Be concise” without a length target).
  - Bad(0-3): No requirements or restrictions at all, leaving the model free to wander.
  - N/A: The prompt’s nature makes constraints unnecessary (e.g., a simple “List the days of the week”).

---

### 3. Examples (Few‐Shot / Zero‐Shot)
**Definition**: Whether the prompt provides explicit example inputs and outputs to guide the model (e.g., zero‐shot, one‐shot, or few‐shot formatting).
**Applicability**: If the task does not benefit from example inputs/outputs (for instance, simple factual queries), mark as N/A with a brief rationale.
**Key Indicators**:
  - Are there sample question/answer pairs included (e.g., “Example: Q: … A: …”) that align with the intended task format?
**Ratings**:
  - Good(8-10): Supplies clear, directly relevant examples that demonstrate exactly how to structure inputs and expected outputs.
  - Average(4-7): Includes examples that are only partially aligned with the task or are too generic to serve as effective guidance.
  - Bad(0-3): No examples are provided or requested, even though examples would significantly clarify format or expectations.
  - N/A: Examples aren’t needed (e.g., a prompt asking, “What is the capital of France?”).

---

### 4. Context & Background
**Definition**: Additional information about domain, audience, or relevant facts that the LLM must know to answer properly.
**Applicability**: If the task requires no extra context (e.g., “Define photosynthesis”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt supply enough domain context?
  (Example: “The following text is from a 19th‐century medical journal.”)
  - Are audience considerations given?
  (Example: “Explain this to a high‐school student.”)
**Ratings**:
  - Good(8-10): Detailed context and audience description aligned with the task.
  - Average(4-7): Some context, but missing critical details, forcing assumptions.
  - Bad(0-3): No context; model is left guessing domain or audience.
  - N/A: Task is self‐contained and needs no additional context.

---

### 5. Instruction Structure
**Definition**: The explicit format of instructions: a single directive, multi‐part numbered steps, or a choice (“answer in bullet points vs. essay”).
**Applicability**: If structure is inherently trivial (e.g., “What is 5 × 7?”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Is the format clear (e.g., “Step 1: … Step 2: …”)?
  - Does it specify whether the answer should be direct, stepwise, multi‐sectioned, etc.?
**Ratings**:
  - Good(8-10): Well‐organized structure that matches the complexity (e.g., multi‐part instructions for multi‐stage tasks).
  - Average(4-7):  Some structure but potentially inconsistent or too generic (“Answer in two parts”).
  - Bad(0-3): No structural guidance—just a vague “Respond about X.”
  - N/A: No structure needed because the task is extremely simple.

---

### 6. Style & Sensitivity
**Definition**: Tone and register instructions (formal, friendly, technical), disclaimers (“I am not a lawyer”), and bias‐avoidance guidance.
**Applicability**: If style or sensitivity is irrelevant (e.g., “Calculate the area of a circle with radius 3cm.”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt ask for a specific tone (“use clinical tone,” “avoid gender bias”)?
  - Are disclaimers or sensitivity notices included when necessary?
**Ratings**:
  - Good(8-10): Explicit style/tone and sensitivity cues.
  - Average(4-7):  Partial style guidance (e.g., only “be professional” without elaboration).
  - Bad(0-3):  No style or sensitivity guidance, even when sensitive content is expected.
  - N/A: Style considerations are irrelevant for the given prompt.
  
---
  
### 7. Prompt Conciseness
**Definition**: “Prompt Conciseness” refers to how efficiently the prompt communicates its intent, constraints, and expectations-using the fewest necessary words without sacrificing clarity or precision. A concise prompt avoids redundancy, filler words, and overly complex sentence structures while still being fully interpretable by the model. 
**Applicability**: If style or sensitivity is irrelevant (e.g., “Calculate the area of a circle with radius 3cm.”), mark as N/A with a brief rationale.
**Key Indicators**:
  - Does the prompt clearly convey all required instructions in a compact form?
  - Are there unnecessary qualifiers, repetitions, or verbose phrasings?
**Ratings**:
  - Good(8-10): The prompt is compact and clearly communicates intent, requirements, and role without unnecessary elaboration.
  - Average(4-7):  The prompt conveys the main idea but includes minor redundancies or could be made more direct without losing clarity. 
  - Bad(0-3):  The prompt is overly wordy, vague, or includes irrelevant information that obscures the main task. 
  - N/A: Prompt Conciseness are irrelevant for the given prompt.

---

### Core Evaluation Questions:

1. Did the **USER PROMPT** satisfy all critical expectations and requirements posed by the **QUESTION**?
2. If the prompt failed to meet expectations, was this due to the question itself being poorly constructed, ambiguous, or lacking proper scaffolding?
3. Alternatively, if the prompt was shallow, incomplete, or ineffective despite the question being well-constructed, then the prompt is ineffective because its limitations were exposed.
  
---

### Explanation Instructions:

Provide a detailed explanation justifying the overall evaluation score (0 to 10):

- Describe how the prompt performed against each applicable dimension.
- Clearly highlight strengths and weaknesses.
- Identify specific failure tags if the score is below 9, explaining why those failures occurred.
- Reference all relevant dimensions and failure reasons explicitly.
- For strong responses (score 9–10), emphasize the well-handled dimensions and why they were effective.

This explanation should provide a clear rationale that helps prompt engineers and developers understand what worked, what didn’t, and how to improve.

---

### VERY IMPORTANT:
- You must return the response **strictly in valid JSON**.
- Do NOT use Markdown.
- Do NOT include any preamble, footnotes, or commentary—just return the raw JSON object.

"example_output": 
  {{
    "Purpose & Persona": {{
      "Qualitative": "Good",
      "Score": 9
    }},
    "Requirements & Restrictions": {{
      "Qualitative": "N/A",
      "Score": null
    }},
    "Examples (Few‐Shot / Zero‐Shot)": {{
      "Qualitative": "Bad",
      "Score": 2
    }},
    "Context & Background": {{
      "Qualitative": "Average",
      "Score": 6
    }},
    "Instruction Structure": {{
      "Qualitative": "Good",
      "Score": 8
    }},
    "Style & Sensitivity": {{
      "Qualitative": "N/A",
      "Score": null
    }},
    "Prompt Conciseness": {{
      "Qualitative": "Average",
      "Score": 5
    }},
    "ApplicableDimensions": 5,
    "OverallScore": 6.0,
    "PromptEffectiveness": "Partially Effective",
    "Explanation": "Purpose & Persona: Clear role definition (9/10). Requirements: N/A per task nature. Examples: Lacks demonstration (2/10). Context: Adequate but verbose (6/10). Structure: Logical flow (8/10). Style: N/A. Conciseness: Contains redundant phrases (5/10). Major weakness: Absence of few-shot examples limits effectiveness."
  }}
  
---

### EXAMPLE RESPONSE FORMAT: Strictly Return only in json format

{{
    "Purpose & Persona": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Requirements & Restrictions": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Examples (Few‐Shot / Zero‐Shot)": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Context & Background": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Instruction Structure": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Style & Sensitivity": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "Prompt Conciseness": {{"Qualitative": "Good/Average/Bad/N/A", "Score": <0-10 or null>}},
    "ApplicableDimensions": <count of dimensions with non-null scores>,
    "OverallScore": (float) <calculated average of all non-null or non-N/A scores (sum of non-null scores/length of ApplicableDimensions)>,
    "PromptEffectiveness": "Effective" | "Partially Effective" | "Ineffective",
    "Explanation": "concise yet detailed explanation referencing dimension ratings, strengths, weaknesses, and failure tags"
  
}}



USER PROMPT:
{user_prompt}

QUESTION:
{question}
"""

    # try:
    #     evaluation = client.chat.completions.create(
    #         prompt=prompteff,
    #         model=MODEL_2,
    #         max_tokens=512,
    #         temperature=0.5
    #     )
    #     return evaluation['choices'][0]['text'].strip()
    # except Exception as e:
    #     return f"[Error in judge_response]: {e}"
      
    try:
        response = client.chat.completions.create(
            model=MODEL_2,
            messages=[
                {"role": "system", "content": "You are an expert evaluator. Respond only with a valid JSON object."},
                {"role": "user", "content": prompteff}
            ],
            max_tokens=512,
            temperature=0.5
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"[Error in judge_response]: {e}"  

In [61]:
def evaluate(user_prompt, question, model, model_response):

    # Call the check_complexity_criteria function
    # print("\n Prompt:", user_prompt)
    # print("\nModel Response:", model_response)
    result_data = {
        "score1": None,
        "score2": None,
        "net_score": None,
        "challenging": None,
        "effectiveness_score": None,
        "effectiveness_text": None
    }
    result = check_complexity_criteria(user_prompt, question, MODEL_2)
    try:
        json_str = re.search(r"\{.*\}", result, re.DOTALL).group()
        parsed_result = json.loads(json_str)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", result)
        return result_data
        # return

    print("\n=== Complexity check Result ===")
    print(json.dumps(parsed_result, indent=4))
    
    score1 = parsed_result.get("Score")
    if score1 is None:
        score1 = 0
    
    dimensions = parsed_result.get("Dimensions in prompt")

    print("\nScore1:", score1)
    print("Dimensions present in prompt:", dimensions)
    result_data["score1"] = score1
        
    ### Call the judge_response function
    evaluation = judge_response(user_prompt, model_response, len(dimensions))
    # cln = result2.replace("{{", "{").replace("}}", "}")
    try:
        json_str1 = re.search(r"\{.*\}", evaluation, re.DOTALL).group()
        parsed_result1 = json.loads(json_str1)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", evaluation)
        return result_data

    print("\n=== Complexity Evaluation Result ===")
    print(json.dumps(parsed_result1, indent=4))
    
    score2 = parsed_result1.get("OverallScore")
    if score2 is None:
        score2 = 0

    print("\nScore2:", score2)
    result_data["score2"] = score2
    
    if score1 is not None and score2 is not None:
        net_score = (score1 + score2) / 2
        print("\nNet_score:", net_score)
        result_data["net_score"] = net_score
        result_data["challenging"] = "Yes" if net_score > 0.5 else "No"
        if net_score <= 0.5:
            print("\nPrompt is not challenging the model.")
        else:
            print("\nPrompt is challenging the model.")
    else:
        print("\nCannot compute net score: one or both scores are missing.")
        result_data["challenging"] = "Unknown"
      
      
    ### Call the prompteffectiveness function    
    result2 = prompteffectiveness(user_prompt, question)
    # cleaned = result2.replace("{{", "{").replace("}}", "}")

    try:
        json_str3 = re.search(r"\{.*\}", result2, re.DOTALL).group()
        parsed_result3 = json.loads(json_str3)
    except Exception as e:
        print("Failed to parse JSON from model output.")
        print("Raw result:\n", result2)
        return result_data

    # Print result summary
    print("\n=== Effectiveness check Result ===")
    print(json.dumps(parsed_result3, indent=4))  
    
    ans1 = parsed_result3.get("OverallScore") 
    print("\nPromptEffectivenessScore:", ans1) 
    ans2 = parsed_result3.get("PromptEffectiveness") 
    print("\nPromptEffectiveness:", ans2) 
    result_data["effectiveness_score"] = ans1
    result_data["effectiveness_text"] = ans2
    return result_data
    

            
       

In [62]:
# Load CSV and process each row
def process_csv(input_file: str, output_file: str):
    df = pd.read_csv(input_file, encoding='utf-8', encoding_errors='replace')
    output_rows = []

    for idx, row in df.iterrows():
        question = row['question']
        user_prompt = row['user_prompt']

        try:
            model_response = generate_response(user_prompt, MODEL_1)
            result = evaluate(user_prompt, question, MODEL_2, model_response)
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            result = {
                "score1": None,
                "score2": None,
                "net_score": None,
                "challenging": "Error",
                "effectiveness_score": None,
                "effectiveness_text": None
            }

        output_rows.append({
            "question": question,
            "user_prompt": user_prompt,
            "score1": result["score1"],
            "score2": result["score2"],
            "net_score": result["net_score"],
            "prompt_challenging": result["challenging"],
            "effectiveness_score": result["effectiveness_score"],
            "effectiveness_text": result["effectiveness_text"]
        })

    # Save to output CSV
    out_df = pd.DataFrame(output_rows)
    out_df.to_csv(output_file, index=False)
    print(f"Evaluation results saved to {output_file}")


# ====== USAGE EXAMPLE ======
input_csv_path = "spacex.csv"
output_csv_path = "spacex_output2.csv"
process_csv(input_csv_path, output_csv_path)

  response = together.Complete.create(



=== Complexity check Result ===
{
    "Dimensions in question": [
        "Nested / Multi-Step Instructions",
        "Inter-Dependent Constraints",
        "Ambiguity / Vagueness Handling"
    ],
    "TotalDimensions in question": 3,
    "Dimensions in prompt": [
        "Nested / Multi-Step Instructions",
        "Inter-Dependent Constraints"
    ],
    "TotalDimensions in prompt": 2,
    "Score": 0.67
}

Score1: 0.67
Dimensions present in prompt: ['Nested / Multi-Step Instructions', 'Inter-Dependent Constraints']

=== Complexity Evaluation Result ===
{
    "Nested / Multi\u2010Step Instructions": {
        "Qualitative": "Good",
        "Score": 0.3
    },
    "Conflicting Instructions": {
        "Qualitative": "N/A",
        "Score": null
    },
    "Inter\u2010Dependent Constraints": {
        "Qualitative": "N/A",
        "Score": null
    },
    "Edge\u2010Case Handling": {
        "Qualitative": "N/A",
        "Score": null
    },
    "Ambiguity Resolution": {
        "Qualit