In [None]:
!pip install google-cloud-aiplatform



# Prompt evaluation for Gen AI summaries

# 1. Initial Evaluation: The Hallucination Score (List-Wise):
This code block establishes a crucial baseline: testing the factual accuracy or fidelity of the generated summaries against their source text.



## The Problem it Solved:
The core goal was to detect hallucinations—instances where the Large Language Model (LLM) generated information that was not present in, or was contradictory to, the provided source text. Hallucinations are the most severe failure mode for a summarization task, so testing for them first is essential.

We also wished to rank and differentiate high-quality summaries. We needed a metric that was hard to max out, providing enough signal to distinguish between a perfect, wordy summary and a perfect, concise one.

## How it Works:

**Comparative Judging**: For each source text, all three summaries were generated and then sent to the Judge model simultaneously. This forced the Judge to look at the summaries side-by-side and rank them against each other.

**Generator Model**: Used the gemini-2.0-flash model (the Generator) to produce summaries based on three different prompt styles.

**List-Wise Evaluation**: By sending the source text and all generated summary variations to the Judge model in a single prompt, we force the model to rank them against each other and notice subtle differences.

**Judge Model & Metric**: Used a strong model, gemini-2.5-pro (the Judge), with a scoring scale where 1.0 is perfect (no hallucination) and 10.0 is complete fabrication; decimal scores are allowed for robustness.

In [None]:
# !pip install google-cloud-aiplatform

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
import pandas as pd
import json
import time

# ==========================================
# 1. SETUP: Initialize Vertex AI
# ==========================================
# GCP project and region passed to vertexai.init() below.
PROJECT_ID = "pulseai-team3-ba882-fall25"
LOCATION = "us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

# ==========================================
# 2. CONFIGURATION: Model Definitions
# ==========================================
# Ensure these IDs are available in your specific GCP project
GENERATOR_MODEL_ID = "gemini-2.0-flash"  # writes the candidate summaries
JUDGE_MODEL_ID = "gemini-2.5-pro"  # scores the summaries; substitute another valid ID if this one is unavailable

# The emoji was previously stored as mis-decoded UTF-8 bytes; restored here.
print(f"🚀 Initialized Vertex AI project {PROJECT_ID}")

# ==========================================
# 3. CORE FUNCTIONS
# ==========================================

def generate_summary(prompt_template, source_text):
    """Fill the prompt template with the source text and return the generator's reply.

    Args:
        prompt_template (str): Template containing a ``{text}`` placeholder.
        source_text (str): Text substituted for ``{text}``.

    Returns:
        str: The model's response text, or ``"[Error]: <message>"`` when
        formatting the prompt or calling the model raises an exception.
    """
    generator = GenerativeModel(GENERATOR_MODEL_ID)
    generation_settings = GenerationConfig(temperature=0.4, max_output_tokens=1024)

    try:
        filled_prompt = prompt_template.format(text=source_text)
        reply = generator.generate_content(
            filled_prompt,
            generation_config=generation_settings,
        )
        summary = reply.text
    except Exception as exc:
        # Failures are reported in-band as a string rather than raised.
        return f"[Error]: {exc!s}"
    return summary

def evaluate_comparative_hallucination(source_text, summaries_dict):
    """
    Ask the judge model to score all candidate summaries of one source in a single call.

    The source and every candidate are placed in one prompt so the judge sees
    the summaries side by side.

    Args:
        source_text (str): The original source.
        summaries_dict (dict): { "prompt_id": "summary text", ... }

    Returns:
        dict: The "scores" value of the judge's JSON reply ({} when that key
        is absent). Also {} when the call or the JSON parsing raises; the
        error is printed in that case.
    """
    judge = GenerativeModel(JUDGE_MODEL_ID)

    # Render each candidate under a labelled header carrying its ID.
    candidates_str = "".join(
        f"\n--- CANDIDATE ID: {pid} ---\n{summary}\n"
        for pid, summary in summaries_dict.items()
    )

    judge_prompt = f"""
    You are a strict Factual Consistency Judge.
    You will be provided with a [SOURCE] text and multiple [CANDIDATE SUMMARIES].

    Task:
    1. Compare the candidates against the source AND against each other.
    2. Look for "hallucinations" (facts not in source) and "omissions" (missing critical context that alters meaning).
    3. Assign a "hallucination_score" (1.0 to 10.0).
       - 10.0: Severe fabrication.
       - 1.0: Perfect consistency.
    4. DIFFERENTIATION IS KEY: If two summaries are good, give the one that captures more nuance a better (lower) score.
       If they are identical in quality, give them the same score.

    Output STRICT JSON format:
    {{
        "scores": {{
            "prompt_id_1": {{ "score": float, "reason": "string" }},
            "prompt_id_2": {{ "score": float, "reason": "string" }}
        }}
    }}

    [SOURCE]
    {source_text}

    [CANDIDATE SUMMARIES]
    {candidates_str}
    """

    # Temperature 0.0 and a JSON response MIME type are requested for the judge call.
    judge_settings = GenerationConfig(
        temperature=0.0,
        response_mime_type="application/json",
    )

    try:
        reply = judge.generate_content(judge_prompt, generation_config=judge_settings)
        parsed = json.loads(reply.text)
        return parsed.get('scores', {})
    except Exception as exc:
        print(f"Judge Error: {exc}")
        return {}

# ==========================================
# 4. EXECUTION LOOP (RESTRUCTURED)
# ==========================================
# Source passages to summarize, keyed by the label used as the column name
# in the final scoreboard.
test_sources = {
    "Source 1 (Physics)": "Newton's first law states that an object at rest stays at rest and an object in motion stays in motion with the same speed and in the same direction unless acted upon by an unbalanced force. Essentially, objects resist changes in their state of motion; it takes a force to start something moving or to stop it, change its speed, or alter its direction. Example: When a car stops suddenly, your body wants to keep moving forward (inertia).",
    "Source 2 (History)": "The Apollo 11 mission landed the first humans on the Moon. Commander Neil Armstrong and lunar module pilot Buzz Aldrin formed the American crew that landed the Apollo Lunar Module Eagle on July 20, 1969. The Apollo Guidance Computer (AGC) and Display and Keyboard (DSKY) instrument panel were created specifically for the programme. The DSKY panel was akin to a calculator keyboard and display, working with codes the astronauts learned or checked from a flight manual.",
    "Source 3 (Finance)": "EBITDA stands for Earnings Before Interest, Taxes, Depreciation, and Amortization. It is a measure of a company's overall financial performance and is used as an alternative to net income in some circumstances. It's criticized for ignoring asset costs and potential accounting manipulation, as it's not a GAAP metric. It doesn't account for the cash needed to replace or maintain assets (capex), which are significant in asset-intensive businesses."
}

# Prompt templates under test, keyed by prompt ID. Each must contain a {text}
# placeholder, which generate_summary() fills with the source passage.
# The three variants differ only in the persona prefix.
prompt_candidates = {
    "101": "You are an expert technical editor. Summarize this: {text}",
    "102": "You are an editor. Summarize this: {text}",
    "103": "Summarize this: {text}"
}

# One row per prompt, filled in source by source, so the rows can be turned
# into a table later.
# Structure: { "101": {"Prompt index": "101", "Source 1 (Physics)": 1.0, ...}, "102": ... }
formatted_results = {pid: {"Prompt index": pid} for pid in prompt_candidates}

print("\n⚡ Starting Comparative Evaluation Pipeline...")

for source_name, source_text in test_sources.items():
    print(f"\nProcessing {source_name}...")

    # 1. Generate ALL summaries for this source
    current_batch = {}
    for pid, template in prompt_candidates.items():
        print(f"  - Generating {pid}...")
        current_batch[pid] = generate_summary(template, source_text)
        time.sleep(0.2)  # Avoid rate limits

    # generate_summary() reports failures in-band as "[Error]: ..." strings.
    # Keep those away from the judge so it never scores an error message as
    # if it were a summary.
    failed_pids = [
        pid for pid, summary in current_batch.items()
        if str(summary).startswith("[Error]:")
    ]
    for pid in failed_pids:
        print(f"  - Skipping {pid}: generation failed ({current_batch.pop(pid)})")

    # 2. Compare them ALL in one shot
    print("  - ⚖️ Judging comparison...")
    if current_batch:
        scores_dict = evaluate_comparative_hallucination(source_text, current_batch)
    else:
        scores_dict = {}

    # 3. Store results
    for pid in prompt_candidates:
        # A missing or malformed score is recorded as None, not 0: on this
        # scale 1.0 is the best possible value, so a 0 would rank a failure
        # above a perfect summary and pull the average the wrong way.
        entry = scores_dict.get(pid) if isinstance(scores_dict, dict) else None
        raw_score = entry.get("score") if isinstance(entry, dict) else None
        try:
            score = None if isinstance(raw_score, bool) else float(raw_score)
        except (TypeError, ValueError):
            score = None
        formatted_results[pid][source_name] = score

# ==========================================
# 5. OUTPUT
# ==========================================
# Convert dictionary of rows to list
results_data = list(formatted_results.values())

# Calculate Averages over the sources that actually received a score;
# the average stays empty (None) when a prompt has no valid score at all.
for row in results_data:
    scores = [row[name] for name in test_sources if row[name] is not None]
    row["Average"] = sum(scores) / len(scores) if scores else None

cols = ["Prompt index"] + list(test_sources) + ["Average"]
df = pd.DataFrame(results_data, columns=cols)

print("\n✅ Final Scoreboard:")
try:
    from IPython.display import display
    # Reversed colormap: low (good) hallucination scores are green, high are red.
    display(df.style.background_gradient(cmap='RdYlGn_r', vmin=1, vmax=10))
except ImportError:
    # Plain-text fallback when the rich display dependencies are unavailable.
    print(df)

üöÄ Initialized Vertex AI project pulseai-team3-ba882-fall25

‚ö° Starting Comparative Evaluation Pipeline...

Processing Source 1 (Physics)...
  - Generating 101...




  - Generating 102...
  - Generating 103...
  - ‚öñÔ∏è Judging comparison...

Processing Source 2 (History)...
  - Generating 101...
  - Generating 102...
  - Generating 103...
  - ‚öñÔ∏è Judging comparison...

Processing Source 3 (Finance)...
  - Generating 101...
  - Generating 102...
  - Generating 103...
  - ‚öñÔ∏è Judging comparison...

‚úÖ Final Scoreboard:


Unnamed: 0,Prompt index,Source 1 (Physics),Source 2 (History),Source 3 (Finance),Average
0,101,1.0,1.0,1.5,1.166667
1,102,1.5,1.5,2.0,1.666667
2,103,4.0,1.0,1.0,2.0


# 2. Completeness and Conciseness score:

This works in a similar way to the above code with these key differences:

**Identify Key Facts**: The Judge first lists the most critical information points in the source.

**Completeness Check**: It checks how many of these key facts each summary captured.

**Conciseness Tie-Breaker**: It assesses the efficiency. If two summaries captured the same number of facts, the Judge gave the higher score to the shorter, less-verbose summary.

**Quality Score**: The final score is now on a scale where 10.0 is the best result (high completeness and conciseness), allowing for a clearer differentiation on the final scoreboard.

In [None]:
# !pip install google-cloud-aiplatform

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
import pandas as pd
import json
import time

# ==========================================
# 1. SETUP: Initialize Vertex AI
# ==========================================
# GCP project and region passed to vertexai.init() below.
PROJECT_ID = "pulseai-team3-ba882-fall25"
LOCATION = "us-central1"

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Model IDs: the generator writes the summaries, the judge scores them.
GENERATOR_MODEL_ID = "gemini-2.0-flash"
JUDGE_MODEL_ID = "gemini-2.5-pro" # Judge model; the configured ID is Gemini 2.5 Pro

# ==========================================
# 2. CORE FUNCTIONS
# ==========================================

def generate_summary(prompt_template, source_text):
    """Produce a summary by sending the filled-in prompt to the generator model.

    Args:
        prompt_template (str): Template containing a ``{text}`` placeholder.
        source_text (str): Text substituted for ``{text}``.

    Returns:
        str: The model's response text. If building the prompt or calling the
        model raises, the string ``"[Error]: <message>"`` is returned instead.
    """
    summarizer = GenerativeModel(GENERATOR_MODEL_ID)
    settings = GenerationConfig(temperature=0.4, max_output_tokens=1024)

    try:
        prompt = prompt_template.format(text=source_text)
        result = summarizer.generate_content(prompt, generation_config=settings)
        text = result.text
    except Exception as err:
        # Errors are returned in-band as a string rather than raised.
        return f"[Error]: {err!s}"
    return text

def evaluate_completeness_and_conciseness(source_text, summaries_dict):
    """
    Ask the judge model to rate all candidate summaries of one source in a single call.

    The prompt instructs the judge to score on completeness (key facts
    retained) and conciseness (no unnecessary verbosity).

    Args:
        source_text (str): The original source.
        summaries_dict (dict): { "prompt_id": "summary text", ... }

    Returns:
        dict: The "scores" value of the judge's JSON reply ({} when that key
        is absent). Also {} when the call or the JSON parsing raises; the
        error is printed in that case.
    """
    judge = GenerativeModel(JUDGE_MODEL_ID)

    # Render each candidate under a labelled header carrying its ID.
    candidates_str = "".join(
        f"\n--- CANDIDATE ID: {pid} ---\n{summary}\n"
        for pid, summary in summaries_dict.items()
    )

    judge_prompt = f"""
    You are an expert Content Quality Evaluator.

    Goal: Rate the following summaries based on COMPLETENESS and CONCISENESS.

    [SOURCE TEXT]
    {source_text}

    [CANDIDATE SUMMARIES]
    {candidates_str}

    Evaluation Steps (Perform these internally):
    1. Analyze the [SOURCE TEXT] and identify the specific "Key Information Points" (facts, dates, names, core concepts).
    2. For each Candidate, check which of these points are present.
    3. Assess the word count/verbosity.

    Scoring Rubric (1.0 to 10.0):
    - 10.0: Perfect. Captures ALL key facts AND is very concise.
    - 8.0 - 9.0: Captures all key facts but slightly wordy.
    - 5.0 - 7.0: Misses 1 minor fact OR is very repetitive/verbose.
    - 3.0 - 4.0: Misses significant facts.
    - 1.0 - 2.0: Misses the main point entirely.

    Tie-Breaker Rule: If two summaries capture the exact same facts, the shorter (more concise) one MUST receive the higher score.

    Output STRICT JSON format:
    {{
        "scores": {{
            "prompt_id_1": {{ "score": float, "reason": "string" }},
            "prompt_id_2": {{ "score": float, "reason": "string" }}
        }}
    }}
    """

    # Temperature 0.0 and a JSON response MIME type are requested for the judge call.
    judge_settings = GenerationConfig(
        temperature=0.0,
        response_mime_type="application/json",
    )

    try:
        reply = judge.generate_content(judge_prompt, generation_config=judge_settings)
        parsed = json.loads(reply.text)
        return parsed.get('scores', {})
    except Exception as exc:
        print(f"Judge Error: {exc}")
        return {}

# ==========================================
# 3. EXECUTION LOOP
# ==========================================
# Source passages to summarize, keyed by the label used as the column name
# in the final scoreboard.
test_sources = {
    "Source 1 (Physics)": "Newton's first law states that an object at rest stays at rest and an object in motion stays in motion with the same speed and in the same direction unless acted upon by an unbalanced force. Essentially, objects resist changes in their state of motion; it takes a force to start something moving or to stop it, change its speed, or alter its direction. Example: When a car stops suddenly, your body wants to keep moving forward (inertia).",
    "Source 2 (History)": "The Apollo 11 mission landed the first humans on the Moon. Commander Neil Armstrong and lunar module pilot Buzz Aldrin formed the American crew that landed the Apollo Lunar Module Eagle on July 20, 1969. The Apollo Guidance Computer (AGC) and Display and Keyboard (DSKY) instrument panel were created specifically for the programme. The DSKY panel was akin to a calculator keyboard and display, working with codes the astronauts learned or checked from a flight manual.",
    "Source 3 (Finance)": "EBITDA stands for Earnings Before Interest, Taxes, Depreciation, and Amortization. It is a measure of a company's overall financial performance and is used as an alternative to net income in some circumstances. It's criticized for ignoring asset costs and potential accounting manipulation, as it's not a GAAP metric. It doesn't account for the cash needed to replace or maintain assets (capex), which are significant in asset-intensive businesses."
}

# Active prompt templates, keyed by prompt ID. Each must contain a {text}
# placeholder, which generate_summary() fills with the source passage.
# These three differ only in the persona prefix; none asks for a short summary.
prompt_candidates = {
    "101": "You are an expert technical editor. Summarize this: {text}",
    "102": "You are an editor. Summarize this: {text}",
    "103": "Summarize this: {text}"
}
# Alternative prompt set (disabled): varies the requested length, including a
# one-sentence "Concise" prompt, to test whether the judge actually rewards
# conciseness. Swap it in for the dict above to run that experiment.
# prompt_candidates = {
#     "101 (Verbose)": "You are a detailed technical editor. Summarize this comprehensively: {text}",
#     "102 (Standard)": "Summarize this: {text}",
#     "103 (Concise)": "Summarize this in one short sentence: {text}"
# }

# One row per prompt, filled in source by source.
# Structure: { "101": {"Prompt index": "101", "Source 1 (Physics)": 9.0, ...}, "102": ... }
formatted_results = {pid: {"Prompt index": pid} for pid in prompt_candidates}

print("\n⚡ Starting Quality Evaluation Pipeline...")

for source_name, source_text in test_sources.items():
    print(f"\nProcessing {source_name}...")

    # 1. Generate ALL summaries
    current_batch = {}
    for pid, template in prompt_candidates.items():
        current_batch[pid] = generate_summary(template, source_text)
        time.sleep(0.2)  # Avoid rate limits

    # generate_summary() reports failures in-band as "[Error]: ..." strings.
    # Keep those away from the judge so it never rates an error message as
    # if it were a summary.
    failed_pids = [
        pid for pid, summary in current_batch.items()
        if str(summary).startswith("[Error]:")
    ]
    for pid in failed_pids:
        print(f"  - Skipping {pid}: generation failed ({current_batch.pop(pid)})")

    # 2. Compare them ALL
    print("  - ⚖️ Judging Completeness & Conciseness...")
    if current_batch:
        scores_dict = evaluate_completeness_and_conciseness(source_text, current_batch)
    else:
        scores_dict = {}

    # 3. Store results
    for pid in prompt_candidates:
        # A missing or malformed score is recorded as None, not 0: 0 lies
        # outside the 1.0-10.0 rubric and would be averaged in as if the
        # judge had really rated the summary that low.
        entry = scores_dict.get(pid) if isinstance(scores_dict, dict) else None
        raw_score = entry.get("score") if isinstance(entry, dict) else None
        try:
            score = None if isinstance(raw_score, bool) else float(raw_score)
        except (TypeError, ValueError):
            score = None
        formatted_results[pid][source_name] = score

# ==========================================
# 4. OUTPUT
# ==========================================
results_data = list(formatted_results.values())

# Calculate Averages over the sources that actually received a score;
# the average stays empty (None) when a prompt has no valid score at all.
for row in results_data:
    scores = [row[name] for name in test_sources if row[name] is not None]
    row["Average"] = sum(scores) / len(scores) if scores else None

cols = ["Prompt index"] + list(test_sources) + ["Average"]
df = pd.DataFrame(results_data, columns=cols)

print("\n✅ Final Scoreboard (Higher Score = Better Completeness/Conciseness):")
try:
    from IPython.display import display
    # Using 'RdYlGn' (Red to Green) because 10 is now GOOD
    display(df.style.background_gradient(cmap='RdYlGn', vmin=1, vmax=10))
except ImportError:
    # Plain-text fallback when the rich display dependencies are unavailable.
    print(df)


‚ö° Starting Quality Evaluation Pipeline...

Processing Source 1 (Physics)...




  - ‚öñÔ∏è Judging Completeness & Conciseness...

Processing Source 2 (History)...
  - ‚öñÔ∏è Judging Completeness & Conciseness...

Processing Source 3 (Finance)...
  - ‚öñÔ∏è Judging Completeness & Conciseness...

‚úÖ Final Scoreboard (Higher Score = Better Completeness/Conciseness):


Unnamed: 0,Prompt index,Source 1 (Physics),Source 2 (History),Source 3 (Finance),Average
0,101,9.0,8.5,9.0,8.833333
1,102,7.0,7.0,8.5,7.5
2,103,8.0,9.0,7.0,8.0
