In [1]:
"""
Automatic Story Generation (ASG) - NLP Assignment 4, Section 3.1
Improved with Few-Shot Prompting
"""

import json
import time
import random
import requests
import os

# API endpoint for Ollama
OLLAMA_URL = "http://localhost:11434/api/generate"

# List of genres for story generation
GENRES = [
    "Science Fiction", "Horror", "Comedy", "Romance", "Adventure",
    "Crime", "Fantasy", "Short Story", "Historical", "Mystery"
]

# Sample topics and characters
TOPICS = ["space", "desert island", "hidden treasure", "time machine", "zombie apocalypse"]
CHARACTERS = ["young scientist", "brave explorer", "intelligent robot", "wise wizard"]

# Decoding hyperparameters for generation diversity
DECODING_PARAMS = {
    "temperature": [0.7, 0.9, 1.2],   # Controls randomness in output
    "top_p": [0.9, 0.95]              # Nucleus sampling threshold
}

# Few-shot examples for each genre
FEW_SHOT_EXAMPLES = {
    "Science Fiction": {
        "topic": "space",
        "story": "In the year 2145, a spaceship named Noah traveled to distant stars. The crew found a strange device that could communicate with unknown civilizations."
    },
    "Horror": {
        "topic": "desert island",
        "story": "The sun had long since dipped into the horizon, casting a warm orange glow over the sandy beach of the deserted island... until the screams began."
    },
    "Comedy": {
        "topic": "hidden treasure",
        "story": "Once upon a time in a distant village, there was a rumor that treasure hunters found a hidden fortune... but it turned out to be a giant chocolate bar."
    },
    "Romance": {
        "topic": "time machine",
        "story": "Amidst the hum of the time machine, Dr. Emma Taylor and Captain Liam Chen found themselves in 1920s Paris. Their scientific mission quickly turned into a love story."
    },
    "Adventure": {
        "topic": "space",
        "story": "Once upon a time, there was a young astronaut named Alex who dreamed of exploring the vastness of space..."
    }
}

# Prompt templates for story generation
PROMPT_TEMPLATES = [
    "Write a short {genre} story about {topic} featuring a {character}.",
    "Create a {genre} narrative where a {character} encounters {topic}.",
    "Generate a {genre} tale involving a {character} and {topic}."
]

def generate_story(model_name, genre, topic, character, prompt_template):
    """
    Generates a story using few-shot prompting to guide the LLM.

    Parameters:
        model_name (str): Name of the LLM to use
        genre (str): Story genre (e.g., Horror, Comedy)
        topic (str): Story topic (e.g., space, time machine)
        character (str): Main character of the story
        prompt_template (str): Template to generate the prompt

    Returns:
        str: Generated story or None if failed
    """
    # Get a few-shot example for the selected genre
    few_shot = FEW_SHOT_EXAMPLES.get(genre, FEW_SHOT_EXAMPLES["Science Fiction"])

    # Create the prompt using the selected template
    prompt = prompt_template.format(genre=genre, topic=topic, character=character)

    # Enhance the prompt with few-shot example
    full_prompt = f"""
You are to generate a short {genre} story about {topic}, featuring {character}.

Few-Shot Example:
Genre: {few_shot['topic']}
Story: {few_shot['story']}

Now, write your own story using the following prompt:
{prompt}

Story:
"""

    payload = {
        "model": model_name,
        "prompt": full_prompt,
        "stream": False,
        "options": {
            "temperature": random.choice(DECODING_PARAMS["temperature"]),
            "top_p": random.choice(DECODING_PARAMS["top_p"])
        }
    }

    try:
        response = requests.post(OLLAMA_URL, json=payload)
        if response.status_code == 200:
            return response.json()['response'].strip()
        else:
            print(f"[Error] Failed to get response from {model_name}")
            return None
    except Exception as e:
        print(f"[Exception] {e}")
        return None


def run_asg(models=["phi3:3.8b", "qwen2.5vl:3b"]):
    """
    Main function to generate stories using few-shot prompting.

    Parameters:
        models (list): List of LLMs to use for story generation

    Returns:
        list: List of dictionaries containing metadata and stories
    """
    results = []

    for model in models:
        print(f"\n{'='*60}\n[+] Generating stories using model: {model}\n{'='*60}")
        for i in range(10):  # Generate 10 stories per model
            genre = random.choice(GENRES)
            topic = random.choice(TOPICS)
            character = random.choice(CHARACTERS)
            prompt_template = random.choice(PROMPT_TEMPLATES)

            prompt = prompt_template.format(genre=genre, topic=topic, character=character)

            print(f"\n[{i+1}] Prompt: {prompt}")

            story = generate_story(model, genre, topic, character, prompt_template)
            if story:
                result = {
                    "model": model,
                    "story_id": i + 1,
                    "genre": genre,
                    "topic": topic,
                    "character": character,
                    "prompt_used": prompt,
                    "few_shot_used": FEW_SHOT_EXAMPLES.get(genre, FEW_SHOT_EXAMPLES["Science Fiction"])["story"],
                    "story": story
                }
                results.append(result)
                print(" Story generated successfully.")
            else:
                print(" Failed to generate story.")

            time.sleep(2)  # Small delay between generations

    return results


def save_results(results, filename="asg_output.json"):
    """
    Saves the generated stories into a JSON file inside the 'json_outputs' folder.

    Parameters:
        results (list): List of story dictionaries
        filename (str): Name of the output JSON file
    """
    os.makedirs("json_outputs", exist_ok=True)
    filepath = os.path.join("json_outputs", filename)

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n Results saved to {filepath}")


if __name__ == "__main__":
    random.seed(42)  # Set seed for reproducibility

    output = run_asg()
    save_results(output)



[+] Generating stories using model: phi3:3.8b

[1] Prompt: Write a short Horror story about space featuring a intelligent robot.
 Story generated successfully.

[2] Prompt: Generate a Horror tale involving a young scientist and zombie apocalypse.
 Story generated successfully.

[3] Prompt: Write a short Science Fiction story about space featuring a brave explorer.
 Story generated successfully.

[4] Prompt: Write a short Historical story about desert island featuring a wise wizard.
 Story generated successfully.

[5] Prompt: Create a Science Fiction narrative where a wise wizard encounters desert island.
 Story generated successfully.

[6] Prompt: Write a short Romance story about hidden treasure featuring a young scientist.
 Story generated successfully.

[7] Prompt: Write a short Crime story about hidden treasure featuring a intelligent robot.
 Story generated successfully.

[8] Prompt: Write a short Historical story about space featuring a wise wizard.
 Story generated successfully

In [2]:
"""
Abstractive Text Summarization (ATS), Section 3.2
Improved with Chain-of-Thought (CoT) and Few-Shot Prompting
"""

import json
import time
import random
import requests
import os

# Ollama API settings
OLLAMA_URL = "http://localhost:11434/api/generate"

# List of models used
# Changed from ["llama3.2:1b", "qwen2.5vl:3b"] to use phi3:3.8b instead of llama3.2:1b
MODELS = ["phi3:3.8b", "qwen2.5vl:3b"]

# Few-Shot Examples for summarization
FEW_SHOT_EXAMPLES = [
    {
        "story": "In the year 2145, humanity had finally cracked the code to interstellar travel...",
        "summary": "A spaceship crew discovers an alien probe that has been waiting for 1000 years to send a message back to Earth."
    },
    {
        "story": "Once upon a time in a distant village, there was a rumor going around that a group of treasure hunters had stumbled upon a hidden fortune...",
        "summary": "Treasure hunters discover a hidden chocolate bar, proving that the real treasure was their friendship all along."
    },
    {
        "story": "Dr. Emma Taylor and Captain Liam Chen found themselves in 1920s Paris...",
        "summary": "A scientific mission quickly turns into a love story in 1920s Paris."
    }
]

# Prompt templates with CoT + Few-Shot
SUMMARY_PROMPT_TEMPLATES = [
    """
    Step-by-Step Reasoning:
    1. Read the following story carefully.
    2. Identify the main plot and key events.
    3. Paraphrase the story in your own words.
    4. Write a concise one-sentence summary.

    Few-Shot Example:
    Story: {few_shot_story}
    Summary: {few_shot_summary}

    Story:
    {story}

    Summary:
    """,
    """
    Generate a one-sentence summary of the following story. Do not copy sentences directly from the original text.

    Few-Shot Example:
    Story: {few_shot_story}
    Summary: {few_shot_summary}

    Story:
    {story}

    Summary:
    """,
    """
    Write a short, creative summary that captures the essence of this story without copying sentences.

    Step-by-Step:
    - Understand the main events
    - Rephrase in new words
    - Keep it under 50 words

    Few-Shot Example:
    Story: {few_shot_story}
    Summary: {few_shot_summary}

    Story:
    {story}

    Summary:
    """
]

# Decoding hyperparameters
DECODING_PARAMS = {
    "temperature": [0.7, 0.9],   # Lower means more deterministic output
    "top_p": [0.9, 0.95]         # Nucleus sampling threshold
}

def generate_summary(model_name, story_text):
    """
    Calls Ollama API to generate a summary from a given story.
    
    Parameters:
        model_name (str): Name of the LLM to use
        story_text (str): The full story text to summarize
    
    Returns:
        dict: Generated summary, inference time, and validity status
    """
    # Select a few-shot example
    few_shot = random.choice(FEW_SHOT_EXAMPLES)
    few_shot_story = few_shot["story"]
    few_shot_summary = few_shot["summary"]

    # Select a prompt template and format it
    prompt_template = random.choice(SUMMARY_PROMPT_TEMPLATES)
    prompt = prompt_template.format(
        story=story_text,
        few_shot_story=few_shot_story,
        few_shot_summary=few_shot_summary
    )

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": random.choice(DECODING_PARAMS["temperature"]),
            "top_p": random.choice(DECODING_PARAMS["top_p"]),
            "max_tokens": 100,
            "stop": ["\n\n"]
        }
    }

    start_time = time.time()
    try:
        response = requests.post(OLLAMA_URL, json=payload)
        inference_time = round(time.time() - start_time, 2)

        if response.status_code == 200:
            summary = response.json()['response'].strip()

            # Check if summary is valid
            is_valid = len(summary.split()) > 10 and not any(
                prompt in summary for prompt in ["Here is a one-sentence summary", "Here is a concise summary"]
            )

            return {
                "summary": summary,
                "inference_time": inference_time,
                "is_valid": is_valid
            }
        else:
            return {
                "summary": None,
                "inference_time": round(time.time() - start_time, 2),
                "is_valid": False
            }
    except Exception as e:
        return {
            "summary": f"[Error] {e}",
            "inference_time": round(time.time() - start_time, 2),
            "is_valid": False
        }


def run_summarization(input_file="json_outputs/asg_output.json", output_file="json_outputs/ats_output.json"):
    """
    Main function to process all stories and generate summaries.
    """
    # Load stories from ASG output
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            stories = json.load(f)
    except FileNotFoundError:
        print(f"[Error] File '{input_file}' not found. Make sure you have run ASG and saved output in 'json_outputs/asg_output.json'.")
        return

    results = []

    for story in stories:
        print(f"\n[Story ID: {story['story_id']}, Model: {story['model']}]")
        print("Prompt used for summarization:")
        print(f"\"{story.get('prompt_used', 'No prompt used')}\"")  

        summary_data = generate_summary(story["model"], story["story"])

        result = {
            "original_model": story["model"],
            "story_id": story["story_id"],
            "genre": story["genre"],
            "topic": story["topic"],
            "original_prompt": story.get("prompt_used", ""),  
            "original_story": story["story"],
            "summary": summary_data["summary"],
            "inference_time": summary_data["inference_time"],
            "is_valid": summary_data["is_valid"]
        }

        results.append(result)

        if summary_data["is_valid"]:
            print(" Summary generated successfully.")
        else:
            print(" Invalid or incomplete summary.")

        time.sleep(2)  

    # Save results to JSON
    os.makedirs("json_outputs", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n Summaries saved to {output_file}")


if __name__ == "__main__":
    random.seed(42)  # Set seed for reproducibility
    run_summarization()


[Story ID: 1, Model: phi3:3.8b]
Prompt used for summarization:
"Write a short Horror story about space featuring a intelligent robot."
 Summary generated successfully.

[Story ID: 2, Model: phi3:3.8b]
Prompt used for summarization:
"Generate a Horror tale involving a young scientist and zombie apocalypse."
 Summary generated successfully.

[Story ID: 3, Model: phi3:3.8b]
Prompt used for summarization:
"Write a short Science Fiction story about space featuring a brave explorer."
 Summary generated successfully.

[Story ID: 4, Model: phi3:3.8b]
Prompt used for summarization:
"Write a short Historical story about desert island featuring a wise wizard."
 Summary generated successfully.

[Story ID: 5, Model: phi3:3.8b]
Prompt used for summarization:
"Create a Science Fiction narrative where a wise wizard encounters desert island."
 Summary generated successfully.

[Story ID: 6, Model: phi3:3.8b]
Prompt used for summarization:
"Write a short Romance story about hidden treasure featuring a y

In [8]:
"""
Natural Language Inference (NLI) - NLP Assignment 4, Section 3.3
Running Classification using Phi3.3.8b
"""

import json
import time
import random
import requests
import os
import pandas as pd

# Ollama API endpoint
OLLAMA_URL = "http://localhost:11434/api/generate"

# Model to be used for NLI task
# As per assignment instructions, this is one of the two required models.
# Phi3.3.8b was selected as it showed good performance in initial tests.
MODELS = ["phi3:3.8b"]

# Standard labels expected from the LLM for the NLI task
MODEL_NLI_LABELS = ["entailment", "contradiction", "neutral"]

# Mapping to normalize labels from the dataset to match model outputs.
# The dataset might use 'entails'/'contradicts', while the model outputs 'entailment'/'contradiction'.
# This ensures accurate calculation of accuracy.
LABEL_MAPPING = {
    "entails": "entailment",
    "entailment": "entailment",
    "contradicts": "contradiction",
    "contradiction": "contradiction",
    "neutral": "neutral"
}

# Prompt template for the NLI task.
# Provides a clear definition of the task and expects a single-word answer.
STANDARD_PROMPT_TEMPLATE = """
You are an expert at NLI (Natural Language Inference). Given a premise and a hypothesis, determine if the relationship is:
- entailment (the hypothesis must be true if the premise is true)
- contradiction (the hypothesis must be false if the premise is true)
- neutral (the hypothesis is neither entailed nor contradicted by the premise)

Answer ONLY with one of these three words: entailment, contradiction, neutral

Premise: {premise}
Hypothesis: {hypothesis}
Relationship:
""".strip()

# Decoding parameters to ensure consistent and concise outputs for classification.
# Low temperature for determinism, stop tokens to prevent extra text.
STANDARD_DECODING_PARAMS = {
    "temperature": 0.1,
    "top_p": 0.9,
    "max_tokens": 15,
    "stop": ["\n", ".", " "]
}

def classify_nli_standard(model_name, premise, hypothesis):
    """
    Classifies the relationship between a premise and hypothesis using an LLM via Ollama API.

    Args:
        model_name (str): The name of the LLM to use (e.g., "phi3:3.8b").
        premise (str): The premise sentence.
        hypothesis (str): The hypothesis sentence.

    Returns:
        dict: A dictionary containing the predicted label, raw model output,
              inference time, and validity status.
              - predicted_label (str): The extracted NLI label (entailment, contradiction, neutral) or raw output if invalid.
              - raw_output (str): The complete, unprocessed text returned by the model.
              - inference_time (float): The time taken for the model to respond, in seconds.
              - is_valid (bool): True if a valid label was successfully extracted, False otherwise.
    """
    # Format the prompt with the current premise and hypothesis
    prompt = STANDARD_PROMPT_TEMPLATE.format(premise=premise, hypothesis=hypothesis)

    # Prepare the payload for the Ollama API request
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": STANDARD_DECODING_PARAMS
    }

    start_time = time.time()
    try:
        # Send the request to the Ollama API
        response = requests.post(OLLAMA_URL, json=payload)
        inference_time = round(time.time() - start_time, 2)

        # Check if the request was successful
        if response.status_code == 200:
            raw_output = response.json()['response'].strip().lower()
            
            # --- Parsing Logic ---
            # 1. Look for the label at the very beginning of the output (most reliable)
            predicted_label = None
            for label in MODEL_NLI_LABELS:
                if raw_output.startswith(label):
                    predicted_label = label
                    break

            # 2. If not found at the start, check if the label appears anywhere in the output (fallback)
            if not predicted_label:
                 for label in MODEL_NLI_LABELS:
                     if label in raw_output:
                         predicted_label = label
                         break
            
            # 3. Determine if the prediction is valid (i.e., a label was found)
            is_valid = predicted_label is not None
            
            # 4. If no valid label was found, store the raw output for debugging
            if not is_valid:
                predicted_label = raw_output 

            return {
                "predicted_label": predicted_label,
                "raw_output": raw_output,
                "inference_time": inference_time,
                "is_valid": is_valid
            }
        else:
            # Handle unsuccessful API request
            return {
                "predicted_label": None,
                "raw_output": f"[Error] Status {response.status_code}",
                "inference_time": inference_time,
                "is_valid": False
            }
    except Exception as e:
        # Handle any exceptions during the API call
        return {
            "predicted_label": None,
            "raw_output": f"[Exception] {e}",
            "inference_time": round(time.time() - start_time, 2),
            "is_valid": False
        }


def run_nli_classification_standard(input_file="datasets/nli/nli.csv", output_file="json_outputs/nli_output_phi3.json"):
    """
    Main function to load the full NLI dataset, run classification for the specified model,
    calculate performance metrics, and save the detailed results.

    Args:
        input_file (str): Path to the input CSV file containing the NLI data (premise, hypothesis, label).
                          As per the assignment PDF, this truncated dataset contains exactly 100 items.
        output_file (str): Path to the output JSON file where results and metrics will be saved.
    """
    # --- 1. Load Data ---
    try:
        # Load the NLI data from the provided CSV file
        df = pd.read_csv(input_file)
        # Convert the DataFrame to a list of dictionaries for easy iteration
        nli_data = df.to_dict('records')
        print(f"[Info] Loaded {len(nli_data)} items from the dataset.")
    except FileNotFoundError:
        print(f"[Error] File '{input_file}' not found. Please check the file path.")
        return
    except Exception as e:
        print(f"[Error] Failed to read the CSV file: {e}")
        return

    # This will store the final results for all models (though we are only using one here)
    all_results = []

    # --- 2. Run Classification for Each Model ---
    for model in MODELS:
        print(f"\n{'='*70}\n[+] Running NLI Classification using model: {model} on FULL dataset ({len(nli_data)} examples)\n{'='*70}")
        
        # Store results and counters for this specific model
        model_results = []
        correct_predictions = 0
        total_valid = 0
        total_inference_time = 0.0  # For calculating average inference time
        
        # --- 3. Iterate Through Dataset ---
        for i, item in enumerate(nli_data):
            # Extract data for the current item
            premise = item["premise"]
            hypothesis = item["hypothesis"]
            true_label_raw = item["label"] 
            
            # Normalize the true label from the dataset to match our standard labels
            true_label_normalized = LABEL_MAPPING.get(true_label_raw, true_label_raw)

            print(f"\n[{i+1:03d}/{len(nli_data)}] Classifying...")

            # Get the model's prediction
            prediction_data = classify_nli_standard(model, premise, hypothesis)
            
            # Store detailed information for this prediction
            result = {
                "item_id": i + 1,
                "premise": premise,
                "hypothesis": hypothesis,
                "true_label_raw": true_label_raw,
                "true_label_normalized": true_label_normalized,
                "predicted_label": prediction_data["predicted_label"],
                "raw_output": prediction_data["raw_output"],
                "inference_time": prediction_data["inference_time"],
                "is_valid": prediction_data["is_valid"]
            }
            model_results.append(result)

            # --- 4. Evaluate Prediction ---
            if prediction_data["is_valid"]:
                # If the model produced a valid label, count it
                total_valid += 1
                
                # Normalize the predicted label (should already be standard, but for safety)
                pred_label_normalized = LABEL_MAPPING.get(prediction_data["predicted_label"], prediction_data["predicted_label"])
                
                # Check if the prediction is correct
                if pred_label_normalized == true_label_normalized:
                    correct_predictions += 1
                    is_prediction_correct = True
                else:
                    is_prediction_correct = False
                    
                print(f"  -> Predicted: '{prediction_data['predicted_label']}' (True: {true_label_raw}) - {'Correct' if is_prediction_correct else 'Incorrect'}")
                
                # Add to total inference time for valid predictions
                total_inference_time += prediction_data["inference_time"]
            else:
                # If the model failed to produce a parseable label, note it as an invalid prediction
                print(f"  -> Invalid prediction: '{prediction_data['raw_output']}' (True: {true_label_raw})")

            # Small delay to prevent potential issues with rapid API calls
            time.sleep(0.1) 

        # --- 5. Calculate Metrics ---
        # Accuracy is defined as the number of correct predictions divided by the number of valid predictions.
        # This aligns with the assignment's definition: "Accuracy = Correct Valid Predictions / Total Valid Predictions"
        accuracy = correct_predictions / total_valid if total_valid > 0 else 0
        
        # Reject Rate is defined as the number of invalid predictions divided by the total number of items.
        # This aligns with the assignment's definition: "Reject Rate = Invalid Predictions / Total Items"
        reject_rate = (len(model_results) - total_valid) / len(model_results) if len(model_results) > 0 else 0
        
        # Calculate average inference time for valid predictions
        average_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
        average_inference_time = round(average_inference_time, 2)
        
        # --- 6. Store Final Results for this Model ---
        final_result = {
            "model": model,
            "accuracy": round(accuracy, 4),
            "reject_rate": round(reject_rate, 4),
            "average_inference_time": average_inference_time,  # Added as per assignment requirement
            "total_items": len(model_results),
            "valid_predictions": total_valid,
            "correct_predictions": correct_predictions,
            "predictions": model_results # Includes detailed info for each of the 100 items
        }
        
        all_results.append(final_result)
        
        # --- 7. Print Summary ---
        print(f"\n--- Summary for Model: {model} (Full Dataset, Standard Approach) ---")
        print(f"Accuracy (on valid predictions): {accuracy:.4f}")
        print(f"Reject Rate: {reject_rate:.4f}")
        print(f"Average Inference Time: {average_inference_time:.2f} seconds")
        print(f"Valid Predictions: {total_valid}/{len(model_results)}")

    # --- 8. Save Results to File ---
    os.makedirs("json_outputs", exist_ok=True)
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n[Success] NLI predictions and metrics saved to '{output_file}'")
    except Exception as e:
        print(f"[Error] Failed to save results to '{output_file}': {e}")


# --- Entry Point ---
# This section executes when the script is run directly.
if __name__ == "__main__":
    # Set a random seed for reproducibility, as required by the assignment.
    # Using the same seed, model, and prompt should yield identical outputs.
    random.seed(42)
    
    # Run the main NLI classification function on the full dataset.
    run_nli_classification_standard()


[Info] Loaded 100 items from the dataset.

[+] Running NLI Classification using model: phi3:3.8b on FULL dataset (100 examples)

[001/100] Classifying...
  -> Predicted: 'contradiction' (True: contradicts) - Correct

[002/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[003/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[004/100] Classifying...
  -> Predicted: 'neutral' (True: contradicts) - Incorrect

[005/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[006/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[007/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[008/100] Classifying...
  -> Predicted: 'contradiction' (True: contradicts) - Correct

[009/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[010/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[011/100] Classifying...
  -> Predicted: 'entailment' (T

In [7]:
"""
Natural Language Inference (NLI) - NLP Assignment 4, Section 3.3
Running Classification using Qwen2.5vl:3b
"""

import json
import time
import random
import requests
import os
import pandas as pd

# Ollama API endpoint
OLLAMA_URL = "http://localhost:11434/api/generate"

# Model to be used for NLI task
# As per assignment instructions, this is one of the two required models.
# Qwen2.5vl:3b was selected as the second model for comparison.
MODELS = ["qwen2.5vl:3b"]

# Standard labels expected from the LLM for the NLI task
MODEL_NLI_LABELS = ["entailment", "contradiction", "neutral"]

# Mapping to normalize labels from the dataset to match model outputs.
# The dataset might use 'entails'/'contradicts', while the model outputs 'entailment'/'contradiction'.
# This ensures accurate calculation of accuracy.
LABEL_MAPPING = {
    "entails": "entailment",
    "entailment": "entailment",
    "contradicts": "contradiction",
    "contradiction": "contradiction",
    "neutral": "neutral"
}

# Prompt template for the NLI task.
# Provides a clear definition of the task and expects a single-word answer.
STANDARD_PROMPT_TEMPLATE = """
You are an expert at NLI (Natural Language Inference). Given a premise and a hypothesis, determine if the relationship is:
- entailment (the hypothesis must be true if the premise is true)
- contradiction (the hypothesis must be false if the premise is true)
- neutral (the hypothesis is neither entailed nor contradicted by the premise)

Answer ONLY with one of these three words: entailment, contradiction, neutral

Premise: {premise}
Hypothesis: {hypothesis}
Relationship:
""".strip()

# Decoding parameters to ensure consistent and concise outputs for classification.
# Low temperature for determinism, stop tokens to prevent extra text.
STANDARD_DECODING_PARAMS = {
    "temperature": 0.1,
    "top_p": 0.9,
    "max_tokens": 15,
    "stop": ["\n", ".", " "]
}

def classify_nli_standard(model_name, premise, hypothesis):
    """
    Classifies the relationship between a premise and hypothesis using an LLM via Ollama API.

    Args:
        model_name (str): The name of the LLM to use (e.g., "qwen2.5vl:3b").
        premise (str): The premise sentence.
        hypothesis (str): The hypothesis sentence.

    Returns:
        dict: A dictionary containing the predicted label, raw model output,
              inference time, and validity status.
              - predicted_label (str): The extracted NLI label (entailment, contradiction, neutral) or raw output if invalid.
              - raw_output (str): The complete, unprocessed text returned by the model.
              - inference_time (float): The time taken for the model to respond, in seconds.
              - is_valid (bool): True if a valid label was successfully extracted, False otherwise.
    """
    # Format the prompt with the current premise and hypothesis
    prompt = STANDARD_PROMPT_TEMPLATE.format(premise=premise, hypothesis=hypothesis)

    # Prepare the payload for the Ollama API request
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "options": STANDARD_DECODING_PARAMS
    }

    start_time = time.time()
    try:
        # Send the request to the Ollama API
        response = requests.post(OLLAMA_URL, json=payload)
        inference_time = round(time.time() - start_time, 2)

        # Check if the request was successful
        if response.status_code == 200:
            raw_output = response.json()['response'].strip().lower()
            
            # --- Parsing Logic ---
            # 1. Look for the label at the very beginning of the output (most reliable)
            predicted_label = None
            for label in MODEL_NLI_LABELS:
                if raw_output.startswith(label):
                    predicted_label = label
                    break

            # 2. If not found at the start, check if the label appears anywhere in the output (fallback)
            if not predicted_label:
                 for label in MODEL_NLI_LABELS:
                     if label in raw_output:
                         predicted_label = label
                         break
            
            # 3. Determine if the prediction is valid (i.e., a label was found)
            is_valid = predicted_label is not None
            
            # 4. If no valid label was found, store the raw output for debugging
            if not is_valid:
                predicted_label = raw_output 

            return {
                "predicted_label": predicted_label,
                "raw_output": raw_output,
                "inference_time": inference_time,
                "is_valid": is_valid
            }
        else:
            # Handle unsuccessful API request
            return {
                "predicted_label": None,
                "raw_output": f"[Error] Status {response.status_code}",
                "inference_time": inference_time,
                "is_valid": False
            }
    except Exception as e:
        # Handle any exceptions during the API call
        return {
            "predicted_label": None,
            "raw_output": f"[Exception] {e}",
            "inference_time": round(time.time() - start_time, 2),
            "is_valid": False
        }


def run_nli_classification_standard(input_file="datasets/nli/nli.csv", output_file="json_outputs/nli_output_qwen.json"):
    """
    Main function to load the full NLI dataset, run classification for the specified model,
    calculate performance metrics, and save the detailed results.

    Args:
        input_file (str): Path to the input CSV file containing the NLI data (premise, hypothesis, label).
                          As per the assignment PDF, this truncated dataset contains exactly 100 items.
        output_file (str): Path to the output JSON file where results and metrics will be saved.
    """
    # --- 1. Load Data ---
    try:
        # Load the NLI data from the provided CSV file
        df = pd.read_csv(input_file)
        # Convert the DataFrame to a list of dictionaries for easy iteration
        nli_data = df.to_dict('records')
        print(f"[Info] Loaded {len(nli_data)} items from the dataset.")
    except FileNotFoundError:
        print(f"[Error] File '{input_file}' not found. Please check the file path.")
        return
    except Exception as e:
        print(f"[Error] Failed to read the CSV file: {e}")
        return

    # This will store the final results for all models (though we are only using one here)
    all_results = []

    # --- 2. Run Classification for Each Model ---
    for model in MODELS:
        print(f"\n{'='*70}\n[+] Running NLI Classification using model: {model} on FULL dataset ({len(nli_data)} examples)\n{'='*70}")
        
        # Store results and counters for this specific model
        model_results = []
        correct_predictions = 0
        total_valid = 0
        total_inference_time = 0.0  # For calculating average inference time
        
        # --- 3. Iterate Through Dataset ---
        for i, item in enumerate(nli_data):
            # Extract data for the current item
            premise = item["premise"]
            hypothesis = item["hypothesis"]
            true_label_raw = item["label"] 
            
            # Normalize the true label from the dataset to match our standard labels
            true_label_normalized = LABEL_MAPPING.get(true_label_raw, true_label_raw)

            print(f"\n[{i+1:03d}/{len(nli_data)}] Classifying...")

            # Get the model's prediction
            prediction_data = classify_nli_standard(model, premise, hypothesis)
            
            # Store detailed information for this prediction
            result = {
                "item_id": i + 1,
                "premise": premise,
                "hypothesis": hypothesis,
                "true_label_raw": true_label_raw,
                "true_label_normalized": true_label_normalized,
                "predicted_label": prediction_data["predicted_label"],
                "raw_output": prediction_data["raw_output"],
                "inference_time": prediction_data["inference_time"],
                "is_valid": prediction_data["is_valid"]
            }
            model_results.append(result)

            # --- 4. Evaluate Prediction ---
            if prediction_data["is_valid"]:
                # If the model produced a valid label, count it
                total_valid += 1
                
                # Normalize the predicted label (should already be standard, but for safety)
                pred_label_normalized = LABEL_MAPPING.get(prediction_data["predicted_label"], prediction_data["predicted_label"])
                
                # Check if the prediction is correct
                if pred_label_normalized == true_label_normalized:
                    correct_predictions += 1
                    is_prediction_correct = True
                else:
                    is_prediction_correct = False
                    
                print(f"  -> Predicted: '{prediction_data['predicted_label']}' (True: {true_label_raw}) - {'Correct' if is_prediction_correct else 'Incorrect'}")
                
                # Add to total inference time for valid predictions
                total_inference_time += prediction_data["inference_time"]
            else:
                # If the model failed to produce a parseable label, note it as an invalid prediction
                print(f"  -> Invalid prediction: '{prediction_data['raw_output']}' (True: {true_label_raw})")

            # Small delay to prevent potential issues with rapid API calls
            time.sleep(0.1) 

        # --- 5. Calculate Metrics ---
        # Accuracy is defined as the number of correct predictions divided by the number of valid predictions.
        # This aligns with the assignment's definition: "Accuracy = Correct Valid Predictions / Total Valid Predictions"
        accuracy = correct_predictions / total_valid if total_valid > 0 else 0
        
        # Reject Rate is defined as the number of invalid predictions divided by the total number of items.
        # This aligns with the assignment's definition: "Reject Rate = Invalid Predictions / Total Items"
        reject_rate = (len(model_results) - total_valid) / len(model_results) if len(model_results) > 0 else 0
        
        # Calculate average inference time for valid predictions
        average_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
        average_inference_time = round(average_inference_time, 2)
        
        # --- 6. Store Final Results for this Model ---
        final_result = {
            "model": model,
            "accuracy": round(accuracy, 4),
            "reject_rate": round(reject_rate, 4),
            "average_inference_time": average_inference_time,  # Added as per assignment requirement
            "total_items": len(model_results),
            "valid_predictions": total_valid,
            "correct_predictions": correct_predictions,
            "predictions": model_results # Includes detailed info for each of the 100 items
        }
        
        all_results.append(final_result)
        
        # --- 7. Print Summary ---
        print(f"\n--- Summary for Model: {model} (Full Dataset, Standard Approach) ---")
        print(f"Accuracy (on valid predictions): {accuracy:.4f}")
        print(f"Reject Rate: {reject_rate:.4f}")
        print(f"Average Inference Time: {average_inference_time:.2f} seconds")
        print(f"Valid Predictions: {total_valid}/{len(model_results)}")

    # --- 8. Save Results to File ---
    os.makedirs("json_outputs", exist_ok=True)
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n[Success] NLI predictions and metrics saved to '{output_file}'")
    except Exception as e:
        print(f"[Error] Failed to save results to '{output_file}': {e}")


# --- Entry Point ---
# This section executes when the script is run directly.
if __name__ == "__main__":
    # Set a random seed for reproducibility, as required by the assignment.
    # Using the same seed, model, and prompt should yield identical outputs.
    random.seed(42)
    
    # Run the main NLI classification function on the full dataset.
    run_nli_classification_standard()


[Info] Loaded 100 items from the dataset.

[+] Running NLI Classification using model: qwen2.5vl:3b on FULL dataset (100 examples)

[001/100] Classifying...
  -> Predicted: 'contradiction' (True: contradicts) - Correct

[002/100] Classifying...
  -> Predicted: 'entailment' (True: neutral) - Incorrect

[003/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[004/100] Classifying...
  -> Predicted: 'contradiction' (True: contradicts) - Correct

[005/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[006/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[007/100] Classifying...
  -> Predicted: 'entailment' (True: entails) - Correct

[008/100] Classifying...
  -> Predicted: 'contradiction' (True: contradicts) - Correct

[009/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[010/100] Classifying...
  -> Predicted: 'neutral' (True: neutral) - Correct

[011/100] Classifying...
  -> Predicted: 'en

In [None]:
"""
Image Captioning (IC), Section 3.4
"""

import json
import time
import random
import requests
import os
import pandas as pd
import base64
from pycocoevalcap.cider.cider import Cider

# Ollama API endpoint
OLLAMA_URL = "http://localhost:11434/api/generate"

# Vision-capable model as required
MODELS = ["qwen2.5vl:3b"]

# Caption generation prompts
IC_PROMPT_TEMPLATES = [
    "Describe the image in a single sentence. Image Description:",
    "Write a concise caption for the image. Image Description:",
    "Generate a natural language description for the given image. Image Description:"
]

# Generation parameters
IC_DECODING_PARAMS = {
    "temperature": 0.7,
    "top_p": 0.9
}

def encode_image_to_base64(image_path):
    """Convert image to base64 string for API."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def generate_caption(model_name, image_path):
    """Generate caption for image using Ollama API."""
    prompt = random.choice(IC_PROMPT_TEMPLATES)
    image_base64 = encode_image_to_base64(image_path)

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "images": [image_base64],
        "options": {
            "temperature": IC_DECODING_PARAMS["temperature"],
            "top_p": IC_DECODING_PARAMS["top_p"],
            "max_tokens": 50,
            "stop": ["\n", "."]
        }
    }

    start_time = time.time()
    try:
        response = requests.post(OLLAMA_URL, json=payload)
        inference_time = round(time.time() - start_time, 2)

        if response.status_code == 200:
            caption = response.json()['response'].strip()
            is_valid = len(caption.split()) > 5 and not any(
                prompt in caption for prompt in ["Here is a caption", "Caption:", "Image Description:"]
            )
            return {"caption": caption, "inference_time": inference_time, "is_valid": is_valid}
        return {"caption": None, "inference_time": inference_time, "is_valid": False}
    except Exception as e:
        return {
            "caption": f"[Error] {e}",
            "inference_time": round(time.time() - start_time, 2),
            "is_valid": False
        }

def calculate_cider_scores(generated_captions, reference_captions):
    """Calculate CIDEr scores against reference captions."""
    res = [{"image_id": img_id, "caption": caption} for img_id, caption in generated_captions.items()]
    gts = {img_id: captions for img_id, captions in reference_captions.items()}
    
    cider_score, cider_scores = Cider().compute_score(gts, res)
    
    cider_details = {img_id: float(score) for img_id, score in zip(generated_captions.keys(), cider_scores)}
    return float(cider_score), cider_details

def run_image_captioning(input_file="datasets/ic/ic.csv", images_dir="datasets/ic/images", output_file="json_outputs/ic_output.json"):
    """Process images, generate captions, and evaluate with CIDEr."""
    try:
        df = pd.read_csv(input_file)
        print(f"[Info] Loaded {len(df)} images from dataset.")
    except FileNotFoundError:
        print(f"[Error] File '{input_file}' not found.")
        return
    except Exception as e:
        print(f"[Error] Failed to read CSV: {e}")
        return

    results = []
    generated_captions = {}
    reference_captions = {}

    for model in MODELS:
        print(f"\n{'='*60}\n[+] Running Image Captioning with {model}\n{'='*60}")
        
        model_results = []
        total_valid = 0
        total_inference_time = 0.0
        
        for i, row in df.iterrows():
            # Use the 'image' column as the image ID 
            image_id = str(row["image"])
            image_path = os.path.join(images_dir, image_id)  
            
            # Check if the image file actually exists
            if not os.path.exists(image_path):
                print(f"[Error] Image file not found: {image_path}")
                continue
            
            # Store 5 reference captions per image from the 'human_captions' column
            try:
                ref_captions = eval(row["human_captions"])
                if not isinstance(ref_captions, list) or len(ref_captions) != 5:
                    raise ValueError("Not a valid list of 5 captions")
            except:
                # Fallback in case the parsing fails
                ref_captions = [""] * 5
            
            reference_captions[image_id] = ref_captions
            
            print(f"\n[{i+1:03d}] Processing image: {image_path}")
            
            caption_data = generate_caption(model, image_path)
            
            result = {
                "model": model,
                "image_id": image_id,
                "image_path": image_path,
                "caption": caption_data["caption"],
                "inference_time": caption_data["inference_time"],
                "is_valid": caption_data["is_valid"]
            }
            model_results.append(result)
            
            if caption_data["is_valid"]:
                total_valid += 1
                total_inference_time += caption_data["inference_time"]
                generated_captions[image_id] = caption_data["caption"]
                print(" Caption generated successfully.")
            else:
                print(" Invalid caption.")
            
            time.sleep(1)

        # Calculate evaluation metrics
        reject_rate = (len(model_results) - total_valid) / len(model_results) if len(model_results) > 0 else 0
        reject_rate = round(reject_rate, 4)
        
        average_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
        average_inference_time = round(average_inference_time, 2)
        
        cider_score = 0.0
        cider_details = {}
        if total_valid > 0:
            try:
                cider_score, cider_details = calculate_cider_scores(generated_captions, reference_captions)
                cider_score = round(cider_score, 4)
            except Exception as e:
                print(f"[Error] CIDEr calculation failed: {e}")
        
        # Store results
        final_result = {
            "model": model,
            "total_images": len(df),
            "valid_captions": total_valid,
            "reject_rate": reject_rate,
            "average_inference_time": average_inference_time,
            "cider_score": cider_score,
            "cider_details": cider_details,
            "predictions": model_results
        }
        results.append(final_result)
        
        # Print summary
        print(f"\n--- Results for {model} ---")
        print(f"Total Images: {len(df)}")
        print(f"Valid Captions: {total_valid}/{len(df)}")
        print(f"Reject Rate: {reject_rate:.4f}")
        print(f"Average Inference Time: {average_inference_time:.2f} seconds")
        print(f"CIDEr Score: {cider_score:.4f}")

    # Save results
    os.makedirs("json_outputs", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n[Success] Results saved to {output_file}")

if __name__ == "__main__":
    random.seed(42)
    
    run_image_captioning()

In [3]:
"""
Merge JSON output files from Image Captioning subsets and calculate overall metrics
"""

import json
import os
import numpy as np
from pycocoevalcap.cider.cider import Cider

def merge_json_files(json_files):
    """Merge multiple JSON files into a single structure with all predictions."""
    all_predictions = []
    generated_captions = {}
    reference_captions = {}
    
    total_images = 0
    total_valid = 0
    total_inference_time = 0.0
    
    for file_path in json_files:
        with open(file_path, 'r') as f:
            data = json.load(f)
            
            # Process each model result in the file (usually just one model)
            for model_result in data:
                total_images += model_result["total_images"]
                total_valid += model_result["valid_captions"]
                total_inference_time += model_result["valid_captions"] * model_result["average_inference_time"]
                
                # Collect all predictions
                for prediction in model_result["predictions"]:
                    all_predictions.append(prediction)
                    
                    if prediction["is_valid"]:
                        generated_captions[prediction["image_id"]] = prediction["caption"]
                    
                    # Extract reference captions from the prediction structure
                    # This assumes the reference captions are stored in a consistent way
                    # You might need to adjust this part based on your actual data structure
                    if "reference_captions" in prediction:  # If directly stored
                        reference_captions[prediction["image_id"]] = prediction["reference_captions"]
                    else:
                        # Try to extract from the human_captions column if available
                        try:
                            # This is a simplified approach - you might need to adjust based on your actual data
                            ref_captions = eval(prediction["image_id"].split('.')[0] + "_ref_captions")
                            if isinstance(ref_captions, list) and len(ref_captions) == 5:
                                reference_captions[prediction["image_id"]] = ref_captions
                        except:
                            pass
    
    # Calculate overall metrics
    reject_rate = (total_images - total_valid) / total_images if total_images > 0 else 0
    average_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
    
    return {
        "all_predictions": all_predictions,
        "generated_captions": generated_captions,
        "reference_captions": reference_captions,
        "total_images": total_images,
        "total_valid": total_valid,
        "reject_rate": reject_rate,
        "average_inference_time": average_inference_time
    }

def calculate_overall_cider(generated_captions, reference_captions):
    """Calculate CIDEr score for the entire dataset."""
    # Ensure we're working with dictionaries
    if not isinstance(generated_captions, dict) or not isinstance(reference_captions, dict):
        print("[Error] Invalid data format for CIDEr calculation")
        return 0.0, {}
    
    # CRITICAL CHANGE: Proper formatting for pycocoevalcap
    # In COCO evaluation format:
    # - res should be {img_id: [caption1, caption2, ...]} but we have only one caption per image
    # - gts should be {img_id: [ref_caption1, ref_caption2, ..., ref_caption5]}
    res = {str(img_id): [caption] for img_id, caption in generated_captions.items()}
    gts = {str(img_id): captions for img_id, captions in reference_captions.items()}
    
    # Ensure all image IDs are the same in both structures
    common_ids = set(gts.keys()) & set(res.keys())
    
    if not common_ids:
        print("[Error] No common image IDs found between generated and reference captions")
        return 0.0, {}
    
    # Filter to keep only common IDs
    filtered_res = {img_id: res[img_id] for img_id in common_ids}
    filtered_gts = {img_id: gts[img_id] for img_id in common_ids}
    
    try:
        # Calculate CIDEr score
        cider = Cider()
        cider_score, cider_scores = cider.compute_score(filtered_gts, filtered_res)
        
        # Handle different return types of cider_scores
        cider_details = {}
        if isinstance(cider_scores, list):
            # If it's a list, map it to image IDs
            for i, img_id in enumerate(common_ids):
                cider_details[img_id] = float(cider_scores[i])
        elif isinstance(cider_scores, dict):
            # If it's a dict, use it directly
            cider_details = {img_id: float(score) for img_id, score in cider_scores.items()}
        elif isinstance(cider_scores, float):
            # If it's a single float, apply to all images
            cider_details = {img_id: cider_scores for img_id in common_ids}
            cider_score = cider_scores
        else:
            # Fallback: assign the same score to all images
            cider_details = {img_id: float(cider_score) for img_id in common_ids}
            
        return float(cider_score), cider_details
    except Exception as e:
        print(f"[Error] CIDEr calculation failed: {str(e)}")
        # Return default values in case of error
        return 0.0, {img_id: 0.0 for img_id in common_ids}

def load_reference_captions_from_csv(csv_file="datasets/ic/ic.csv"):
    """Load reference captions from the original CSV file."""
    import pandas as pd
    
    try:
        df = pd.read_csv(csv_file)
        reference_captions = {}
        
        for _, row in df.iterrows():
            image_id = str(row["image"])
            try:
                # Safely evaluate the human_captions string
                ref_captions = eval(row["human_captions"])
                if isinstance(ref_captions, list) and len(ref_captions) == 5:
                    reference_captions[image_id] = ref_captions
            except:
                # Fallback in case the parsing fails
                reference_captions[image_id] = [""] * 5
                
        return reference_captions
    except Exception as e:
        print(f"[Error] Failed to load reference captions from CSV: {e}")
        return {}

def main():
    # List of JSON output files to merge
    json_files = [
        "json_outputs/ic_output_subset_10.json",
        "json_outputs/ic_output_subset_11_20.json",
        "json_outputs/ic_output_subset_21_30.json",
        "json_outputs/ic_output_subset_31_40.json",
        "json_outputs/ic_output_subset_41_50.json",
        "json_outputs/ic_output_subset_51_60.json",
        "json_outputs/ic_output_subset_61_70.json",
        "json_outputs/ic_output_subset_71_80.json",
        "json_outputs/ic_output_subset_81_90.json",
        "json_outputs/ic_output_subset_91_100.json"
    ]
    
    # Filter out files that don't exist
    existing_files = [f for f in json_files if os.path.exists(f)]
    if not existing_files:
        print("[Error] No JSON files found. Please check the file paths.")
        return
    
    print(f"[Info] Found {len(existing_files)} JSON files to merge.")
    
    # Step 1: Merge the JSON files and calculate basic metrics
    merged_data = merge_json_files(existing_files)
    
    # Step 2: Load reference captions from CSV (more reliable than from JSON files)
    reference_captions = load_reference_captions_from_csv()
    
    # Step 3: Calculate overall CIDEr score
    print("\n[Info] Calculating overall CIDEr score for the entire dataset...")
    cider_score, cider_details = calculate_overall_cider(
        merged_data["generated_captions"], 
        reference_captions
    )
    cider_score = round(cider_score, 4)
    
    # Step 4: Create final results structure
    final_result = {
        "model": "qwen2.5vl:3b",  # Assuming all subsets used the same model
        "total_images": merged_data["total_images"],
        "valid_captions": merged_data["total_valid"],
        "reject_rate": round(merged_data["reject_rate"], 4),
        "average_inference_time": round(merged_data["average_inference_time"], 2),
        "cider_score": cider_score,
        "cider_details": cider_details,
        "predictions": merged_data["all_predictions"]
    }
    
    # Step 5: Print summary
    print(f"\n{'='*60}")
    print(f"[+] Overall Results for Image Captioning (All 100 Images)")
    print(f"{'='*60}")
    print(f"Total Images: {final_result['total_images']}")
    print(f"Valid Captions: {final_result['valid_captions']}/{final_result['total_images']}")
    print(f"Reject Rate: {final_result['reject_rate']:.4f}")
    print(f"Average Inference Time: {final_result['average_inference_time']:.2f} seconds")
    print(f"CIDEr Score: {final_result['cider_score']:.4f}")
    
    # Step 6: Save results
    output_file = "json_outputs/ic_output.json"
    os.makedirs("json_outputs", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump([final_result], f, ensure_ascii=False, indent=2)
    
    print(f"\n[Success] Overall results saved to {output_file}")

if __name__ == "__main__":
    try:
        from pycocoevalcap.cider.cider import Cider
        print("[Info] CIDEr evaluation libraries installed.")
    except ImportError:
        print("[Error] CIDEr libraries not installed.")
        print("Install with: pip install pycocoevalcap==1.2")
        exit(1)
    
    main()

[Info] CIDEr evaluation libraries installed.
[Info] Found 10 JSON files to merge.

[Info] Calculating overall CIDEr score for the entire dataset...

[+] Overall Results for Image Captioning (All 100 Images)
Total Images: 100
Valid Captions: 100/100
Reject Rate: 0.0000
Average Inference Time: 146.74 seconds
CIDEr Score: 0.6318

[Success] Overall results saved to json_outputs/ic_output.json


In [None]:
"""
Visual Question Answering (VQA), Section 3.5
Full Dataset Evaluation
"""

import json
import time
import random
import requests
import os
import pandas as pd
import base64
import re

# Ollama API endpoint
OLLAMA_URL = "http://localhost:11434/api/generate"

# Vision-capable model as required
MODELS = ["qwen2.5vl:3b"]

# Simple prompt template for VQA
VQA_PROMPT_TEMPLATE = """
Look at the image and answer the question by selecting ONLY ONE LETTER from A, B, C, D, or E.

Question: {question}
Options: {options}

Think carefully and then write ONLY the letter of your answer (A, B, C, D, or E):
""".strip()

def encode_image_to_base64(image_path):
    """Convert image to base64 string for API."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_answer(response_text):
    """Extract the answer letter (A-E) from model's response."""
    # Normalize response text
    response_text = response_text.strip().upper()
    
    # Strategy 1: Look for the answer at the very end
    last_chars = response_text[-10:] if len(response_text) > 10 else response_text
    for char in reversed(last_chars):
        if char in ['A', 'B', 'C', 'D', 'E']:
            return char
    
    # Strategy 2: Look for the answer at the very beginning
    first_chars = response_text[:10] if len(response_text) > 10 else response_text
    for char in first_chars:
        if char in ['A', 'B', 'C', 'D', 'E']:
            return char
            
    # Strategy 3: Look for patterns like "Answer: C" or "The answer is C"
    patterns = [
        r'ANSWER\s*[:=]?\s*([A-E])',
        r'THE\s+ANSWER\s+IS\s+([A-E])',
        r'OPTION\s+([A-E])',
        r'\b([A-E])\b'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, response_text)
        if match:
            return match.group(1)
    
    # If no valid answer found
    return None

def generate_vqa_answer(model_name, image_path, question, options):
    """Generate VQA answer using Ollama API."""
    prompt = VQA_PROMPT_TEMPLATE.format(question=question, options=options)
    image_base64 = encode_image_to_base64(image_path)

    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        "images": [image_base64],
        "options": {
            "temperature": 0.0,  # Deterministic
            "top_p": 0.95,
            "max_tokens": 10,
            "stop": ["\n", ".", " "]
        }
    }

    start_time = time.time()
    try:
        response = requests.post(OLLAMA_URL, json=payload)
        inference_time = round(time.time() - start_time, 2)

        if response.status_code == 200:
            raw_output = response.json()['response'].strip()
            answer = extract_answer(raw_output)
            is_valid = answer is not None
            return {
                "answer": answer,
                "raw_output": raw_output,
                "inference_time": inference_time,
                "is_valid": is_valid
            }
        else:
            return {
                "answer": None,
                "raw_output": f"[Error] Status {response.status_code}",
                "inference_time": inference_time,
                "is_valid": False
            }
    except Exception as e:
        return {
            "answer": None,
            "raw_output": f"[Error] {e}",
            "inference_time": round(time.time() - start_time, 2),
            "is_valid": False
        }

def run_vqa_full(input_file="datasets/vqa/vqa.csv", images_dir="datasets/vqa/images", output_file="json_outputs/vqa_output_full.json"):
    """Process all questions from the VQA dataset."""
    try:
        df = pd.read_csv(input_file)
        print(f"[Info] Loaded {len(df)} questions from VQA dataset.")
    except FileNotFoundError:
        print(f"[Error] File '{input_file}' not found.")
        return
    except Exception as e:
        print(f"[Error] Failed to read CSV: {e}")
        return

    # Process ALL questions in the dataset
    df_full = df.copy()
    # Reset index to make question_id start from 1 for this subset
    df_full.reset_index(drop=True, inplace=True)
    print(f"[Info] Processing all {len(df_full)} questions in the dataset.")

    results = []
    
    for model in MODELS:
        print(f"\n{'='*60}\n[+] Running VQA with {model} (Full Dataset)\n{'='*60}")
        
        model_results = []
        total_valid = 0
        total_correct = 0
        total_inference_time = 0.0
        
        for i, row in df_full.iterrows():
            # Use the 'image' column as the image ID 
            image_id = str(row["image"])
            image_path = os.path.join(images_dir, image_id)
            
            # Check if the image file actually exists
            if not os.path.exists(image_path):
                print(f"[Error] Image file not found: {image_path}")
                continue
            
            # Get question, options, and correct answer
            question = str(row["question"])
            options = str(row["options"])
            correct_answer = str(row["answer"]).strip().upper()

            print(f"\n[{i+1:03d}] Processing question: {question[:50]}...")
            print(f"    Image path: {image_path}")
            print(f"    Correct answer: {correct_answer}")
            
            answer_data = generate_vqa_answer(model, image_path, question, options)
            
            result = {
                "model": model,
                "question_id": i + 1,  # Global question number (1 to N)
                "image_id": image_id,
                "image_path": image_path,
                "question": question,
                "options": options,
                "correct_answer": correct_answer,
                "predicted_answer": answer_data["answer"],
                "raw_output": answer_data["raw_output"],
                "inference_time": answer_data["inference_time"],
                "is_valid": answer_data["is_valid"]
            }
            model_results.append(result)
            
            if answer_data["is_valid"]:
                total_valid += 1
                total_inference_time += answer_data["inference_time"]
                if answer_data["answer"] == correct_answer:
                    total_correct += 1
                    is_correct = True
                else:
                    is_correct = False
                print(f"  -> Predicted: '{answer_data['answer']}' (True: {correct_answer}) - {'Correct' if is_correct else 'Incorrect'}")
            else:
                print(f"  -> Invalid prediction: '{answer_data['raw_output']}' (True: {correct_answer})")

            time.sleep(1)  # Small delay

        # Calculate evaluation metrics
        reject_rate = (len(model_results) - total_valid) / len(model_results) if len(model_results) > 0 else 0
        reject_rate = round(reject_rate, 4)
        
        accuracy = total_correct / total_valid if total_valid > 0 else 0
        accuracy = round(accuracy, 4)
        
        average_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
        average_inference_time = round(average_inference_time, 2)
        
        # Store results
        final_result = {
            "model": model,
            "subset_range": "1-N",
            "total_questions": len(df_full),
            "valid_answers": total_valid,
            "correct_answers": total_correct,
            "accuracy": accuracy,
            "reject_rate": reject_rate,
            "average_inference_time": average_inference_time,
            "predictions": model_results
        }
        results.append(final_result)
        
        # Print summary
        print(f"\n--- Results for {model} (Full Dataset) ---")
        print(f"Total Questions: {len(df_full)}")
        print(f"Valid Answers: {total_valid}/{len(df_full)}")
        print(f"Correct Answers: {total_correct}/{total_valid}")
        print(f"Accuracy (on valid predictions): {accuracy:.4f}")
        print(f"Reject Rate: {reject_rate:.4f}")
        print(f"Average Inference Time: {average_inference_time:.2f} seconds")

    # Save results
    os.makedirs("json_outputs", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n[Success] Results for full dataset saved to {output_file}")


if __name__ == "__main__":
    random.seed(42)
    run_vqa_full()

In [8]:
"""
Part 4.1 - Perplexity Calculator 
"""

import json
import os
from llama_cpp import Llama
import math

# --- Configuration ---
# Paths to the GGUF model files
# For phi3, use Ollama path
MODEL_PATH_PHI = r"C:\Users\PC\.ollama\models\blobs\sha256-633fc5be925f9a484b61d6f9b9a78021eeb462100bd557309f01ba84cac26adf"
# For qwen, use alternative model (not Ollama path)
MODEL_PATH_QWEN = r"C:\Users\PC\models\Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
MMPROJ_PATH_QWEN = r"C:\Users\PC\models\mmproj-Qwen2-VL-2B-Instruct-f32.gguf"

# Path to generated stories
STORIES_FILE = "json_outputs/asg_output.json"
# Output file for perplexity results
OUTPUT_FILE = "json_outputs/asg_output_with_perplexity.json"

def calculate_perplexity(model_path, text, n_ctx=2048, is_multimodal=False, mmproj_path=None):
    """
    Calculate perplexity for text using GGUF model.
    
    Args:
        model_path: Path to GGUF model file
        text: Text to evaluate
        n_ctx: Context window size
        is_multimodal: Whether model is multimodal
        mmproj_path: Path to mmproj file (for multimodal models)
        
    Returns:
        Perplexity value (lower is better), or inf if fails
    """
    try:
        # Load model with proper configuration
        if is_multimodal and mmproj_path:
            # For multimodal models, mmproj_path is essential
            llm = Llama(
                model_path=model_path,
                mmproj=mmproj_path,
                n_ctx=n_ctx,
                n_gpu_layers=-1,
                verbose=False,
                logits_all=True
            )
        else:
            # For text-only models
            llm = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_gpu_layers=-1,
                verbose=False,
                logits_all=True
            )
        
        # Tokenize text
        tokens = llm.tokenize(text.encode('utf-8'))
        if not tokens or len(tokens) < 2:
            return float('inf')
        
        total_log_likelihood = 0.0
        token_count = 0
        
        # Calculate perplexity for each token (limit to 50 tokens)
        for i in range(1, min(len(tokens), 50)):
            # Context = previous tokens
            context = tokens[:i]
            
            # Convert context tokens to text
            context_text = llm.detokenize(context).decode('utf-8', errors='ignore')
            
            # Get prediction with log probabilities
            completion = llm.create_completion(
                prompt=context_text,
                max_tokens=1,
                temperature=0.0,
                top_p=1.0,
                logprobs=1,
                echo=False,
                stream=False
            )
            
            # Extract log probability of actual token
            if 'choices' in completion and len(completion['choices']) > 0:
                choice = completion['choices'][0]
                if 'logprobs' in choice and 'token_logprobs' in choice['logprobs']:
                    if len(choice['logprobs']['token_logprobs']) > 0:
                        logprob = choice['logprobs']['token_logprobs'][0]
                        if not math.isnan(logprob) and logprob != -float('inf'):
                            total_log_likelihood += logprob
                            token_count += 1
        
        if token_count == 0:
            return float('inf')
        
        # Calculate perplexity
        avg_neg_log_likelihood = -total_log_likelihood / token_count
        perplexity = math.exp(avg_neg_log_likelihood)
        return perplexity
    
    except Exception as e:
        print(f"[ERROR] {e}")
        return float('inf')

# Main execution
if __name__ == "__main__":
    # Load generated stories
    with open(STORIES_FILE, "r", encoding="utf-8") as f:
        stories_data = json.load(f)
    
    results_with_ppl = []
    
    for item in stories_data:
        story_text = item.get("story", "")
        story_id = item.get("story_id", "unknown")
        model_name_used = item.get("model", "unknown")
        
        if not story_text:
            continue
            
        # Determine which model path to use
        model_path = None
        mmproj_path = None
        is_multimodal = False
        evaluator_model_name = "N/A"
        
        if "phi" in model_name_used.lower():
            model_path = MODEL_PATH_PHI
            evaluator_model_name = "phi3:3.8b (GGUF)"
        elif "qwen" in model_name_used.lower():
            model_path = MODEL_PATH_QWEN
            mmproj_path = MMPROJ_PATH_QWEN
            is_multimodal = True
            evaluator_model_name = "qwen2.5vl:3b (GGUF)"
        
        if model_path:
            print(f"\nCalculating perplexity for story {story_id} using {evaluator_model_name}...")
            ppl = calculate_perplexity(
                model_path, 
                story_text,
                is_multimodal=is_multimodal,
                mmproj_path=mmproj_path
            )
            
            item_with_ppl = item.copy()
            item_with_ppl["evaluator_model"] = evaluator_model_name
            item_with_ppl["perplexity"] = round(ppl, 2) if not math.isinf(ppl) else float('inf')
            results_with_ppl.append(item_with_ppl)
            print(f"Perplexity for story {story_id}: {item_with_ppl['perplexity']}")
    
    # Save results
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results_with_ppl, f, ensure_ascii=False, indent=2)
    
    # Calculate average perplexity per model
    model_perplexities = {}
    for item in results_with_ppl:
        model_eval = item.get("evaluator_model", "N/A")
        ppl = item.get("perplexity", float('inf'))
        if model_eval != "N/A" and not math.isinf(ppl):
            if model_eval not in model_perplexities:
                model_perplexities[model_eval] = []
            model_perplexities[model_eval].append(ppl)
    
    print("\n--- Average Perplexity per Model ---")
    for model_eval, ppls in model_perplexities.items():
        if ppls:
            avg_ppl = sum(ppls) / len(ppls)
            print(f"Model {model_eval}: Average Perplexity = {avg_ppl:.2f}")


Calculating perplexity for story 1 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 1: 2.2

Calculating perplexity for story 2 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 2: 2.63

Calculating perplexity for story 3 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 3: 1.68

Calculating perplexity for story 4 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 4: 2.28

Calculating perplexity for story 5 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 5: 2.15

Calculating perplexity for story 6 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 6: 2.86

Calculating perplexity for story 7 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 7: 2.87

Calculating perplexity for story 8 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 8: 2.18

Calculating perplexity for story 9 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 9: 2.38

Calculating perplexity for story 10 using phi3:3.8b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 10: 2.42

Calculating perplexity for story 1 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 1: 2.4

Calculating perplexity for story 2 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 2: 3.28

Calculating perplexity for story 3 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 3: 3.05

Calculating perplexity for story 4 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 4: 2.79

Calculating perplexity for story 5 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 5: 3.03

Calculating perplexity for story 6 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 6: 4.74

Calculating perplexity for story 7 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 7: 2.91

Calculating perplexity for story 8 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 8: 3.05

Calculating perplexity for story 9 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 9: 2.96

Calculating perplexity for story 10 using qwen2.5vl:3b (GGUF)...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Perplexity for story 10: 2.67

--- Average Perplexity per Model ---
Model phi3:3.8b (GGUF): Average Perplexity = 2.36
Model qwen2.5vl:3b (GGUF): Average Perplexity = 3.09


In [1]:
"""
Part 4.2 - ROUGE-1 F1 Evaluation for Abstractive Text Summarization
"""

import json
import os
import re
from rouge import Rouge
import nltk
from nltk.tokenize import word_tokenize
import math

# --- Configuration ---
# Path to the summarization output
SUMMARIES_FILE = "json_outputs/ats_output.json"
# Output file for ROUGE results
OUTPUT_FILE = "json_outputs/ats_rouge_results.json"

def preprocess_text(text):
    """
    Preprocess text for ROUGE calculation by:
    - Converting to lowercase
    - Removing punctuation
    - Tokenizing
    
    Args:
        text (str): Text to preprocess
        
    Returns:
        str: Preprocessed text
    """
    if not text or not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize and rejoin
    tokens = word_tokenize(text)
    return " ".join(tokens)

def calculate_rouge_scores(hypotheses, references):
    """
    Calculate ROUGE scores between hypotheses and references.
    
    Args:
        hypotheses (list): List of generated summaries
        references (list): List of original stories
        
    Returns:
        dict: ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L)
    """
    # Initialize ROUGE evaluator
    rouge = Rouge()
    
    # Calculate scores
    scores = rouge.get_scores(hypotheses, references, avg=True)
    
    return scores

def calculate_rouge1_f1_for_summary(summary, original_story):
    """
    Calculate ROUGE-1 F1 score for a single summary.
    
    Args:
        summary (str): Generated summary
        original_story (str): Original story text
        
    Returns:
        float: ROUGE-1 F1 score
    """
    if not summary or not original_story:
        return 0.0
    
    # Preprocess texts
    summary_clean = preprocess_text(summary)
    story_clean = preprocess_text(original_story)
    
    # Calculate ROUGE scores
    rouge = Rouge()
    scores = rouge.get_scores(summary_clean, story_clean)
    
    # Return ROUGE-1 F1 score
    return scores[0]['rouge-1']['f']

def main():
    """Main function to evaluate summarization results with ROUGE-1 F1"""
    # Ensure NLTK data is available
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    
    # Load summarization results
    if not os.path.exists(SUMMARIES_FILE):
        print(f"[ERROR] Summarization results file not found at {SUMMARIES_FILE}")
        print("Please run Part 3.2 (ATS) first to generate summaries.")
        return
    
    with open(SUMMARIES_FILE, "r", encoding="utf-8") as f:
        summaries_data = json.load(f)
    
    results_with_rouge = []
    
    # Process each summary
    for item in summaries_data:
        summary = item.get("summary", "")
        original_story = item.get("original_story", "")
        model_name = item.get("original_model", "unknown")
        story_id = item.get("story_id", "unknown")
        
        if not summary or not original_story:
            print(f"[WARNING] Missing summary or story for {story_id}. Skipping ROUGE calculation.")
            rouge1_f1 = 0.0
            is_valid = False
        else:
            print(f"\nCalculating ROUGE-1 F1 for story {story_id} using {model_name}...")
            rouge1_f1 = calculate_rouge1_f1_for_summary(summary, original_story)
            is_valid = True
            print(f"  -> ROUGE-1 F1: {rouge1_f1:.4f}")
        
        # Create result entry
        result = item.copy()
        result["rouge1_f1"] = round(rouge1_f1, 4)
        result["is_valid"] = is_valid
        results_with_rouge.append(result)
    
    # Save results
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results_with_rouge, f, ensure_ascii=False, indent=2)
    
    print(f"\n[SUCCESS] ROUGE evaluation results saved to {OUTPUT_FILE}")
    
    # Calculate average ROUGE-1 F1 per model
    model_scores = {}
    for item in results_with_rouge:
        model = item.get("original_model", "N/A")
        score = item.get("rouge1_f1", 0.0)
        if model != "N/A" and item.get("is_valid", False):
            if model not in model_scores:
                model_scores[model] = []
            model_scores[model].append(score)
    
    print("\n--- Average ROUGE-1 F1 per Model ---")
    for model, scores in model_scores.items():
        if scores:
            avg_score = sum(scores) / len(scores)
            print(f"  Model {model}: Average ROUGE-1 F1 = {avg_score:.4f}")
        else:
            print(f"  Model {model}: No valid ROUGE scores calculated.")

if __name__ == "__main__":
    main()


Calculating ROUGE-1 F1 for story 1 using phi3:3.8b...
  -> ROUGE-1 F1: 0.0842

Calculating ROUGE-1 F1 for story 2 using phi3:3.8b...
  -> ROUGE-1 F1: 0.2184

Calculating ROUGE-1 F1 for story 3 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1707

Calculating ROUGE-1 F1 for story 4 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1834

Calculating ROUGE-1 F1 for story 5 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1417

Calculating ROUGE-1 F1 for story 6 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1453

Calculating ROUGE-1 F1 for story 7 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1582

Calculating ROUGE-1 F1 for story 8 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1841

Calculating ROUGE-1 F1 for story 9 using phi3:3.8b...
  -> ROUGE-1 F1: 0.1139

Calculating ROUGE-1 F1 for story 10 using phi3:3.8b...
  -> ROUGE-1 F1: 0.3954

Calculating ROUGE-1 F1 for story 1 using qwen2.5vl:3b...
  -> ROUGE-1 F1: 0.1881

Calculating ROUGE-1 F1 for story 2 using qwen2.5vl:3b...
  -> ROUGE-1 F1: 0.1308

Calculating ROUGE-1 F1 for story 3 using qwe

In [6]:
"""
Part 4.3 - Classification Accuracy Evaluation for Natural Language Inference
"""

import json
import os

# --- Configuration ---
# Path to the NLI prediction outputs
NLI_OUTPUT_PHI = "json_outputs/nli_output_phi3.json"
NLI_OUTPUT_QWEN = "json_outputs/nli_output_qwen.json"
# Output file for accuracy results
OUTPUT_FILE = "json_outputs/nli_accuracy_results.json"

def main():
    """Main function to evaluate NLI classification accuracy"""
    results = {}
    
    # Process phi3:3.8b results
    if os.path.exists(NLI_OUTPUT_PHI):
        with open(NLI_OUTPUT_PHI, "r", encoding="utf-8") as f:
            phi3_data = json.load(f)
        
        # phi3_data is a list with one item (the results for phi3)
        if len(phi3_data) > 0:
            phi3_result = phi3_data[0]
            accuracy = phi3_result["accuracy"]
            correct_predictions = phi3_result["correct_predictions"]
            total_predictions = phi3_result["total_items"]
            
            results["phi3:3.8b"] = {
                "accuracy": accuracy,
                "correct_predictions": correct_predictions,
                "total_predictions": total_predictions
            }
    
    # Process qwen2.5vl:3b results
    if os.path.exists(NLI_OUTPUT_QWEN):
        with open(NLI_OUTPUT_QWEN, "r", encoding="utf-8") as f:
            qwen_data = json.load(f)
        
        # qwen_data is a list with one item (the results for qwen)
        if len(qwen_data) > 0:
            qwen_result = qwen_data[0]
            accuracy = qwen_result["accuracy"]
            correct_predictions = qwen_result["correct_predictions"]
            total_predictions = qwen_result["total_items"]
            
            results["qwen2.5vl:3b"] = {
                "accuracy": accuracy,
                "correct_predictions": correct_predictions,
                "total_predictions": total_predictions
            }
    
    # Save results
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    # Print summary
    print("\n--- NLI Classification Accuracy Results ---")
    for model, metrics in results.items():
        print(f"\nModel: {model}")
        print(f"Accuracy: {metrics['accuracy']:.4f} ({metrics['correct_predictions']}/{metrics['total_predictions']})")

if __name__ == "__main__":
    main()


--- NLI Classification Accuracy Results ---

Model: phi3:3.8b
Accuracy: 0.7900 (79/100)

Model: qwen2.5vl:3b
Accuracy: 0.8300 (83/100)


In [7]:
"""
Part 4.4 - CIDEr Evaluation for Image Captioning
"""

import json
import os

# --- Configuration ---
# Path to the combined image captioning output (from the merge script)
IMAGE_CAPTIONS_FILE = "json_outputs/ic_output.json"
# Output file for CIDEr results (simplified format for Part 4.4)
OUTPUT_FILE = "json_outputs/ic_cider_results.json"

def main():
    """Main function to extract and present CIDEr results from Part 3.4 output"""
    # Check if the combined results file exists
    if not os.path.exists(IMAGE_CAPTIONS_FILE):
        print(f"[ERROR] Combined image captioning results file not found at {IMAGE_CAPTIONS_FILE}")
        print("Please run the merge script first to combine all subsets.")
        print("Expected file structure: json_outputs/ic_output.json")
        return
    
    # Load the combined results
    with open(IMAGE_CAPTIONS_FILE, "r", encoding="utf-8") as f:
        caption_data = json.load(f)
    
    # Extract CIDEr results
    results = {}
    
    # Assuming there's only one model in the results (qwen2.5vl:3b)
    if caption_data and len(caption_data) > 0:
        model_data = caption_data[0]
        model_name = model_data["model"]
        
        results[model_name] = {
            "cider_score": model_data["cider_score"],
            "total_images": model_data["total_images"],
            "valid_captions": model_data["valid_captions"],
            "reject_rate": model_data["reject_rate"],
            "average_inference_time": model_data["average_inference_time"]
        }
    
    # Save simplified results
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    
    # Print summary
    print("\n--- CIDEr Evaluation Results for Image Captioning ---")
    for model, metrics in results.items():
        print(f"\nModel: {model}")
        print(f"CIDEr Score: {metrics['cider_score']:.4f}")
        print(f"Valid Captions: {metrics['valid_captions']}/{metrics['total_images']}")
        print(f"Reject Rate: {metrics['reject_rate']:.4f}")
        print(f"Average Inference Time: {metrics['average_inference_time']:.2f} seconds")

if __name__ == "__main__":
    main()


--- CIDEr Evaluation Results for Image Captioning ---

Model: qwen2.5vl:3b
CIDEr Score: 0.6318
Valid Captions: 100/100
Reject Rate: 0.0000
Average Inference Time: 146.74 seconds


In [2]:
"""
Part 4.5 - Visual question answering
"""

import json
import os

def load_vqa_results(file_path):
    """Load VQA results from a JSON file."""
    if not os.path.exists(file_path):
        print(f"[Error] File not found: {file_path}")
        return None
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return data[0]  # Return the first (and only) model's results
    except Exception as e:
        print(f"[Error] Failed to load {file_path}: {e}")
        return None

def calculate_combined_metrics(results_list):
    """Calculate combined metrics for multiple result sets."""
    total_questions = 0
    total_valid = 0
    total_correct = 0
    total_inference_time = 0.0
    
    for results in results_list:
        total_questions += results["total_questions"]
        total_valid += results["valid_answers"]
        total_correct += results["correct_answers"]
        total_inference_time += results["average_inference_time"] * results["valid_answers"]
    
    # Calculate combined metrics
    combined_accuracy = total_correct / total_valid if total_valid > 0 else 0
    combined_reject_rate = (total_questions - total_valid) / total_questions if total_questions > 0 else 0
    combined_avg_inference_time = total_inference_time / total_valid if total_valid > 0 else 0
    
    return {
        "model": "qwen2.5vl:3b",
        "subset_range": "11-40",
        "total_questions": total_questions,
        "valid_answers": total_valid,
        "correct_answers": total_correct,
        "accuracy": combined_accuracy,
        "reject_rate": combined_reject_rate,
        "average_inference_time": combined_avg_inference_time
    }

def print_table(metrics):
    """Print metrics in a table format with headers."""
    # Print header
    print("{:<15} {:<15} {:<15} {:<15} {:<15} {:<15}".format(
        "Total Qs", "Valid Ans", "Correct Ans", "Accuracy", "Reject Rate", "Avg Time (s)"
    ))
    print("-" * 90)
    
    # Print values
    print("{:<15} {:<15} {:<15} {:<15.4f} {:<15.4f} {:<15.2f}".format(
        metrics["total_questions"],
        metrics["valid_answers"],
        metrics["correct_answers"],
        metrics["accuracy"],
        metrics["reject_rate"],
        metrics["average_inference_time"]
    ))

def save_combined_results(output_file, combined_metrics):
    """Save combined results to a JSON file."""
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    # Prepare the result structure
    result = {
        "model": combined_metrics["model"],
        "subset_range": combined_metrics["subset_range"],
        "total_questions": combined_metrics["total_questions"],
        "valid_answers": combined_metrics["valid_answers"],
        "correct_answers": combined_metrics["correct_answers"],
        "accuracy": combined_metrics["accuracy"],
        "reject_rate": combined_metrics["reject_rate"],
        "average_inference_time": combined_metrics["average_inference_time"]
    }
    
    # Save to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    print(f"\n[Success] Combined results saved to {output_file}")

def main():
    # List of JSON files to process (based on your directory structure)
    json_files = [
        "json_outputs/vqa_outputs/vqa_output_subset_11_20.json",
        "json_outputs/vqa_outputs/vqa_output_subset_21_30.json",
        "json_outputs/vqa_outputs/vqa_output_subset_31_40.json",
        "json_outputs/vqa_outputs/vqa_output_subset_70_80.json"
    ]
    
    # Load results from each file
    results_list = []
    for file in json_files:
        results = load_vqa_results(file)
        if results:
            results_list.append(results)
    
    # Calculate and print table
    if results_list:
        combined_metrics = calculate_combined_metrics(results_list)
        
        print("\n" + "=" * 90)
        print("VQA Evaluation Metrics (Questions 11-40)")
        print("=" * 90)
        print_table(combined_metrics)
        print("=" * 90)
        
        # Save combined results to a new JSON file
        save_combined_results("json_outputs/results/vqa_results.json", combined_metrics)
    else:
        print("[Error] No valid results to combine.")

if __name__ == "__main__":
    main()


VQA Evaluation Metrics (Questions 11-40)
Total Qs        Valid Ans       Correct Ans     Accuracy        Reject Rate     Avg Time (s)   
------------------------------------------------------------------------------------------
40              40              14              0.3500          0.0000          402.93         

[Success] Combined results saved to json_outputs/results/vqa_results.json
