# Result Analysis for VQA4Mix

This notebook analyzes the results of the VQA4Mix project, including:
- Loading and examining results from different categories
- Calculating accuracy metrics
- Visualizing results
- Comparing performance across categories and difficulty levels
- Comparing standard vs. augmented results and analyzing error patterns

## Import Required Libraries

In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the project root to the Python path
sys.path.append('..')

# Import project modules
from src.data_processing.data_loader import load_json_data, load_annotation_data
from src.utils.evaluation import calculate_accuracy, calculate_accuracy_by_difficulty, generate_confusion_matrix
from src.visualization.plotting import plot_confusion_matrix, plot_accuracy_by_category, plot_model_comparison

# Never truncate long cell contents when DataFrames are displayed
pd.options.display.max_colwidth = None

## Configuration

In [None]:
# Standard (non-augmented) result file for each category
RESULT_PATHS = dict(
    food='../data/food/food_annotation_with_MCQ_result_3_difficulties.json',
    painting='../data/painting/paintings_with_MCQ_3diff_result.json',
    people='../data/people/people_annotation_with_MCQ_result_3_difficulties.json',
    cat='../data/cat/upking_annotation_with_MCQ_result_3_difficulties.json',
)

# Augmented result file for each category (same keys as RESULT_PATHS)
AUGMENTED_RESULT_PATHS = dict(
    food='../data/food/food_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json',
    painting='../data/painting/paintings_with_MCQ_3diff_result_augmentation.json',
    people='../data/people/people_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json',
    cat='../data/cat/upking_annotation_with_MCQ_result_3_difficulties_with_image_augmentation.json',
)

# Category to analyze: 'food', 'painting', 'people', 'cat', or None to analyze every category
CATEGORY = None

## Load Results

In [None]:
def load_result_data(category, augmented=False):
    """Load the result file of one category, reporting progress on stdout.

    Args:
        category: Key into ``RESULT_PATHS`` / ``AUGMENTED_RESULT_PATHS``.
        augmented: When truthy, the path is taken from
            ``AUGMENTED_RESULT_PATHS``; otherwise from ``RESULT_PATHS``.

    Returns:
        Whatever ``load_annotation_data`` returns for the selected file, or
        ``None`` when loading raised an exception (the error is printed and
        not re-raised).

    Raises:
        KeyError: If ``category`` is not a key of the selected path table
            (the lookup happens outside the ``try`` block).
    """
    if augmented:
        file_path = AUGMENTED_RESULT_PATHS[category]
    else:
        file_path = RESULT_PATHS[category]
    print(f"Loading {'augmented ' if augmented else ''}results for {category} from {file_path}")

    try:
        df = load_annotation_data(file_path)
        # Kept inside the try so a failing len() is reported like a load error
        print(f"Loaded {len(df)} records for {category}")
    except Exception as e:
        print(f"Error loading results for {category}: {e}")
        return None
    return df

# Decide once which categories to load: the configured one, or every known category
categories_to_load = [CATEGORY] if CATEGORY is not None else list(RESULT_PATHS.keys())

# Standard and augmented results, keyed by category (value is None when loading failed)
result_data = {}
augmented_result_data = {}

for category in categories_to_load:
    result_data[category] = load_result_data(category, augmented=False)
    augmented_result_data[category] = load_result_data(category, augmented=True)

## Examine Result Structure

In [None]:
# Show the column names and the first record of every category that loaded successfully
for category, df in result_data.items():
    if df is None:
        continue
    print(f"\n{category.upper()} RESULT STRUCTURE:")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Sample row:")
    display(df.head(1))

## Calculate Accuracy Metrics

In [None]:
def calculate_metrics(df, category, augmented=False):
    """Compute and print the accuracy of each difficulty level for one category.

    Args:
        df: DataFrame holding a ``multiple_choice_solution`` column and any of
            the ``multiple_choice_prediction_<level>`` columns.
        category: Category name, used only in the progress message.
        augmented: Selects the word 'augmented' or 'standard' in the progress
            message; it does not affect the computation.

    Returns:
        dict mapping 'easy' / 'medium' / 'hard' to the value returned by
        ``calculate_accuracy``. Levels whose prediction column is absent from
        ``df`` are left out.
    """
    print(f"Calculating metrics for {category} ({'augmented' if augmented else 'standard'})...")

    accuracies = {}
    for level in ('easy', 'medium', 'hard'):
        # Prediction columns follow the pattern multiple_choice_prediction_<level>
        col = f'multiple_choice_prediction_{level}'
        if col not in df.columns:
            continue
        accuracy = calculate_accuracy(df[col], df['multiple_choice_solution'])
        accuracies[level] = accuracy
        print(f"{level.capitalize()} accuracy: {accuracy:.2%}")

    return accuracies

# Per-category accuracies for the standard results; categories that failed to load are skipped
metrics = {
    category: calculate_metrics(df, category, augmented=False)
    for category, df in result_data.items()
    if df is not None
}

# Same computation for the augmented results
augmented_metrics = {
    category: calculate_metrics(df, category, augmented=True)
    for category, df in augmented_result_data.items()
    if df is not None
}

## Visualize Results by Category

In [None]:
def visualize_results_by_category(metrics, title_prefix=""):
    """Draw one accuracy-by-category chart per difficulty level.

    Args:
        metrics: dict mapping category name to a dict of
            difficulty level -> accuracy.
        title_prefix: Text placed in front of every chart title.

    Returns:
        None. Prints a message and returns early when ``metrics`` is empty;
        a level that no category reports produces no chart.
    """
    if not metrics:
        print("No metrics to visualize.")
        return

    for level in ['easy', 'medium', 'hard']:
        # Keep only the categories that actually report this level
        category_accuracies = {
            category: accuracies[level]
            for category, accuracies in metrics.items()
            if level in accuracies
        }
        if not category_accuracies:
            continue

        title = f"{title_prefix}{level.capitalize()} Difficulty - Accuracy by Category"
        plot_accuracy_by_category(
            list(category_accuracies.values()),
            list(category_accuracies.keys()),
            title=title,
        )
        plt.show()

# Charts for the standard results first, then for the augmented results
for prefix, category_metrics in (("Standard - ", metrics), ("Augmented - ", augmented_metrics)):
    visualize_results_by_category(category_metrics, title_prefix=prefix)

## Visualize Results by Difficulty Level

In [None]:
def visualize_results_by_difficulty(metrics, augmented_metrics):
    """Draw, per category, standard and augmented accuracy for each difficulty level.

    Args:
        metrics: dict mapping category -> {level: accuracy} for standard results.
        augmented_metrics: Same structure for the augmented results.

    Returns:
        None. Prints a message and returns early when either mapping is empty.
        Categories missing from ``augmented_metrics`` are skipped.
    """
    if not (metrics and augmented_metrics):
        print("Insufficient metrics to visualize.")
        return

    for category, std_accuracies in metrics.items():
        if category not in augmented_metrics:
            continue
        aug_accuracies = augmented_metrics[category]

        # Insertion order sets bar order: standard then augmented within each level
        combined_accuracies = {}
        for level in ['easy', 'medium', 'hard']:
            if level in std_accuracies:
                combined_accuracies[f"Standard - {level.capitalize()}"] = std_accuracies[level]
            if level in aug_accuracies:
                combined_accuracies[f"Augmented - {level.capitalize()}"] = aug_accuracies[level]

        if not combined_accuracies:
            continue

        title = f"{category.capitalize()} - Accuracy by Difficulty Level and Augmentation"
        plot_model_comparison(combined_accuracies, title=title)
        plt.show()

# Per-category charts comparing standard and augmented accuracy at each difficulty level
visualize_results_by_difficulty(metrics=metrics, augmented_metrics=augmented_metrics)

## Generate Confusion Matrices

In [None]:
def generate_and_plot_confusion_matrices(df, category, augmented=False):
    """Build and show a confusion matrix for each difficulty level of one category.

    Args:
        df: DataFrame holding a ``multiple_choice_solution`` column and any of
            the ``multiple_choice_prediction_<level>`` columns.
        category: Category name, used in the progress message and plot titles.
        augmented: Selects the 'Augmented' / 'Standard' wording in the message
            and titles; it does not affect the computation.

    Returns:
        None. Levels whose prediction column is absent from ``df`` are skipped.
    """
    print(f"Generating confusion matrices for {category} ({'augmented' if augmented else 'standard'})...")

    for level in ('easy', 'medium', 'hard'):
        # Prediction columns follow the pattern multiple_choice_prediction_<level>
        col = f'multiple_choice_prediction_{level}'
        if col not in df.columns:
            continue

        confusion_mat = generate_confusion_matrix(df[col], df['multiple_choice_solution'])

        title = f"{category.capitalize()} - {level.capitalize()} Difficulty ({'Augmented' if augmented else 'Standard'})"
        plot_confusion_matrix(confusion_mat, title=title)
        plt.show()

# Confusion matrices for every loaded category: all standard results first, then all augmented ones
for frames, is_augmented in ((result_data, False), (augmented_result_data, True)):
    for category, df in frames.items():
        if df is None:
            continue
        generate_and_plot_confusion_matrices(df, category, augmented=is_augmented)

## Compare Standard vs. Augmented Results

In [None]:
def compare_standard_vs_augmented(metrics, augmented_metrics):
    """Tabulate and chart standard vs. augmented accuracy for each difficulty level.

    For every level, a table with one row per category and the columns
    'Standard', 'Augmented', 'Difference' (augmented minus standard) and
    'Difference %' (the difference times 100) is displayed, followed by a bar
    chart of the first two columns.

    Args:
        metrics: dict mapping category -> {level: accuracy} for standard results.
        augmented_metrics: Same structure for the augmented results.

    Returns:
        None. Prints a message and returns early when either mapping is empty.
        A category is included for a level only if both mappings report it.
    """
    if not (metrics and augmented_metrics):
        print("Insufficient metrics to compare.")
        return

    for level in ['easy', 'medium', 'hard']:
        comparison = {}
        for category, std_levels in metrics.items():
            if category not in augmented_metrics:
                continue
            aug_levels = augmented_metrics[category]
            if level not in std_levels or level not in aug_levels:
                continue
            std_acc = std_levels[level]
            aug_acc = aug_levels[level]
            comparison[category] = {
                'Standard': std_acc,
                'Augmented': aug_acc,
                'Difference': aug_acc - std_acc,
            }

        if not comparison:
            continue

        # Transpose so categories become rows and the three measures become columns
        df_comparison = pd.DataFrame(comparison).T.assign(
            **{'Difference %': lambda frame: frame['Difference'] * 100}
        )

        print(f"\nComparison for {level.capitalize()} Difficulty:")
        display(df_comparison)

        plt.figure(figsize=(10, 6))
        df_comparison[['Standard', 'Augmented']].plot.bar(ax=plt.gca())
        plt.title(f"{level.capitalize()} Difficulty - Standard vs. Augmented")
        plt.ylabel('Accuracy')
        plt.ylim(0, 1.1)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

# Tables and bar charts of standard vs. augmented accuracy, one per difficulty level
compare_standard_vs_augmented(metrics=metrics, augmented_metrics=augmented_metrics)

## Analyze Error Patterns

In [None]:
def analyze_error_patterns(df, category, level):
    """Summarize and display the wrong predictions for one category and difficulty level.

    Prints the number of rows whose prediction differs from the solution,
    displays a table and a heatmap counting each (true answer, predicted
    answer) pair among those rows, and displays up to three example rows.

    Args:
        df: DataFrame expected to hold a ``multiple_choice_solution`` column
            and a ``multiple_choice_prediction_<level>`` column.
        category: Category name, used only in messages and the plot title.
        level: Difficulty level; selects the prediction column to inspect.

    Returns:
        None. Everything is printed or displayed. The function returns early
        (after printing a message) when a required column is missing or when
        no row is wrong.
    """
    print(f"Analyzing error patterns for {category} - {level} difficulty...")

    solution_col = 'multiple_choice_solution'
    prediction_col = f'multiple_choice_prediction_{level}'

    if prediction_col not in df.columns:
        print(f"Prediction column '{prediction_col}' not found.")
        return

    # Report a missing solution column the same way as a missing prediction
    # column instead of failing with a bare KeyError in the comparison below
    if solution_col not in df.columns:
        print(f"Solution column '{solution_col}' not found.")
        return

    # Rows where the prediction differs from the solution
    df_errors = df[df[prediction_col] != df[solution_col]].copy()

    if len(df_errors) == 0:
        print("No errors found.")
        return

    print(f"Found {len(df_errors)} errors out of {len(df)} samples ({len(df_errors)/len(df):.2%}).")

    # Count each (true answer, predicted answer) pair; rows = true, columns = predicted
    error_distribution = df_errors.groupby([solution_col, prediction_col]).size().unstack(fill_value=0)
    print("\nError distribution:")
    display(error_distribution)

    if error_distribution.empty:
        # groupby drops rows whose key is missing by default, so mismatches
        # caused only by missing values leave an empty table, which cannot be
        # drawn as a heatmap
        print("Error distribution is empty (mismatched rows contain missing values); skipping heatmap.")
    else:
        plt.figure(figsize=(8, 6))
        sns.heatmap(error_distribution, annot=True, fmt='d', cmap='Blues')
        plt.title(f"{category.capitalize()} - {level.capitalize()} Difficulty - Error Distribution")
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.tight_layout()
        plt.show()

    # Show (not return) a few of the wrong rows for manual inspection
    print("\nExamples of errors:")
    display(df_errors.head(3))

# Error analysis of the standard results: every loaded category at every difficulty level
for category, df in result_data.items():
    if df is None:
        continue
    for level in ('easy', 'medium', 'hard'):
        analyze_error_patterns(df, category, level)

## Conclusion

This notebook has analyzed the results of the VQA4Mix project, including calculating accuracy metrics, visualizing results, and comparing performance across categories and difficulty levels. The analysis provides insights into the performance of the model on different types of images and the impact of image augmentation.