# Data Preprocessing for VQA4Mix

This notebook handles data preprocessing tasks for the VQA4Mix project, including:
- Loading and examining data from different categories
- Standardizing data formats
- Cleaning and preprocessing data
- Saving processed data for further use

## Import Required Libraries

In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re

# Add the project root to the Python path so the `src` package imported below
# can be resolved. NOTE(review): '..' is resolved against the current working
# directory, so this assumes the notebook runs one level below the project
# root - confirm.
sys.path.append('..')

# Import project modules (must stay after the sys.path change above)
from src.data_processing.data_loader import load_json_data, load_annotation_data, save_json_data, convert_df_to_json

# Set pandas display options: a max_colwidth of None lifts the column-width
# limit so long cell values are printed in full instead of being truncated.
pd.set_option('display.max_colwidth', None)

## Configuration

In [None]:
# Base name (without extension) of each category's annotation file.
# Every category keeps its files under ../data/<category>/.
_ANNOTATION_STEMS = {
    'food': 'food_annotation',
    'painting': 'paintings',
    'people': 'people_data',
    'cat': 'upking_data',
}

# Input annotation file for each category
DATA_PATHS = {
    name: f'../data/{name}/{stem}.json'
    for name, stem in _ANNOTATION_STEMS.items()
}

# Output file for each category: the input file name plus a "_modified" suffix
OUTPUT_PATHS = {
    name: f'../data/{name}/{stem}_modified.json'
    for name, stem in _ANNOTATION_STEMS.items()
}

# Category to process; None means every category listed in DATA_PATHS
CATEGORY = None  # Options: 'food', 'painting', 'people', 'cat', or None for all

## Load Dataset

In [None]:
def load_category_data(category):
    """Load one category's annotation file into a DataFrame.

    The file path is looked up in ``DATA_PATHS``. Two strategies are tried in
    order, and progress or errors are reported with ``print``:

    1. ``load_annotation_data(path)``, whose result is returned as-is.
    2. ``load_json_data(path)``; if that yields a list, it is wrapped in a
       ``pd.DataFrame``.

    Args:
        category: Key into ``DATA_PATHS``. An unknown key raises ``KeyError``.

    Returns:
        The loaded DataFrame, or ``None`` when both strategies fail or the
        raw JSON is not a list.
    """
    source = DATA_PATHS[category]
    print(f"Loading {category} data from {source}")

    # First attempt: the project's annotation loader.
    try:
        frame = load_annotation_data(source)
        print(f"Loaded {len(frame)} records for {category}")
        return frame
    except Exception as primary_error:
        print(f"Error loading {category} data as DataFrame: {primary_error}")

    # Fallback: read the raw JSON and build the DataFrame ourselves.
    try:
        raw = load_json_data(source)
        if not isinstance(raw, list):
            print(f"Unexpected data format for {category}")
            return None
        frame = pd.DataFrame(raw)
        print(f"Loaded {len(frame)} records for {category} from list")
        return frame
    except Exception as fallback_error:
        print(f"Error loading {category} data as raw JSON: {fallback_error}")
        return None

# Load either the single configured category or every known one
selected_categories = DATA_PATHS.keys() if CATEGORY is None else [CATEGORY]
category_data = {}
for category in selected_categories:
    category_data[category] = load_category_data(category)

## Examine Data Structure

In [None]:
# Preview every category that loaded: its column names and its first record
for category, df in category_data.items():
    if df is None:
        continue  # loading failed for this category, nothing to show
    header = f"\n{category.upper()} DATA STRUCTURE:"
    print(header)
    print(f"Columns: {df.columns.tolist()}")
    print("Sample row:")
    first_row = df.head(1)
    display(first_row)

## Standardize Data Format

In [None]:
def standardize_data_format(df, category):
    """Return a copy of ``df`` with standard ``id``, ``file_path`` and ``captions`` columns.

    Standard columns that are missing are derived from known alternatives:

    * ``id``: copied from ``image_id``, otherwise a 0-based row counter.
    * ``file_path``: copied from ``img_url`` or ``image_path`` (in that
      order), otherwise ``../data/<category>/<id>.jpg``.
    * ``captions``: taken from ``reference_caption`` or ``caption`` (in that
      order), otherwise an empty list for every row.

    Each ``captions`` value is then normalised: a bare string becomes a
    one-element list and a missing value (``None`` or NaN) becomes an empty
    list, matching what rows get when no caption column exists at all. Any
    other value (e.g. an existing list) is kept unchanged.

    Args:
        df: Input DataFrame for one category. It is not modified.
        category: Category name, used for the progress message and for the
            default file path.

    Returns:
        A new DataFrame whose first three columns are ``id``, ``file_path``
        and ``captions``, followed by the remaining columns in their existing
        order, excluding columns whose name starts with ``_``.
    """
    print(f"Standardizing data format for {category}...")

    def as_caption_list(value):
        """Coerce one captions cell to list form."""
        if isinstance(value, str):
            return [value]
        # None and float NaN are how absent cells show up in a DataFrame;
        # ``value != value`` is true only for NaN.
        if value is None or (isinstance(value, float) and value != value):
            return []
        return value

    # Work on a copy so the caller's DataFrame stays untouched
    df_std = df.copy()

    # Ensure 'id' column exists
    if 'id' not in df_std.columns:
        if 'image_id' in df_std.columns:
            df_std['id'] = df_std['image_id']
        else:
            df_std['id'] = range(len(df_std))

    # Ensure 'file_path' column exists
    if 'file_path' not in df_std.columns:
        if 'img_url' in df_std.columns:
            df_std['file_path'] = df_std['img_url']
        elif 'image_path' in df_std.columns:
            df_std['file_path'] = df_std['image_path']
        else:
            # Create a default file path based on category and id
            df_std['file_path'] = df_std['id'].apply(lambda x: f"../data/{category}/{x}.jpg")

    # Ensure 'captions' column exists, preferring the first alternative found
    if 'captions' not in df_std.columns:
        for source_column in ('reference_caption', 'caption'):
            if source_column in df_std.columns:
                df_std['captions'] = df_std[source_column]
                break
        else:
            df_std['captions'] = [[] for _ in range(len(df_std))]

    # Ensure captions are in list format (strings wrapped, missing -> [])
    df_std['captions'] = df_std['captions'].apply(as_caption_list)

    # Standard columns first, then everything else except private '_' columns.
    # The isinstance check keeps non-string column labels from raising.
    std_columns = ['id', 'file_path', 'captions']
    additional_columns = [
        col for col in df_std.columns
        if col not in std_columns
        and not (isinstance(col, str) and col.startswith('_'))
    ]

    return df_std[std_columns + additional_columns]

# Run standardization on every category that loaded successfully
standardized_data = {}
for category, df in category_data.items():
    if df is None:
        continue  # skip categories whose load failed
    standardized_data[category] = standardize_data_format(df, category)

## Clean and Preprocess Data

In [None]:
def clean_and_preprocess_data(df, category):
    """Clean file paths and captions, then drop duplicate ids.

    Steps, applied to a copy of ``df``:

    * ``file_path``: ``/shared/data/`` is rewritten to ``../data/``. A path
      that does not contain ``/<category>/`` is replaced by
      ``../data/<category>/<basename>``. Non-string values are kept as-is.
    * ``captions``: for list values, each string caption has whitespace runs
      collapsed to single spaces, is stripped, and gets a trailing ``.``
      unless it already ends in ``.``, ``!`` or ``?``. A caption that is
      empty after stripping stays empty rather than becoming ``"."``.
      Non-list cells and non-string captions are kept as-is.
    * Rows with a repeated ``id`` are dropped (first occurrence wins) and the
      index is reset to 0..n-1.

    Args:
        df: DataFrame with ``id``, ``file_path`` and ``captions`` columns.
            It is not modified.
        category: Category name, used for the progress message and for
            rebuilding file paths.

    Returns:
        The cleaned DataFrame.
    """
    print(f"Cleaning and preprocessing data for {category}...")

    # Path fragment that marks a path as already belonging to this category
    category_segment = f'/{category}/'

    def clean_file_path(path):
        if not isinstance(path, str):
            return path

        # Replace absolute paths with relative paths (no-op when absent)
        path = path.replace('/shared/data/', '../data/')

        # Ensure the path contains the category name. Any path starting with
        # '../data/<category>/' also contains '/<category>/', so this single
        # test covers both cases.
        if category_segment not in path:
            path = f'../data/{category}/{os.path.basename(path)}'

        return path

    def clean_caption(caption):
        if not isinstance(caption, str):
            return caption

        # Remove extra whitespace
        caption = re.sub(r'\s+', ' ', caption).strip()

        # Ensure proper punctuation, but leave an empty caption empty:
        # appending '.' would fabricate a caption made of punctuation only.
        if caption and not caption.endswith(('.', '!', '?')):
            caption += '.'

        return caption

    def clean_caption_cell(captions):
        if not isinstance(captions, list):
            return captions
        return [clean_caption(c) for c in captions]

    # Work on a copy so the caller's DataFrame stays untouched
    df_clean = df.copy()
    df_clean['file_path'] = df_clean['file_path'].apply(clean_file_path)
    df_clean['captions'] = df_clean['captions'].apply(clean_caption_cell)

    # Remove duplicate ids (keeping the first) and renumber the rows
    return df_clean.drop_duplicates(subset=['id']).reset_index(drop=True)

# Run cleaning on every standardized category
preprocessed_data = {}
for category, df in standardized_data.items():
    if df is None:
        continue  # nothing to clean for this category
    preprocessed_data[category] = clean_and_preprocess_data(df, category)

## Save Processed Data

In [None]:
# Write each cleaned category to its configured output file
for category, df in preprocessed_data.items():
    if df is None:
        continue  # nothing to write for this category
    output_path = OUTPUT_PATHS[category]
    print(f"Saving processed data for {category} to {output_path}...")
    convert_df_to_json(df, output_path)

## Verify Processed Data

In [None]:
# Read every written file back to confirm it loads, then preview it
for category in preprocessed_data:
    output_path = OUTPUT_PATHS[category]
    print(f"\nVerifying processed data for {category} from {output_path}...")

    # Loading and preview share one try block so any failure is reported
    # for this category without stopping the remaining ones.
    try:
        df = load_annotation_data(output_path)
        record_count = len(df)
        print(f"Successfully loaded {record_count} records for {category}")
        print(f"Columns: {df.columns.tolist()}")
        print("Sample row:")
        display(df.head(1))
    except Exception as e:
        print(f"Error loading processed data for {category}: {e}")

## Conclusion

This notebook has processed the data for the selected categories (all of them when `CATEGORY` is `None`) and standardized its format for use in the unified pipeline. The processed data is now ready for generating multiple-choice questions and running model inference.