# Preparing the dataset
After analyzing our dataset, we need to prepare it for model training. The `prepare_dataset` function, stored in `functions.py`, handles this by cleaning and organizing our data. It keeps only the essential columns (`ImgId` and `categories`), writes a backup of the original CSV before overwriting it with the filtered version, and performs consistency checks between the CSV entries and the actual image files. The function also reports the dataset's shape, category distribution, and any inconsistencies, helping us ensure our data is properly structured before proceeding with feature extraction and model training.

In [None]:
def prepare_dataset(data_dir, CSV_PATH, images_dir, backup_dir):
    """
    Prepare dataset by cleaning and organizing data.

    Loads the CSV at CSV_PATH, keeps only the 'ImgId' and 'categories'
    columns, backs up the loaded CSV into backup_dir, overwrites CSV_PATH
    with the filtered data, and compares the CSV ids against the '.jpg'
    files found in images_dir.

    Parameters:
    data_dir (str): Not used by this function; kept so existing callers
        that pass it keep working
    CSV_PATH (str): Path to the metadata CSV; it is overwritten in place
        with the filtered columns
    images_dir (str): Folder whose '.jpg' files are compared with the CSV ids
    backup_dir (str): Folder receiving 'styles_original.csv'; created if
        it does not exist

    Returns:
    dict: 'original_shape', 'filtered_shape', 'kept_columns',
        'category_distribution', 'missing_values' and 'inconsistencies'
        (a dict holding the 'missing_images' and 'extra_images' id sets)

    Raises:
    KeyError: If the CSV lacks the 'ImgId' or 'categories' column
    """
    # Create backup directory
    os.makedirs(backup_dir, exist_ok=True)

    print("Loading dataset...")
    df = pd.read_csv(CSV_PATH)

    # Print initial shape
    print(f"\nInitial dataset shape: {df.shape}")
    print("Initial columns:", df.columns.tolist())

    # Keep only essential columns
    keep_columns = ['ImgId', 'categories']
    already_filtered = df.columns.tolist() == keep_columns
    df_filtered = df[keep_columns]
    category_distribution = df_filtered['categories'].value_counts()

    print("\nAfter filtering:")
    print(f"Final dataset shape: {df_filtered.shape}")
    print("\nCategory distribution:")
    print(category_distribution)

    # Save CSVs
    backup_CSV_PATH = os.path.join(backup_dir, 'styles_original.csv')
    if already_filtered and os.path.exists(backup_CSV_PATH):
        # CSV_PATH was overwritten by an earlier run, so what we just loaded
        # is no longer the original. Rewriting the backup from it would
        # destroy the only copy of the dropped columns.
        print(f"\nBackup already exists, keeping: {backup_CSV_PATH}")
    else:
        df.to_csv(backup_CSV_PATH, index=False)  # Backup
        print(f"\nSaved backup to: {backup_CSV_PATH}")
    df_filtered.to_csv(CSV_PATH, index=False)  # Filtered
    print(f"Saved filtered data to: {CSV_PATH}")

    # Return dictionary with all results
    results = {
        'original_shape': df.shape,
        'filtered_shape': df_filtered.shape,
        'kept_columns': df_filtered.columns.tolist(),
        'category_distribution': category_distribution,
        'missing_values': df_filtered.isnull().sum()
    }

    # Add consistency check results
    print("\nPerforming consistency checks...")
    # Compare by file stem so that "123.jpg" matches the CSV id "123"
    remaining_images = {
        os.path.splitext(f)[0]
        for f in os.listdir(images_dir)
        if f.endswith('.jpg')
    }
    csv_ids = set(df_filtered['ImgId'].astype(str))

    results['inconsistencies'] = {
        'missing_images': csv_ids - remaining_images,
        'extra_images': remaining_images - csv_ids
    }

    print(f"Missing images: {len(results['inconsistencies']['missing_images'])}")
    print(f"Extra images: {len(results['inconsistencies']['extra_images'])}")

    return results

# Connect dataset and CSV file
The `connect_dataset` function verifies the relationship between our CSV metadata and the actual image files, ensuring data integrity before model training. It builds an image path from each `ImgId` value, checks that each image exists and can be opened, and keeps only the valid entries. On our data, the function identified 4,229 missing images, matching our earlier analysis, and produced a clean dataset of 42,000 valid images with their corresponding categories, ready for feature extraction.


In [None]:
def connect_dataset(CSV_PATH, IMAGES_DIR):
    """
    Link each CSV metadata row to its image file and keep only usable rows.

    Parameters:
    CSV_PATH (str): Path to the CSV file containing image metadata
    IMAGES_DIR (str): Path to the folder containing image files

    Returns:
    tuple: (DataFrame restricted to the ImgId, categories and image_path
            columns for rows whose image exists and opens, list of ImgId
            values whose image is absent or failed to open)
    """
    def build_path(img_id):
        # Image files are looked up as "<ImgId>.jpg" inside IMAGES_DIR
        return os.path.join(IMAGES_DIR, f"{img_id}.jpg")

    print("Reading CSV metadata...")
    metadata_df = pd.read_csv(CSV_PATH)
    metadata_df['image_path'] = metadata_df['ImgId'].apply(build_path)
    total = len(metadata_df)

    print("Verifying image files...")
    flags = []
    missing_images = []

    entries = zip(metadata_df.index, metadata_df['ImgId'], metadata_df['image_path'])
    for idx, img_id, path in entries:
        # Progress report once every 1000 rows
        if idx % 1000 == 0:
            print(f"Checking image {idx} of {total}")

        usable = False
        if os.path.exists(path):
            try:
                with Image.open(path):
                    usable = True
            except Exception as e:
                # A file that cannot be opened is treated like a missing one
                print(f"Error with image {img_id}: {str(e)}")
                usable = False

        flags.append(usable)
        if not usable:
            missing_images.append(img_id)

    metadata_df['image_exists'] = flags
    verified = metadata_df[metadata_df['image_exists']]
    valid_df = verified[['ImgId', 'categories', 'image_path']].copy()

    print("\nDataset Summary:")
    print(f"Total entries in CSV: {total}")
    print(f"Valid images found: {len(valid_df)}")
    print(f"Missing images: {len(missing_images)}")

    return valid_df, missing_images