## Component Two: Vehicle Damage Insurance Claim Verification

---

### Loading the Dataset

In [49]:
# Import libraries
import pandas as pd
import numpy as np
import os
import shutil

# Set the project directory (FinalProject folder).
# The location can be overridden with the FINAL_PROJECT_DIR environment variable so the
# notebook is not tied to one machine; when it is unset the original path is used.
project_dir = os.environ.get(
    'FINAL_PROJECT_DIR',
    '/Users/dannywu/Library/CloudStorage/OneDrive-hull.ac.uk/AI/FinalProject',
)

# Change to project directory (os.chdir raises if the directory does not exist)
if os.getcwd() != project_dir:
    os.chdir(project_dir)
    print(f"Changed to directory: {os.getcwd()}")
else:
    print(f"Already in directory: {os.getcwd()}")

# Define file paths
train_path = os.path.join(project_dir, 'train', 'train.csv')
test_path = os.path.join(project_dir, 'test', 'test.csv')

# Verify files exist before loading: report the status of both files first,
# then stop with a clear error if either one is missing.
print("\nChecking files...")
print(f"Train CSV: {train_path}")
print(f"  Exists: {os.path.exists(train_path)}")
print(f"Test CSV: {test_path}")
print(f"  Exists: {os.path.exists(test_path)}")

if not os.path.exists(train_path):
    raise FileNotFoundError(f"Train CSV not found at: {train_path}")
if not os.path.exists(test_path):
    raise FileNotFoundError(f"Test CSV not found at: {test_path}")

# Load CSV files to understand the data structure
print("\nLoading CSV files...")
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
# .tolist() converts NumPy scalars to plain Python values so the list prints as
# [1, 2, ...] rather than [np.int64(1), np.int64(2), ...].
print("\nUnique labels:", sorted(train_df['label'].unique().tolist()))
print("\nLabel distribution:")
print(train_df['label'].value_counts().sort_index())

train_df.head()

Already in directory: /Users/dannywu/Library/CloudStorage/OneDrive-hull.ac.uk/AI/FinalProject

Checking files...
Train CSV: /Users/dannywu/Library/CloudStorage/OneDrive-hull.ac.uk/AI/FinalProject/train/train.csv
  Exists: True
Test CSV: /Users/dannywu/Library/CloudStorage/OneDrive-hull.ac.uk/AI/FinalProject/test/test.csv
  Exists: True

Loading CSV files...
Training data shape: (7200, 3)
Test data shape: (4800, 2)

Unique labels: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Label distribution:
label
1     171
2    2349
3     534
4    2079
5    1185
6     882
Name: count, dtype: int64


Unnamed: 0,image_id,filename,label
0,1,1.jpg,2
1,2,2.jpg,4
2,3,3.jpg,2
3,4,4.jpg,3
4,5,5.jpg,5


In [50]:
# Rename existing class directories to meaningful names (if already organized)
organized_dir = 'organized_data'

# Define meaningful class names for vehicle damage categories
# (keys are the numeric labels as strings, values are the directory names)
class_name_mapping = {
    '1': 'crack',
    '2': 'scratch',
    '3': 'tire_flat',
    '4': 'dent',
    '5': 'glass_shatter',
    '6': 'lamp_broken'
}


def count_images(directory, extensions=('.jpg', '.jpeg', '.png')):
    """Return how many entries in `directory` have one of the image `extensions`.

    The comparison is case-insensitive, so names such as 'photo.JPG' are counted.
    """
    return sum(name.lower().endswith(extensions) for name in os.listdir(directory))


if os.path.exists(organized_dir):
    print("Renaming class directories to meaningful names...")

    for old_label, new_name in class_name_mapping.items():
        old_path = os.path.join(organized_dir, old_label)
        new_path = os.path.join(organized_dir, new_name)

        if os.path.exists(new_path):
            # Never rename onto an existing target directory.
            print(f"  '{new_name}' already exists")
        elif os.path.exists(old_path):
            os.rename(old_path, new_path)
            print(f"  Renamed '{old_label}' -> '{new_name}' ({count_images(new_path)} images)")
        else:
            # Neither the numeric nor the named directory is present; say so
            # instead of skipping the class without any message.
            print(f"  Neither '{old_label}' nor '{new_name}' found - nothing to rename")

    print("\nFinal class directories:")
    for class_dir in sorted(os.listdir(organized_dir)):
        class_path = os.path.join(organized_dir, class_dir)
        if os.path.isdir(class_path):
            print(f"  {class_dir}: {count_images(class_path)} images")
else:
    print("organized_data directory does not exist. Run Cell 4 first to organize images.")


Renaming class directories to meaningful names...
  'crack' already exists
  'scratch' already exists
  'tire_flat' already exists
  'dent' already exists
  'glass_shatter' already exists
  'lamp_broken' already exists

Final class directories:
  crack: 171 images
  dent: 2079 images
  glass_shatter: 1185 images
  lamp_broken: 882 images
  scratch: 2349 images
  tire_flat: 534 images


In [51]:
# Organize images into class subdirectories (required for image_dataset_from_directory)
organized_dir = 'organized_data'

# Define meaningful class names for vehicle damage categories
class_name_mapping = {
    '1': 'crack',
    '2': 'scratch',
    '3': 'tire_flat',
    '4': 'dent',
    '5': 'glass_shatter',
    '6': 'lamp_broken'
}

# Remove existing directory if it exists, so every run starts from an empty tree
if os.path.exists(organized_dir):
    shutil.rmtree(organized_dir)

os.makedirs(organized_dir, exist_ok=True)

source_dir = os.path.join(project_dir, 'train', 'images')
missing_files = []  # filenames listed in train_df that are not present on disk

# Copy images to class subdirectories with meaningful names
print("Organizing images into class subdirectories...")
# enumerate() drives the progress counter, so it does not depend on the
# DataFrame index being a default 0..n-1 range.
for processed, (filename, raw_label) in enumerate(
        zip(train_df['filename'], train_df['label']), start=1):
    label = str(raw_label)
    # Labels without a mapping fall back to a generic 'class_<label>' directory.
    class_name = class_name_mapping.get(label, f'class_{label}')
    src = os.path.join(source_dir, filename)
    dst_dir = os.path.join(organized_dir, class_name)
    os.makedirs(dst_dir, exist_ok=True)

    if os.path.exists(src):
        shutil.copy2(src, os.path.join(dst_dir, filename))
    else:
        # Record the gap instead of skipping it silently.
        missing_files.append(filename)

    if processed % 1000 == 0:
        print(f"  Processed {processed}/{len(train_df)} images...")

if missing_files:
    print(f"\nWARNING: {len(missing_files)} images listed in train.csv were not found and were skipped")
    print(f"  First few missing: {missing_files[:5]}")

class_dirs = sorted(
    entry for entry in os.listdir(organized_dir)
    if os.path.isdir(os.path.join(organized_dir, entry))
)

print(f"\nCompleted! Images organized into {len(class_dirs)} class directories")
print("\nClass directories created:")
for class_dir in class_dirs:
    class_path = os.path.join(organized_dir, class_dir)
    # Case-insensitive extension match so e.g. '.JPG' files are counted too.
    img_count = sum(
        f.lower().endswith(('.jpg', '.jpeg', '.png')) for f in os.listdir(class_path)
    )
    print(f"  {class_dir}: {img_count} images")

Organizing images into class subdirectories...
  Processed 1000/7200 images...
  Processed 2000/7200 images...
  Processed 3000/7200 images...
  Processed 4000/7200 images...
  Processed 5000/7200 images...
  Processed 6000/7200 images...
  Processed 7000/7200 images...

Completed! Images organized into 6 class directories

Class directories created:
  crack: 171 images
  dent: 2079 images
  glass_shatter: 1185 images
  lamp_broken: 882 images
  scratch: 2349 images
  tire_flat: 534 images


In [52]:
# Load images using image_dataset_from_directory (as per workshop instructions)
from tensorflow.keras.utils import image_dataset_from_directory

# Fail fast when the organized data is missing or incomplete. Merely printing the
# problem would leave train_ds / val_ds undefined (or stale from an earlier run in
# the same kernel), and later cells would then fail or silently use old data.
if not os.path.exists(organized_dir):
    raise FileNotFoundError(
        f"ERROR: {organized_dir} directory does not exist! "
        "Please run the previous cell (Cell 4) to organize images first."
    )

# Count images to verify (case-insensitive extension match)
total_images = 0
for class_dir in os.listdir(organized_dir):
    class_path = os.path.join(organized_dir, class_dir)
    if os.path.isdir(class_path):
        total_images += sum(
            f.lower().endswith(('.jpg', '.jpeg', '.png')) for f in os.listdir(class_path)
        )

print(f"Found {total_images} images in {organized_dir}")

if total_images == 0:
    raise FileNotFoundError("ERROR: No images found! Please run Cell 4 to organize images.")
if total_images < len(train_df):
    raise RuntimeError(
        f"WARNING: Only {total_images} images found, expected {len(train_df)}. "
        "The organization may not be complete. Please run Cell 4 again."
    )

data_dir = organized_dir

# Both subsets are requested with the same split fraction and seed, as in the
# original two calls; sharing one dict keeps the two calls from drifting apart.
# batch_size=None yields individual (image, label) samples rather than batches.
split_options = dict(validation_split=0.1, seed=42, batch_size=None)

# Load training dataset with 10% validation split
train_ds = image_dataset_from_directory(data_dir, subset="training", **split_options)

# Load validation dataset
val_ds = image_dataset_from_directory(data_dir, subset="validation", **split_options)

print("Class names:", train_ds.class_names)
print("Number of classes:", len(train_ds.class_names))

Found 7200 images in organized_data


Found 7200 files belonging to 6 classes.
Using 6480 files for training.
Found 7200 files belonging to 6 classes.
Using 720 files for validation.
Class names: ['crack', 'dent', 'glass_shatter', 'lamp_broken', 'scratch', 'tire_flat']
Number of classes: 6


In [53]:
# Extract images and labels into NumPy arrays (as per workshop instructions).
# The dataset is traversed exactly once so each image stays paired with the label
# that was yielded alongside it.
train_pairs = [(np.uint8(image.numpy()), label.numpy()) for image, label in train_ds]

x_train = np.array([image for image, _ in train_pairs])
y_train = np.array([label for _, label in train_pairs])
del train_pairs  # the per-sample list is no longer needed once the arrays exist

for caption, value in (
    ("Training data shape:", x_train.shape),
    ("Training labels shape:", y_train.shape),
    ("Image dtype:", x_train.dtype),
    ("Label dtype:", y_train.dtype),
):
    print(caption, value)

Training data shape: (6480, 256, 256, 3)
Training labels shape: (6480,)
Image dtype: uint8
Label dtype: int32


In [54]:
# Extract validation images and labels.
# One pass over val_ds collects (image, label) pairs; images are cast to uint8.
val_pairs = [(np.uint8(image.numpy()), label.numpy()) for image, label in val_ds]

x_val = np.array([image for image, _ in val_pairs])
y_val = np.array([label for _, label in val_pairs])
del val_pairs  # release the intermediate per-sample list

for caption, value in (
    ("Validation data shape:", x_val.shape),
    ("Validation labels shape:", y_val.shape),
):
    print(caption, value)

Validation data shape: (720, 256, 256, 3)
Validation labels shape: (720,)


In [None]:
# Load test images (test set has no labels)
# Note: Images will be loaded when needed for predictions
# For now, we'll just store the filenames

# NOTE(review): Image is not used in this cell; presumably a later cell relies on it.
from PIL import Image

test_image_dir = os.path.join(project_dir, 'test', 'images')

test_filenames = []       # filenames from test_df that exist on disk
test_image_paths = []     # matching full paths, in the same order as test_filenames
missing_test_files = []   # filenames listed in test_df but absent on disk

for filename in test_df['filename']:
    img_path = os.path.join(test_image_dir, filename)

    if os.path.exists(img_path):
        test_filenames.append(filename)
        test_image_paths.append(img_path)
    else:
        # Keep track of gaps so they are reported rather than silently dropped.
        missing_test_files.append(filename)

print(f"Test images found: {len(test_filenames)}")
if missing_test_files:
    print(f"WARNING: {len(missing_test_files)} of {len(test_df)} test images listed in test.csv are missing on disk")
    print(f"  First few missing: {missing_test_files[:5]}")
print("Test images will be loaded when needed for predictions")
print(f"First few test filenames: {test_filenames[:5]}")

Test images found: 4800
Test images will be loaded when needed for predictions
First few test filenames: ['7201.jpg', '7202.jpg', '7203.jpg', '7204.jpg', '7205.jpg']


In [56]:
# Summary: one report covering the train / validation / test splits,
# the per-image shape and the class names reported by the training dataset.
_rule = "=" * 50
_summary_lines = [
    _rule,
    "Dataset Summary",
    _rule,
    f"Training images: {x_train.shape[0]}",
    f"Validation images: {x_val.shape[0]}",
    f"Test images: {len(test_filenames)} (filenames stored)",
    f"\nImage shape: {x_train.shape[1:]}",
    f"Number of classes: {len(train_ds.class_names)}",
    f"Class names: {train_ds.class_names}",
]
print("\n".join(_summary_lines))

Dataset Summary
Training images: 6480
Validation images: 720
Test images: 4800 (filenames stored)

Image shape: (256, 256, 3)
Number of classes: 6
Class names: ['crack', 'dent', 'glass_shatter', 'lamp_broken', 'scratch', 'tire_flat']
