In [1]:
# Colab-only setup cell: mount Google Drive and confirm the dataset folder is visible.
import os

from google.colab import drive

# Step 1: Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Step 2: Point at the folder that holds the dataset.
# Example path: /content/drive/MyDrive/GTSRB_Complete_Data
dataset_path = '/content/drive/MyDrive/GTSRB_Complete_Data'

# Step 3: Sanity check - list the entries inside the dataset folder.
print(os.listdir(dataset_path))


Mounted at /content/drive
['Metadata', 'Test', 'Train']


In [5]:
import pandas as pd
import cv2
import os
import concurrent.futures

# Base locations of the metadata CSVs and of the train/test images on Drive
metadata_base_path = '/content/drive/MyDrive/GTSRB_Complete_Data/Metadata/'
image_base_path_train = '/content/drive/MyDrive/GTSRB_Complete_Data/Train/'
image_base_path_test = '/content/drive/MyDrive/GTSRB_Complete_Data/Test/'

# 1. Names of the training metadata CSV files, one per index 0..42:
#    GT-00000.csv, GT-00001.csv, ..., GT-00042.csv (index zero-padded to five digits)
train_metadata_files = [f'GT-{class_index:05d}.csv' for class_index in range(43)]

# 2. Name of the single test metadata CSV file
test_metadata_file = 'GT-final_test.test.csv'

# Columns every metadata CSV must provide. ClassId is checked separately
# because it is only required when processing training data.
_REQUIRED_COLUMNS = ('Filename', 'Width', 'Height', 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2')


def _class_folder_name(class_id):
    """Return the zero-padded, five-digit folder name for ``class_id``.

    Going through ``int()`` keeps the result correct when pandas has read the
    ClassId column as float (e.g. ``1.0`` -> ``'00001'`` rather than ``'001.0'``).
    Raises TypeError/ValueError/OverflowError for None, NaN, inf or
    non-numeric values.
    """
    return f"{int(class_id):05d}"


# Function to load and process each metadata CSV file
def process_metadata_file(metadata_file_path, image_base_path, is_train=True,
                          image_loader=None, verbose=True):
    """Load one ';'-separated metadata CSV and try to read every image it lists.

    Args:
        metadata_file_path: Path of the metadata CSV file.
        image_base_path: Root folder of the images. For training data the image
            is looked up in ``<image_base_path>/<5-digit ClassId>/<Filename>``;
            otherwise directly in ``<image_base_path>/<Filename>``.
        is_train: True for training metadata (a ``ClassId`` column is then
            required), False for test metadata.
        image_loader: Optional callable ``path -> image or None`` used to read
            each image. Defaults to ``cv2.imread``.
        verbose: When True (default), print one line per successfully loaded
            image. Load failures are printed regardless of this flag.

    Returns:
        The metadata DataFrame as read from the CSV, or None if the file could
        not be read, lacks a required column, or any other error occurred.
        Errors are reported on stdout instead of being raised.
    """
    try:
        metadata_df = pd.read_csv(metadata_file_path, sep=';')  # Read CSV file

        # Validate the schema once, up front, instead of hitting a KeyError
        # on the first row.
        missing = [col for col in _REQUIRED_COLUMNS if col not in metadata_df.columns]
        has_class_id = 'ClassId' in metadata_df.columns
        if is_train and not has_class_id:
            missing.append('ClassId')
        if missing:
            raise ValueError(f"missing required column(s): {', '.join(missing)}")

        load_image = image_loader if image_loader is not None else cv2.imread

        # Only Filename and ClassId are needed to locate an image, so iterate
        # over those two columns directly rather than building a Series per row.
        if has_class_id:
            class_ids = metadata_df['ClassId']
        else:
            class_ids = [None] * len(metadata_df)

        for filename, class_id in zip(metadata_df['Filename'], class_ids):
            if is_train:
                # Class folder names are 00000, 00001, ..., 00042
                try:
                    folder_name = _class_folder_name(class_id)
                except (TypeError, ValueError, OverflowError):
                    # A blank/invalid ClassId cannot be mapped to a folder;
                    # report it and move on to the next row.
                    print(f"Error loading image: {filename} (invalid ClassId: {class_id!r})")
                    continue
                image_path = os.path.join(image_base_path, folder_name, filename)
            else:
                # For the test data, the image is located directly under the Test folder
                image_path = os.path.join(image_base_path, filename)

            # Read the image; the loader signals failure by returning None
            image = load_image(image_path)

            if image is None:
                print(f"Error loading image: {filename}")
            elif verbose:
                # Process the image and metadata (e.g., for training/testing)
                print(f"Processing Image: {filename} (Class ID: {class_id if class_id is not None else 'N/A'})")

                # Further processing (resizing, ROI cropping using the Roi.* columns) can be added here.

        return metadata_df  # Return the loaded metadata dataframe

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller carry
        # on with the remaining files.
        print(f"Error processing metadata file: {e}")
        return None

# Worker used by the thread pool: handles exactly one training metadata file
def process_single_file(metadata_file):
    """Process one training metadata CSV, given by file name.

    The name is resolved against ``metadata_base_path`` and handed to
    ``process_metadata_file`` in training mode with ``image_base_path_train``
    as the image root. Returns whatever ``process_metadata_file`` returns.
    """
    csv_path = os.path.join(metadata_base_path, metadata_file)
    print(f"Processing Training Dataset: {metadata_file}")
    loaded_metadata = process_metadata_file(csv_path, image_base_path_train, is_train=True)
    return loaded_metadata

# 3. Process the training metadata files (GT-00000.csv to GT-00042.csv) on a thread pool.
#    executor.map yields results in the same order as train_metadata_files, so
#    results[k] belongs to train_metadata_files[k] (None where processing failed).
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(
        executor.map(process_single_file, train_metadata_files)
    )

# 4. Process the test dataset (GT-final_test.test.csv) in the main thread
test_metadata_file_path = os.path.join(metadata_base_path, test_metadata_file)
print(f"Processing Test Dataset: {test_metadata_file}")
test_metadata_df = process_metadata_file(
    test_metadata_file_path,
    image_base_path_test,
    is_train=False,
)

# Show the first rows of the test metadata, or report that loading failed
if test_metadata_df is None:
    print("Test dataset metadata could not be loaded.")
else:
    print("Test Dataset Preview:")
    print(test_metadata_df.head())

Streaming output truncated to the last 5000 lines.
Processing Image: 07637.ppm (Class ID: N/A)
Processing Image: 07638.ppm (Class ID: N/A)
Processing Image: 07639.ppm (Class ID: N/A)
Processing Image: 07640.ppm (Class ID: N/A)
Processing Image: 07641.ppm (Class ID: N/A)
Processing Image: 07642.ppm (Class ID: N/A)
Processing Image: 07643.ppm (Class ID: N/A)
Processing Image: 07644.ppm (Class ID: N/A)
Processing Image: 07645.ppm (Class ID: N/A)
Processing Image: 07646.ppm (Class ID: N/A)
Processing Image: 07647.ppm (Class ID: N/A)
Processing Image: 07648.ppm (Class ID: N/A)
Processing Image: 07649.ppm (Class ID: N/A)
Processing Image: 07650.ppm (Class ID: N/A)
Processing Image: 07651.ppm (Class ID: N/A)
Processing Image: 07652.ppm (Class ID: N/A)
Processing Image: 07653.ppm (Class ID: N/A)
Processing Image: 07654.ppm (Class ID: N/A)
Processing Image: 07655.ppm (Class ID: N/A)
Processing Image: 07656.ppm (Class ID: N/A)
Processing Image: 07657.ppm (Class ID: N/A)
Processing 