# We have done the dataset organization and transformation part on Kaggle

In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
import random

# Organising The Dataset

In [3]:
# Global seed: every random number generator imported by this notebook is
# seeded with the same value so that repeated runs produce the same results.
SEED = 42
random.seed(SEED)        # Python's built-in RNG (imported above, previously left unseeded)
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch's RNG

# Dataset paths. Adjust these (and the other paths used throughout the code)
# depending on whether you are running on your local machine or on Kaggle.
# Dataset link - https://www.kaggle.com/datasets/meowmeowmeowmeowmeow/gtsrb-german-traffic-sign

BASE_PATH = "/kaggle/input/gtsrb-german-traffic-sign"
TRAIN_CSV = os.path.join(BASE_PATH, "Train.csv")  # training annotations
TEST_CSV = os.path.join(BASE_PATH, "Test.csv")    # test annotations
TRAIN_DIR = os.path.join(BASE_PATH, "Train")      # training image folder
TEST_DIR = os.path.join(BASE_PATH, "Test")        # test image folder

In [4]:
# Keep only the 16 classes with the most training images, to avoid the
# effect of the long-tailed class distribution.

train_df = pd.read_csv(TRAIN_CSV)

# value_counts() orders classes from most to least frequent, so the first
# 16 entries are the 16 most common classes.
class_counts = train_df['ClassId'].value_counts()
top_classes = class_counts.head(16).index.tolist()
print("Top 16 classes:", top_classes)

# Drop every row whose class is not among the selected ones and renumber
# the remaining rows from 0.
in_top_classes = train_df['ClassId'].isin(top_classes)
filtered_df = train_df[in_top_classes].reset_index(drop=True)

Top 16 classes: [2, 1, 13, 12, 38, 10, 4, 5, 25, 9, 7, 3, 8, 11, 35, 18]


In [5]:
# Restrict the test split to exactly the classes that were kept for training.
# The class list and the paths are taken from the values defined in the
# earlier cells instead of being typed out again, so the train and test
# filters cannot drift apart.
top16_classes = list(top_classes)

test_csv_path = TEST_CSV
test_dir = TEST_DIR

test_df = pd.read_csv(test_csv_path)
test_df = test_df[test_df['ClassId'].isin(top16_classes)]
print(f"Filtered Test Set Size: {len(test_df)}")

Filtered Test Set Size: 9030


# Train Dataset (only preprocessing the top 16 classes)

In [6]:
root_dir = '/kaggle/input/gtsrb-german-traffic-sign/Train'
output_dir = '/kaggle/working/gtsrb_resized'  # Output directory for the new resized, blurred dataset

# Create one output directory per kept class
for class_id in filtered_df["ClassId"].unique():
    os.makedirs(os.path.join(output_dir, str(class_id)), exist_ok=True)

# Preprocess every training image and save it under its original file name.
# Only the class id and the path are needed, so iterate over those two
# columns directly instead of building a full row object per image.
for label, rel_path in tqdm(
    zip(filtered_df["ClassId"], filtered_df["Path"]), total=len(filtered_df)
):
    filename = rel_path.split("/")[-1]
    img_path = os.path.join(root_dir, str(label), filename)

    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None (it does not raise) when a file is missing
        # or unreadable; skip it, as the test-set loop does, rather than
        # crashing partway through the run.
        print(f"Could not read image: {img_path}")
        continue

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB

    # Upscale to 800x800 with nearest-neighbour interpolation, then smooth
    # with a 31x31 Gaussian kernel (sigma 0 = derived from the kernel size).
    image = cv2.resize(image, (800, 800), interpolation=cv2.INTER_NEAREST)
    image = cv2.GaussianBlur(image, (31, 31), 0)

    # Save the preprocessed image; cv2.imwrite expects BGR channel order.
    # NOTE(review): resize and blur treat channels independently, so the
    # RGB round trip should not change the saved pixels - kept as in the
    # original pipeline.
    save_path = os.path.join(output_dir, str(label), filename)
    if not cv2.imwrite(save_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)):
        # imwrite reports failure through its return value only
        print(f"Could not write image: {save_path}")

100%|██████████| 27600/27600 [18:46<00:00, 24.50it/s]


# Test Dataset (repeating the same steps as the train data)

In [7]:
test_img_dir = '/kaggle/input/gtsrb-german-traffic-sign/Test'
output_dir = '/kaggle/working/gtsrb_resized_test'

# One destination folder for each class present in the filtered test set
for test_class in test_df["ClassId"].unique():
    class_folder = os.path.join(output_dir, str(test_class))
    os.makedirs(class_folder, exist_ok=True)

# Run each test image through the same resize + blur steps and save it
for _, record in tqdm(test_df.iterrows(), total=len(test_df)):
    label = record["ClassId"]
    filename = record["Path"].rsplit("/", 1)[-1]  # keep only the file name

    # Test images are read directly from test_img_dir, with no class sub-folder
    img_path = os.path.join(test_img_dir, filename)
    loaded = cv2.imread(img_path)

    # cv2.imread returns None when the file cannot be read; report and skip
    if loaded is None:
        print(f"Could not read image: {img_path}")
        continue

    rgb = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    upscaled = cv2.resize(rgb, (800, 800), interpolation=cv2.INTER_NEAREST)
    blurred = cv2.GaussianBlur(upscaled, (31, 31), 0)

    # Convert back to BGR before writing into the class folder
    save_path = os.path.join(output_dir, str(label), filename)
    bgr_out = cv2.cvtColor(blurred, cv2.COLOR_RGB2BGR)
    cv2.imwrite(save_path, bgr_out)

100%|██████████| 9030/9030 [05:58<00:00, 25.17it/s]
