In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage import transform
from skimage import exposure
from skimage import io
from tensorflow.keras.utils import to_categorical

def read_image(imgpath, size=(32, 32)):
    """Load the image stored at ``imgpath`` and return it preprocessed.

    The file is decoded with ``skimage.io.imread`` and the decoded pixels are
    handed to ``process_image`` together with ``size``. Whatever
    ``process_image`` produces is returned unchanged.

    Parameters
    ----------
    imgpath : str
        Location of the image file to read.
    size : tuple, default (32, 32)
        Target size forwarded to ``process_image``.
    """
    raw_pixels = io.imread(imgpath)
    return process_image(raw_pixels, size)

def process_image(image, size, clahe=True):
    """Resize ``image`` to ``size`` and optionally equalize its contrast.

    Parameters
    ----------
    image : array-like
        Decoded image pixels.
    size : tuple
        Output shape passed to ``skimage.transform.resize``.
    clahe : bool, default True
        When true, the resized image is additionally run through
        ``skimage.exposure.equalize_adapthist`` with ``clip_limit=0.1``.

    Returns
    -------
    The resized image, contrast-equalized unless ``clahe`` is false.
    """
    resized = transform.resize(image, size)
    if not clahe:
        return resized
    return exposure.equalize_adapthist(resized, clip_limit=0.1)

def load_and_split(csv_path, shuffle=True, frac=1, random_state=None):
    """Load the images and class ids listed in a dataset index CSV.

    The CSV must contain a ``Path`` column (image locations relative to the
    directory that holds the CSV) and a ``ClassId`` column. Every listed image
    is read and preprocessed with ``read_image``. Despite the name, no
    train/validation split is performed here: one CSV yields one
    ``(data, labels)`` pair.

    Parameters
    ----------
    csv_path : str
        Path to the index CSV.
    shuffle : bool, default True
        When true, rows are drawn in random order via ``DataFrame.sample``.
    frac : float, default 1
        Fraction of rows to keep. Only honoured when ``shuffle`` is true;
        without shuffling every row is loaded in file order.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle (and any
        sub-sampling) can be reproduced. The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels)``: the stacked preprocessed images and the integer
        class ids, in the same row order.
    """
    data_path, _ = os.path.split(csv_path)
    df = pd.read_csv(csv_path)
    if shuffle:
        df = df.sample(frac=frac, random_state=random_state)
    # Image paths in the CSV are relative to the CSV's own directory.
    data = [read_image(os.path.join(data_path, rel_path)) for rel_path in df['Path']]
    labels = df['ClassId'].astype(int).to_numpy()
    return np.array(data), labels

def pickle_dataset(X_train, y_train, X_test, y_test, out_dir='./data'):
    """Pickle the four dataset parts into ``out_dir``, one file per part.

    Each object is written to ``<out_dir>/<name>.pkl`` where ``name`` is one of
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, and a confirmation
    line is printed per file.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : object
        Picklable objects to store.
    out_dir : str, default './data'
        Destination directory; it is created if it does not exist yet.
    """
    # An empty out_dir means "current directory": nothing to create.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    objects = {'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}
    for name, obj in objects.items():
        pkl_path = os.path.join(out_dir, f'{name}.pkl')
        # The context manager guarantees the handle is flushed and closed
        # even if pickling fails part-way through.
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump(obj, pkl_file)
        print(f'saved {name} to {pkl_path}')

def normalize(X):
    """Return ``X`` as a float32 array scaled to the [0, 1] range.

    Integer arrays (e.g. raw uint8 pixels) are divided by 255. Floating-point
    arrays are divided by 255 only when their maximum exceeds 1, i.e. when
    they still look like 0-255 intensities (a heuristic). Floats that already
    lie within [0, 1] are merely cast to float32.

    The guard matters in this pipeline: ``skimage.transform.resize`` (with its
    default ``preserve_range=False``) and ``exposure.equalize_adapthist``
    already produce floats in [0, 1], so an unconditional division would
    squash the data into [0, 1/255].

    Parameters
    ----------
    X : numpy.ndarray
        Image data of any shape.

    Returns
    -------
    numpy.ndarray
        A new float32 array; ``X`` itself is not modified.
    """
    scaled = X.astype('float32')  # astype copies, so the caller's array is untouched
    is_integer = X.dtype.kind in ('i', 'u')
    if is_integer or (scaled.size and scaled.max() > 1.0):
        scaled /= 255.0
    return scaled

# Seed NumPy's global RNG so the row shuffling that load_and_split performs
# through DataFrame.sample is repeatable on a fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

dataset_dir = './data/gtsrb'
# sign_names = pd.read_csv(os.path.join(dataset_dir, 'Signs.csv'))
X_train, y_train = load_and_split(os.path.join(dataset_dir, 'Train.csv'))
X_test, y_test = load_and_split(os.path.join(dataset_dir, 'Test.csv'))

X_train = normalize(X_train)
X_test = normalize(X_test)

# Size the one-hot vectors from the largest class id seen in either split.
# Counting unique training labels under-counts whenever a class is absent
# from the training rows or the ids are not contiguous, which would make
# to_categorical fail or give the two splits inconsistent widths.
n_labels = int(max(y_train.max(), y_test.max())) + 1
y_train = to_categorical(y_train, n_labels)
y_test = to_categorical(y_test, n_labels)

pickle_dataset(X_train, y_train, X_test, y_test)

saved X_train to ./data/X_train.pkl
saved y_train to ./data/y_train.pkl
saved X_test to ./data/X_test.pkl
saved y_test to ./data/y_test.pkl


In [None]:
# Optional sanity check: uncomment to preview one preprocessed training image.
# image = read_image('data/gtsrb/Train/0/00000_00000_00000.png')
# plt.imshow(image)