In [19]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [17]:
# Create three folders in data/clear_data:
# fila, adidas, social
# and move the files from ../data/raw_data/ into them

### Data Preparation

In [22]:
import os
import warnings

import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


# Function to load and preprocess images from a directory
def load_and_preprocess_images(
    directory, target_size=(224, 224), extensions=(".jpg",)
):
    """Load the images of one class folder as normalised RGB arrays.

    Every file in ``directory`` whose name ends with one of ``extensions``
    (case-sensitive) is read with OpenCV, converted from BGR to RGB, resized
    to ``target_size`` and scaled to the range [0, 1]. Files are processed in
    sorted name order. Files that OpenCV cannot decode are skipped with a
    warning instead of aborting the whole load.

    Args:
        directory: Folder containing the images; its last path component is
            used as the label of every image.
        target_size: ``(width, height)`` passed to ``cv2.resize``.
        extensions: File-name suffixes to accept.

    Returns:
        A tuple ``(images, labels)``: ``images`` is a float array of shape
        ``(n, height, width, 3)`` and ``labels`` is a string array of shape
        ``(n,)``. Both have ``n == 0`` when no readable image is found.
    """
    # normpath drops a trailing separator, so "adidas/" still yields "adidas",
    # and basename works with the platform's own path separators.
    label = os.path.basename(os.path.normpath(directory))

    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split made later) reproducible.
    image_files = sorted(
        f for f in os.listdir(directory) if f.endswith(tuple(extensions))
    )

    image_data = []
    labels = []
    for image_file in image_files:
        img_path = os.path.join(directory, image_file)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            warnings.warn(f"Skipping unreadable image: {img_path}")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, target_size)
        image_data.append(img / 255.0)  # Normalize pixel values to [0, 1]
        labels.append(label)

    if not image_data:
        # Keep the 4-D layout so the result can still be concatenated with
        # the arrays of other, non-empty classes.
        width, height = target_size
        return np.empty((0, height, width, 3)), np.array(labels, dtype=str)

    return np.array(image_data), np.array(labels, dtype=str)


# Source folders, one per category
adidas_dir = "../data/clear_data/adidas"
fila_dir = "../data/clear_data/fila"
social_dir = "../data/clear_data/social"

# Run the loader over each folder and unpack its (images, labels) pair
(
    (adidas_dir_data, adidas_dir_labels),
    (fila_dir_data, fila_dir_labels),
    (social_dir_data, social_dir_labels),
) = [
    load_and_preprocess_images(folder)
    for folder in (adidas_dir, fila_dir, social_dir)
]

# Stack the per-category arrays along the first (sample) axis; images and
# labels are joined in the same category order so they stay aligned
X = np.concatenate((adidas_dir_data, fila_dir_data, social_dir_data))
y = np.concatenate((adidas_dir_labels, fila_dir_labels, social_dir_labels))

# Learn the label vocabulary, then map each label string to its integer id
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Expand the integer ids into one-hot rows
y_one_hot = to_categorical(y_encoded)

# Hold out 20 % of the samples, then halve that hold-out into validation and
# test sets. Both splits are stratified on the one-hot labels and use the
# same seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_one_hot, test_size=0.2, stratify=y_one_hot, random_state=split_seed
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=split_seed
)

# Save the processed data to a file (optional)
output_path = "../data/splitted_data/preprocessed_data.npz"

# np.savez does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

np.savez(
    output_path,
    X_train=X_train,
    X_valid=X_valid,
    X_test=X_test,
    y_train=y_train,
    y_valid=y_valid,
    y_test=y_test,
    # Class names in encoder order, so that one-hot column i can be mapped
    # back to its label after the archive is reloaded.
    classes=label_encoder.classes_,
)

# Report the size of each split, then the per-image shape and class count
for caption, count in (
    ("Number of training samples:", len(X_train)),
    ("Number of validation samples:", len(X_valid)),
    ("Number of testing samples:", len(X_test)),
):
    print(caption, count)

print("Image shape:", X_train.shape[1:])
print("Number of classes:", len(label_encoder.classes_))

Number of training samples: 226
Number of validation samples: 28
Number of testing samples: 29
Image shape: (224, 224, 3)
Number of classes: 3
