In [4]:
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

# Folder holding the training images and the CSV that maps each image ID to its target.
images_dir = "training/"
labels = pd.read_csv("training_labels.csv")

# Prefix every entry of 'ID' with the image folder so the column holds a usable path.
labels['ID'] = labels['ID'].map(lambda filename: os.path.join(images_dir, filename))

# One ImageDataGenerator serves both subsets.
# You can change the size of the validation split (0.25 is 25% of data used as validation set).
datagen = ImageDataGenerator(validation_split=0.25, rescale=1./255)

# Options shared by the training and validation generators. Keeping them in one
# place guarantees that both subsets use the same image size and batch size.
shared_flow_options = dict(
    dataframe=labels,
    directory=None,  # 'ID' already contains the folder, so no base directory is given
    x_col='ID',
    y_col='target',
    target_size=(224, 224),  # You can change the size of the image
    batch_size=32,  # You can change the batch_size
    class_mode='categorical',
)

# Create the training and validation generators; only `subset` differs between them.
train_generator = datagen.flow_from_dataframe(subset='training', **shared_flow_options)
validation_generator = datagen.flow_from_dataframe(subset='validation', **shared_flow_options)
# Here is some example code to view a few of the images.

## Plot a few of the images
import matplotlib.pyplot as plt

# Fetch a batch of images and their one-hot labels.
# The batch gets its own names so that the `labels` DataFrame used to build the
# generators is not overwritten (which would break re-creating the generators).
batch_images, batch_labels = next(train_generator)

# Number of images to show
num_images = 8

# Invert class_indices (class name -> index) once, instead of rebuilding a list
# of names on every loop iteration.
index_to_class = {index: name for name, index in train_generator.class_indices.items()}

# Lay the images out four per row; squeeze=False keeps `axes` 2-D for any row count.
n_cols = 4
n_rows = max(1, -(-num_images // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)

# Hide the axes of every cell, including cells that end up without an image.
for ax in axes.flat:
    ax.axis('off')

# Pair axes with the first num_images samples; zip() also stops early if the batch
# holds fewer images than requested, so a short batch cannot raise IndexError.
for ax, image, one_hot in zip(axes.flat, batch_images[:num_images], batch_labels[:num_images]):
    ax.imshow(image)
    # Convert the one-hot encoding to a class index, then to the class name
    ax.set_title(index_to_class[one_hot.argmax()])
plt.show()


ImportError: Traceback (most recent call last):
  File "c:\Users\thema\anaconda3\envs\stat386\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 70, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

Notes:
- Optimizer: see ch. 10 in HOML.
- Can try a subset of the images first.

In [None]:
# Using Pillow for Scikit-Learn
# Alternatively, you can read the data into a flat array (for use with Scikit-Learn) using the pillow library. The following is example code of how to do this:

import os
import numpy as np
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split

image_dir = 'training'
# You can resize the image to different dimensions
image_size = (128, 128)

# Load labels
labels_csv = 'training_labels.csv'
labels_df = pd.read_csv(labels_csv)
labels = labels_df['target'].values


def load_flat_image(path, size=image_size):
    """Read one image and return it as a flat 1-D pixel array.

    Parameters
    ----------
    path : str
        Path of the image file to open.
    size : tuple of int
        (width, height) the image is resized to.

    Returns
    -------
    numpy.ndarray
        Flattened array of length width * height * 3.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    """
    # The context manager closes the file handle once the pixels are read.
    with Image.open(path) as img:
        # Force three channels so every image flattens to the same length;
        # grayscale or RGBA files would otherwise produce a ragged array.
        return np.array(img.convert('RGB').resize(size)).flatten()


# Preprocess images
# Build the paths from the CSV rows rather than from os.listdir(): listdir returns
# names in arbitrary order, so images[i] would not be guaranteed to match labels[i].
# Assumes the 'ID' column holds each image's filename inside image_dir (the Keras
# cell uses it that way) -- a file listed in the CSV but missing on disk now raises
# FileNotFoundError instead of silently shifting the labels.
image_paths = [os.path.join(image_dir, filename) for filename in labels_df['ID']]
images = np.array([load_flat_image(path) for path in image_paths])

# Sanity check: one feature row per label row
assert len(images) == len(labels), "The number of images and labels do not match."

# Split the dataset into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(images, labels, test_size=0.2, random_state=42)


In [None]:
# And viewing the images:

# Imported here so this cell does not rely on the TensorFlow cell (the only other
# place matplotlib is imported) having run successfully.
import matplotlib.pyplot as plt

# Number of images to show
num_images = 8
# Separate names keep the full `images` / `labels` arrays from being overwritten by these slices.
sample_images, sample_labels = X_train[:num_images], Y_train[:num_images]

# Lay the images out four per row; squeeze=False keeps `axes` 2-D for any row count.
n_cols = 4
n_rows = max(1, -(-num_images // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)

# Hide the axes of every cell, including cells that end up without an image.
for ax in axes.flat:
    ax.axis('off')

# zip() stops at the shortest input, so fewer than num_images training rows is fine.
for ax, flat_image, label in zip(axes.flat, sample_images, sample_labels):
    img_dim = int(np.sqrt(flat_image.shape[0] / 3))  # Assuming the images are square and RGB
    ax.imshow(flat_image.reshape(img_dim, img_dim, 3))
    ax.set_title(label)
plt.tight_layout()
plt.show()