In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls
%cd drive/MyDrive/pickle_files/
!ls

drive  sample_data
/content/drive/MyDrive/pickle_files
X_test.pickle		 X_train.pickle  Y_train_balanced.pickle
X_train_balanced.pickle  Y_test.pickle	 Y_train.pickle


In [4]:
import os
import numpy as np
import pickle
import random
from keras.preprocessing.image import ImageDataGenerator

# Load the pickled training data
with open("X_train.pickle", "rb") as pickle_in:
    X_train = pickle.load(pickle_in)

with open("Y_train.pickle", "rb") as pickle_in:
    Y_train = pickle.load(pickle_in)

# Verify the loaded data
print(f"Number of images in X_train: {len(X_train)}")
print(f"Number of labels in Y_train: {len(Y_train)}")

# Separate the images by category
X_train_0 = [X_train[i] for i in range(len(Y_train)) if Y_train[i] == 0]
X_train_1 = [X_train[i] for i in range(len(Y_train)) if Y_train[i] == 1]

print(f"Number of '0' images: {len(X_train_0)}")
print(f"Number of '1' images: {len(X_train_1)}")

Number of images in X_train: 11649
Number of labels in Y_train: 11649
Number of '0' images: 10038
Number of '1' images: 1611


In [5]:
# Define the ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Function to augment images
def augment_images(images, target_count):
    augmented_images = []
    total_images_needed = target_count - len(images)
    print(f"Total augmented images needed: {total_images_needed}")

    while len(augmented_images) < total_images_needed:
        for img in images:
            img = img.reshape((1,) + img.shape)  # Reshape image to 4D tensor
            for batch in datagen.flow(img, batch_size=1):
                augmented_images.append(batch[0].reshape(img.shape[1:]))
                if len(augmented_images) >= total_images_needed:
                    break
        if len(augmented_images) == 0:  # Prevent infinite loop
            print("No images were augmented.")
            break
    return augmented_images

# Perform augmentation to balance the dataset
target_count = 10000
augmented_images = augment_images(X_train_1, target_count)
print(f"Number of augmented '1' images generated: {len(augmented_images)}")

# Combine original '1' images with augmented '1' images
X_train_1_augmented = X_train_1 + augmented_images
print(f"Total number of '1' images after augmentation: {len(X_train_1_augmented)}")

# Convert augmented images back to numpy arrays
X_train_1_augmented = np.array(X_train_1_augmented)
Y_train_1_augmented = [1] * len(X_train_1_augmented)

# Combine the augmented "1" images with the original "0" images
X_train_balanced = np.concatenate((X_train_0, X_train_1_augmented), axis=0)
Y_train_balanced = np.concatenate(([0] * len(X_train_0), Y_train_1_augmented), axis=0)

# Shuffle the dataset
combined = list(zip(X_train_balanced, Y_train_balanced))
random.shuffle(combined)
X_train_balanced, Y_train_balanced = zip(*combined)

X_train_balanced = np.array(X_train_balanced)
Y_train_balanced = np.array(Y_train_balanced)

print(f"Number of images in balanced X_train: {len(X_train_balanced)}")
print(f"Number of labels in balanced Y_train: {len(Y_train_balanced)}")

# Save the balanced dataset as pickle files
with open("X_train_balanced.pickle", "wb") as pickle_out:
    pickle.dump(X_train_balanced, pickle_out)

with open("Y_train_balanced.pickle", "wb") as pickle_out:
    pickle.dump(Y_train_balanced, pickle_out)

print("Balanced dataset saved to pickle files.")

Total augmented images needed: 8389
Number of augmented '1' images generated: 9999
Total number of '1' images after augmentation: 11610
Number of images in balanced X_train: 21648
Number of labels in balanced Y_train: 21648
Balanced dataset saved to pickle files.


In [6]:
!ls

X_test.pickle		 X_train.pickle  Y_train_balanced.pickle
X_train_balanced.pickle  Y_test.pickle	 Y_train.pickle
