In [1]:
#import numpy as np
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from collections import Counter

In [None]:
def process_images_with_tf(image_paths, labels, images_mmap_path, labels_mmap_path, batch_size=500):
    total_images = len(image_paths)
    print(f"\nProcessing {total_images} images for memory-mapped storage...")

    #########################################
    # Define image shape and dtype for memory-mapped storage
    image_shape = (256, 256, 3)
    dtype = 'float32'

    # Create memory-mapped arrays for images and labels
    images_mmap = np.memmap(images_mmap_path, dtype=dtype, mode='w+', shape=(total_images, *image_shape))
    labels_mmap = np.memmap(labels_mmap_path, dtype='int32', mode='w+', shape=(total_images,))

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    def _load_and_preprocess_image(file_path, label):
        try:
            img = tf.io.read_file(file_path)
            img = tf.io.decode_image(img, channels=3, expand_animations=False)
            img.set_shape([None, None, 3])
            img = tf.image.resize(img, [256, 256])
            img = tf.cast(img, tf.float32) / 255.0
            return img, label
        except Exception as e:
            tf.print(f"Error processing image {file_path}: {e}")
            img = tf.zeros([256, 256, 3], dtype=tf.float32)
            return img, label

    dataset = dataset.map(_load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    idx = 0
    for batch_imgs, batch_labels in tqdm(dataset, total=(total_images // batch_size) + 1, desc="Processing Images"):
        batch_imgs_np = batch_imgs.numpy()
        batch_labels_np = batch_labels.numpy()
        batch_size_actual = batch_imgs_np.shape[0]
        images_mmap[idx:idx+batch_size_actual] = batch_imgs_np
        labels_mmap[idx:idx+batch_size_actual] = batch_labels_np
        idx += batch_size_actual

        # Flush changes to memory-mapped file to disk and clear memory
        images_mmap.flush()
        labels_mmap.flush()
        gc.collect()

    del images_mmap
    del labels_mmap
    print(f"Finished processing {total_images} images into {images_mmap_path} and {labels_mmap_path}.")


In [3]:
import os
import zipfile
import shutil
import gc
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import logging
import random

# --------------------- Configuration ---------------------

# Configure logging to capture errors during processing
logging.basicConfig(
    filename='data_processing.log',
    level=logging.ERROR,
    format='%(asctime)s:%(levelname)s:%(message)s'
)

# Define paths for memory-mapped arrays
IMAGES_D_MEMMAP_PATH = 'images_d_mmap.npy'
LABELS_D_MEMMAP_PATH = 'labels_d.npy'
IMAGES_P_MEMMAP_PATH = 'images_p_mmap.npy'
LABELS_P_MEMMAP_PATH = 'labels_p.npy'

# Temporary extraction directory
TEMP_DIR = "temp_extracted"

# Sampling fraction (20%)
SAMPLE_FRACTION = .2

# Seed for reproducibility
RANDOM_SEED = 42

# --------------------- Setup ---------------------

# Ensure temporary directory exists
os.makedirs(TEMP_DIR, exist_ok=True)

# Set random seed for reproducibility
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Enable GPU processing for TensorFlow, if available
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"{len(gpus)} GPU(s) detected and memory growth enabled.")
    except RuntimeError as e:
        print(e)

# --------------------- Helper Functions ---------------------

def is_image_file(filename):
    """
    Check if a file is an image based on its extension.
    
    Parameters:
    - filename (str): Name of the file.
    
    Returns:
    - bool: True if the file is an image, False otherwise.
    """
    return filename.lower().endswith(('.jpg', '.jpeg', '.png'))

def extract_images_and_labels(zip_path, image_filenames, temp_dir, label):
    """
    Extract images from a zip file to a temporary directory and assign labels.
    
    Parameters:
    - zip_path (str): Path to the zip file.
    - image_filenames (list): List of image file paths within the zip.
    - temp_dir (str): Directory to extract images to.
    - label (int): Label to assign to all extracted images.
    
    Returns:
    - extracted_paths (list): List of paths to the extracted images.
    - labels (list): List of labels corresponding to the extracted images.
    """
    extracted_paths = []
    labels = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for img_path in tqdm(image_filenames, desc=f"Extracting to {temp_dir}"):
            try:
                zip_ref.extract(img_path, temp_dir)
                extracted_file_path = os.path.join(temp_dir, img_path)
                extracted_paths.append(extracted_file_path)
                labels.append(label)  # Assign the passed label
            except Exception as e:
                logging.error(f"Error extracting {img_path}: {e}")
                continue
    return extracted_paths, labels

# --------------------- Main Processing Function ---------------------

def main():
    """
    Main function to process images from 'D' and 'P' folders within 'SDNET2018.zip'.
    It samples 20% of images from each subfolder, assigns labels, processes images,
    and saves them into separate memory-mapped files.
    """
    zip_path = 'SDNET2018.zip'  # Path to your zip file
    batch_size = 500  # Adjust based on available memory

    # Define mapping from subfolders to labels
    subfolder_label_mapping = {
        'SDNET2018/D/UD': 0,  # Uncracked
        'SDNET2018/D/CD': 1,  # Cracked
        'SDNET2018/P/UP': 0,  # Uncracked
        'SDNET2018/P/CP': 1   # Cracked
    }

    # Extract all filenames from the zip file
    print(f"Opening zip file: {zip_path}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        all_filenames = zip_ref.namelist()

    # Function to filter image filenames from a specific subfolder
    def get_image_filenames(subfolder):
        prefix = subfolder + '/'
        return [f for f in all_filenames if f.startswith(prefix) and is_image_file(f)]

    # Initialize dictionaries to hold image paths and labels for D and P
    data = {
        'D': {'image_paths': [], 'labels': []},
        'P': {'image_paths': [], 'labels': []}
    }

    # Iterate over each subfolder and process images
    for subfolder, label in subfolder_label_mapping.items():
        image_filenames = get_image_filenames(subfolder)
        total_images = len(image_filenames)
        sample_size = max(1, int(total_images * SAMPLE_FRACTION))  # Ensure at least 1 image is sampled

        # Randomly sample 20% of the images
        sampled_filenames = random.sample(image_filenames, sample_size)

        print(f"\nFound {total_images} images in subfolder '{subfolder}'. Sampling {sample_size} images for label {label}.")

        # Extract images and assign labels
        image_paths, labels = extract_images_and_labels(zip_path, sampled_filenames, TEMP_DIR, label)

        # Determine main folder (D or P) based on subfolder
        main_folder = subfolder.split('/')[1]

        # Append to the respective main folder's data
        data[main_folder]['image_paths'].extend(image_paths)
        data[main_folder]['labels'].extend(labels)

    # Process and save D folder data
    if data['D']['image_paths']:
        process_images_with_tf(
            data['D']['image_paths'],
            labels=data['D']['labels'],
            images_mmap_path=IMAGES_D_MEMMAP_PATH,
            labels_mmap_path=LABELS_D_MEMMAP_PATH,
            batch_size=batch_size
        )

        # Save the filenames and labels for D
        np.save('filenames_d.npy', data['D']['image_paths'])
        np.save('labels_d.npy', data['D']['labels'])
    else:
        print("\nNo images found for 'D' folder after sampling.")

    # Process and save P folder data
    if data['P']['image_paths']:
        process_images_with_tf(
            data['P']['image_paths'],
            labels=data['P']['labels'],
            images_mmap_path=IMAGES_P_MEMMAP_PATH,
            labels_mmap_path=LABELS_P_MEMMAP_PATH,
            batch_size=batch_size
        )

        # Save the filenames and labels for P
        np.save('filenames_p.npy', data['P']['image_paths'])
        np.save('labels_p.npy', data['P']['labels'])
    else:
        print("\nNo images found for 'P' folder after sampling.")

    # Clear temporary directory
    shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    # Free up memory
    del data
    gc.collect()

    print("\nFilenames and labels saved for 'D' and 'P' folders.")

# --------------------- Execute Script ---------------------

if __name__ == "__main__":
    main()


Opening zip file: SDNET2018.zip

Found 11595 images in subfolder 'SDNET2018/D/UD'. Sampling 2319 images for label 0.


Extracting to temp_extracted: 100%|███████████████████████████████████████████████| 2319/2319 [00:03<00:00, 596.91it/s]



Found 1260 images in subfolder 'SDNET2018/D/CD'. Sampling 252 images for label 1.


Extracting to temp_extracted: 100%|█████████████████████████████████████████████████| 252/252 [00:00<00:00, 649.48it/s]



Found 21726 images in subfolder 'SDNET2018/P/UP'. Sampling 4345 images for label 0.


Extracting to temp_extracted: 100%|███████████████████████████████████████████████| 4345/4345 [00:06<00:00, 644.98it/s]



Found 2608 images in subfolder 'SDNET2018/P/CP'. Sampling 521 images for label 1.


Extracting to temp_extracted: 100%|█████████████████████████████████████████████████| 521/521 [00:01<00:00, 471.07it/s]



Processing 2571 images for memory-mapped storage...


Processing Images: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:21<00:00,  3.61s/it]


Finished processing 2571 images into images_d_mmap.npy and labels_d.npy.

Processing 4866 images for memory-mapped storage...


Processing Images: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:40<00:00,  4.08s/it]


Finished processing 4866 images into images_p_mmap.npy and labels_p.npy.

Filenames and labels saved for 'D' and 'P' folders.


In [4]:
from sklearn.model_selection import train_test_split
# Paths for memory-mapped files after processing images
IMAGES_D_MEMMAP_PATH = 'images_d_mmap.npy'
LABELS_D_MEMMAP_PATH = 'labels_d.npy'
IMAGES_P_MEMMAP_PATH = 'images_p_mmap.npy'
LABELS_P_MEMMAP_PATH = 'labels_p.npy'

# Ensure that necessary files exist
if not (os.path.exists(IMAGES_D_MEMMAP_PATH) and os.path.exists(LABELS_D_MEMMAP_PATH) and os.path.exists(IMAGES_P_MEMMAP_PATH) and os.path.exists(LABELS_P_MEMMAP_PATH)):
    print("Required processed files not found. Make sure to run the image extraction and processing first.")
    exit()

# Load labels for datasets D and P
labels_d = np.load(LABELS_D_MEMMAP_PATH)
labels_p = np.load(LABELS_P_MEMMAP_PATH)

# Define the number of images in each set
total_images_d = len(labels_d)
total_images_p = len(labels_p)

# Load images from memory-mapped files
images_d = np.memmap(
    IMAGES_D_MEMMAP_PATH,
    dtype='float32',
    mode='r',
    shape=(total_images_d, 256, 256, 3)
)

images_p = np.memmap(
    IMAGES_P_MEMMAP_PATH,
    dtype='float32',
    mode='r',
    shape=(total_images_p, 256, 256, 3)
)

# Verify that the number of labels matches the number of images
assert len(labels_d) == total_images_d, "Mismatch in number of D labels and images."
assert len(labels_p) == total_images_p, "Mismatch in number of P labels and images."

print("Data loaded successfully.")

# Split the D data into training and test sets (80% training, 20% testing)
train_indices_d, test_indices_d = train_test_split(
    np.arange(total_images_d),
    test_size=0.20,
    random_state=42,
    stratify=labels_d
)

# Extract training and testing images and labels for D
train_images_d = images_d[train_indices_d]
train_labels_d = labels_d[train_indices_d]
test_images_d = images_d[test_indices_d]
test_labels_d = labels_d[test_indices_d]

print(f"Train Set D: {len(train_images_d)} samples.")
print(f"Test Set D: {len(test_images_d)} samples.")

# Split the P data into training and test sets (80% training, 20% testing)
train_indices_p, test_indices_p = train_test_split(
    np.arange(total_images_p),
    test_size=0.20,
    random_state=42,
    stratify=labels_p
)

# Extract training and testing images and labels for P
train_images_p = images_p[train_indices_p]
train_labels_p = labels_p[train_indices_p]
test_images_p = images_p[test_indices_p]
test_labels_p = labels_p[test_indices_p]

print(f"Train Set P: {len(train_images_p)} samples.")
print(f"Test Set P: {len(test_images_p)} samples.")

# Combine D and P training data for Train Set A (large and diverse)
train_images_A = np.concatenate((train_images_d, train_images_p), axis=0)
train_labels_A = np.concatenate((train_labels_d, train_labels_p), axis=0)

print(f"Train Set A (Large and Diverse): {len(train_images_A)} samples.")

# Create a small, diverse training set by taking 10% of Train Set A (Train Set B)
fraction_B = 0.10  # 10%
num_samples_B = int(len(train_images_A) * fraction_B)

# Randomly sample Train Set B from Train Set A
indices_B = np.random.choice(len(train_images_A), size=num_samples_B, replace=False)
train_images_B = train_images_A[indices_B]
train_labels_B = train_labels_A[indices_B]

print(f"Train Set B (Small and Diverse): {len(train_images_B)} samples.")

# Define Train Set C as all D training data
train_images_C = train_images_d
train_labels_C = train_labels_d

print(f"Train Set C (D only): {len(train_images_C)} samples.")

# Define Test Set D (only D testing data)
test_images_D = test_images_d
test_labels_D = test_labels_d

print(f"Test Set D (D only): {len(test_images_D)} samples.")

# Define Test Set P (only P testing data)
test_images_P = test_images_p
test_labels_P = test_labels_p

print(f"Test Set P (P only): {len(test_images_P)} samples.")

# Function to save datasets
def save_dataset(images, labels, images_path, labels_path):
    np.save(images_path, images)
    np.save(labels_path, labels)

# Save Train Set A
save_dataset(train_images_A, train_labels_A, 'train_set_A_images.npy', 'train_set_A_labels.npy')

# Save Train Set B
save_dataset(train_images_B, train_labels_B, 'train_set_B_images.npy', 'train_set_B_labels.npy')

# Save Train Set C
save_dataset(train_images_C, train_labels_C, 'train_set_C_images.npy', 'train_set_C_labels.npy')

# Save Test Set D
save_dataset(test_images_D, test_labels_D, 'test_set_D_images.npy', 'test_set_D_labels.npy')

# Save Test Set P
save_dataset(test_images_P, test_labels_P, 'test_set_P_images.npy', 'test_set_P_labels.npy')

print("\nAll datasets have been saved successfully.")

Data loaded successfully.
Train Set D: 2056 samples.
Test Set D: 515 samples.
Train Set P: 3892 samples.
Test Set P: 974 samples.
Train Set A (Large and Diverse): 5948 samples.
Train Set B (Small and Diverse): 594 samples.
Train Set C (D only): 2056 samples.
Test Set D (D only): 515 samples.
Test Set P (P only): 974 samples.

All datasets have been saved successfully.


In [5]:
# Load the datasets
train_images_A = np.load('train_set_A_images.npy')
train_labels_A = np.load('train_set_A_labels.npy')

train_images_B = np.load('train_set_B_images.npy')
train_labels_B = np.load('train_set_B_labels.npy')

train_images_C = np.load('train_set_C_images.npy')
train_labels_C = np.load('train_set_C_labels.npy')

test_images_D = np.load('test_set_D_images.npy')
test_labels_D = np.load('test_set_D_labels.npy')

test_images_P = np.load('test_set_P_images.npy')
test_labels_P = np.load('test_set_P_labels.npy')

# Data Augmentation Layer
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip('horizontal_and_vertical'),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1)
])

# Number of Augmented Samples per Cracked Image
AUGMENT_PER_IMAGE = 5

def augment_cracked_images(images, labels):
    augmented_images, augmented_labels = [], []
    for i in tqdm(range(len(labels)), desc="Augmenting Cracked Images"):
        if labels[i] == 1:  # Only augment cracked images
            image = images[i]
            for _ in range(AUGMENT_PER_IMAGE):
                augmented_image = data_augmentation(tf.expand_dims(image, 0))
                augmented_images.append(augmented_image[0].numpy())
                augmented_labels.append(1)
    return np.array(augmented_images), np.array(augmented_labels)

# Function to train and evaluate the model
def train_and_evaluate(train_images, train_labels, test_images, test_labels, set_name):
    # Augment cracked images
    augmented_images, augmented_labels = augment_cracked_images(train_images, train_labels)
    train_images_balanced = np.concatenate((train_images, augmented_images), axis=0)
    train_labels_balanced = np.concatenate((train_labels, augmented_labels), axis=0)

    # Print class distribution after augmentation
    train_class_distribution_balanced = Counter(train_labels_balanced)
    print(f"Balanced Training Set Class Distribution for {set_name}: {train_class_distribution_balanced}")

    # Load VGG16 pre-trained model
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(256, 256, 3))
    base_model.trainable = False

    # Build transfer learning model
    model_transfer = models.Sequential([
        base_model,
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])

    model_transfer.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])
    callbacks_transfer = [
        ModelCheckpoint(f'best_model_transfer_{set_name}.keras', monitor='val_accuracy', save_best_only=True, verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=5, verbose=1, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
    ]

    # Train the model
    history_transfer = model_transfer.fit(
        train_images_balanced,
        train_labels_balanced,
        validation_data=(test_images, test_labels),
        epochs=20,
        batch_size=32,
        callbacks=callbacks_transfer,
        verbose=1
    )

    # Unfreeze layers and fine-tune
    base_model.trainable = True
    model_transfer.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    history_fine_tune = model_transfer.fit(
        train_images_balanced,
        train_labels_balanced,
        validation_data=(test_images, test_labels),
        epochs=20,
        batch_size=32,
        callbacks=callbacks_transfer,
        verbose=1
    )

    # Evaluate on test set
    predictions = model_transfer.predict(test_images)
    predicted_labels = (predictions >= 0.15).astype(int).flatten()

    # Compute and print confusion matrix and metrics
    cm = confusion_matrix(test_labels, predicted_labels)
    precision = precision_score(test_labels, predicted_labels, zero_division=0)
    recall = recall_score(test_labels, predicted_labels, zero_division=0)
    report = classification_report(test_labels, predicted_labels, target_names=['Uncracked', 'Cracked'], zero_division=0)

    print(f"Confusion Matrix ({set_name} Test Set):")
    print(cm)
    print(f"\nPrecision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print("\nClassification Report:")
    print(report)

    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Uncracked (0)', 'Cracked (1)'],
                yticklabels=['Uncracked (0)', 'Cracked (1)'])
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title(f'Confusion Matrix ({set_name} Test Set)')
    plt.show()

# Define train and test sets
datasets = {
    "Train Set A": (train_images_A, train_labels_A),
    "Train Set B": (train_images_B, train_labels_B),
    "Train Set C": (train_images_C, train_labels_C)
}

test_sets = {
    "Test Set D": (test_images_D, test_labels_D),
    "Test Set P": (test_images_P, test_labels_P)
}

# Train and evaluate for each combination of train and test sets
for train_name, (train_images, train_labels) in datasets.items():
    for test_name, (test_images, test_labels) in test_sets.items():
        print(f"\nTraining {train_name} and Evaluating on {test_name}...")
        train_and_evaluate(train_images, train_labels, test_images, test_labels, f'{train_name}_to_{test_name}')


MemoryError: Unable to allocate 4.36 GiB for an array with shape (1169424384,) and data type float32

In [None]:
#!pip install keras_cv

In [None]:
#!pip install albumentations
#!pip install efficientnet