In [None]:
import os
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold

# Specify the root directory containing the class subfolders.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
root_directory = '/Users/GraceWang/Desktop/UNLV2023/hearingLossOriginalCodebase/REU-Hearing-Loss-Project/machine learning/data split by subject'

# Class folders; the position in this list is the integer label
# (0 for Healthy, 1 for Hearing Impaired).
subfolder_paths = [os.path.join(root_directory, "Healthy"), os.path.join(root_directory, "Hearing Impaired")]

# One entry per subject: path of the subject folder and its class label.
data_paths = []
labels = []

for label, subfolder_path in enumerate(subfolder_paths):
    # sorted(): os.listdir returns entries in arbitrary order, which would make the
    # seeded fold assignment below differ between machines and runs.
    for subject_folder in sorted(os.listdir(subfolder_path)):
        subject_path = os.path.join(subfolder_path, subject_folder)
        # Only real subject folders count as samples; skip stray files such as
        # macOS ".DS_Store" and hidden folders such as ".ipynb_checkpoints".
        if subject_folder.startswith('.') or not os.path.isdir(subject_path):
            continue
        data_paths.append(subject_path)
        labels.append(label)

# Convert lists to arrays so they can be indexed with the fold index arrays
data_paths = np.array(data_paths)
labels = np.array(labels)

# Number of folds for k-fold cross-validation (subjects, not images, are split)
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# CNN Model
def build_model(input_shape):
    """Assemble and compile the CNN classifier.

    The network is three Conv2D (3x3, ReLU) + 2x2 max-pooling stages with
    32, 64 and 128 filters, followed by Flatten, Dense(512, ReLU),
    Dropout(0.5) and a 2-unit softmax output. It is compiled with the
    'adam' optimizer, categorical cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Shape handed to the first Conv2D layer as `input_shape`.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        # Convolutional feature extractor
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        # Classification head
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax'),
    ]
    network = Sequential(layer_stack)

    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

# Class folder names in label order (index 0 = Healthy, 1 = Hearing Impaired)
class_names = [os.path.basename(class_dir) for class_dir in subfolder_paths]

# File extensions treated as images when scanning subject folders
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')


def build_image_frame(subject_dirs, subject_labels):
    """List every image under the given subject folders together with its class name.

    Args:
        subject_dirs: Iterable of subject folder paths.
        subject_labels: Integer class labels (indices into `class_names`),
            aligned with `subject_dirs`.

    Returns:
        pandas.DataFrame with the columns 'filename' (full image path) and
        'class' (class folder name), as consumed by `flow_from_dataframe`.
    """
    import pandas as pd  # local import: only needed to feed flow_from_dataframe

    rows = []
    for subject_dir, subject_label in zip(subject_dirs, subject_labels):
        # Assumes the images live somewhere below the subject folder -- TODO confirm layout
        for dirpath, dirnames, filenames in os.walk(subject_dir):
            dirnames.sort()  # deterministic traversal order
            for filename in sorted(filenames):
                if filename.lower().endswith(image_extensions):
                    rows.append((os.path.join(dirpath, filename), class_names[int(subject_label)]))
    return pd.DataFrame(rows, columns=['filename', 'class'])


def make_generator(image_frame, shuffle):
    """Create a batch generator (rescaled to [0, 1], 224x224, batches of 32) over `image_frame`."""
    return ImageDataGenerator(rescale=1.0 / 255.0).flow_from_dataframe(
        image_frame,
        x_col='filename',
        y_col='class',
        # Fixed class list: the one-hot labels always have 2 columns, even when
        # the frame holds a single subject (and therefore a single class).
        classes=class_names,
        target_size=(224, 224),
        batch_size=32,
        class_mode='categorical',
        shuffle=shuffle)


# Seeded generator so the choice of held-out subjects is reproducible
rng = np.random.RandomState(42)

# Mean accuracy of each fold, for the summary across folds
all_fold_accuracies = []

# Iterate over K folds
for fold, (train_indices, test_indices) in enumerate(skf.split(data_paths, labels)):
    print(f"\nFold {fold + 1}/{k_folds}")

    # Subject folders (and their labels) for training and testing
    train_paths, test_paths = data_paths[train_indices], data_paths[test_indices]
    train_labels, test_labels = labels[train_indices], labels[test_indices]

    # Define the number of healthy and unhealthy subjects for test
    num_test_healthy = 2
    num_test_unhealthy = 2

    # Accuracies of the held-out subjects of this fold
    fold_accuracies = []

    # Pick the held-out subjects among the test subjects of this fold. The counts
    # are capped because a fold may contain fewer subjects of a class than requested.
    healthy_candidates = np.where(test_labels == 0)[0]
    unhealthy_candidates = np.where(test_labels == 1)[0]
    test_healthy_indices = rng.choice(
        healthy_candidates, min(num_test_healthy, len(healthy_candidates)), replace=False)
    test_unhealthy_indices = rng.choice(
        unhealthy_candidates, min(num_test_unhealthy, len(unhealthy_candidates)), replace=False)

    test_subjects_indices = np.concatenate([test_healthy_indices, test_unhealthy_indices]).astype(int)

    # The fold split already keeps every test subject out of the training set, so
    # nothing is appended to train_paths (doing so would leak the test subject).
    train_frame = build_image_frame(train_paths, train_labels)

    for subject_index in test_subjects_indices:
        test_subject_path = test_paths[subject_index]

        test_frame = build_image_frame([test_subject_path], [test_labels[subject_index]])
        if test_frame.empty:
            print(f"No images found for {test_subject_path}; skipping")
            continue

        # Create data generator for the current iteration
        train_generator = make_generator(train_frame, shuffle=True)

        # Build a fresh model so no weights carry over between iterations
        model = build_model((224, 224, 3))

        # Train the model; without steps_per_epoch Keras runs len(train_generator)
        # batches, which includes the final partial batch.
        history = model.fit(train_generator, epochs=60)

        # Evaluate on every image of the held-out subject
        test_generator = make_generator(test_frame, shuffle=False)

        test_loss, test_accuracy = model.evaluate(test_generator)
        print("Test Loss:", test_loss)
        print("Test Accuracy:", test_accuracy)

        fold_accuracies.append(test_accuracy)

    # Average the accuracy results for the test subjects
    if fold_accuracies:
        fold_accuracy = np.mean(fold_accuracies)
        all_fold_accuracies.append(fold_accuracy)
        print(f"Average Accuracy for Fold {fold + 1}: {fold_accuracy}")

# Summary across folds
if all_fold_accuracies:
    print(f"\nAverage Accuracy Across All Folds: {np.mean(all_fold_accuracies)}")

# Note: You may need to adjust the paths and other parameters based on your specific dataset structure.


In [5]:
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split



# Assuming data is organized as follows:
# - Data split by subject folder
#   - Healthy folder
#     - Subject_001 folder
#     - Subject_002 folder
#     ...
#   - Hearing Impaired folder
#     - Subject_003 folder
#     - Subject_004 folder
#     ...

# Specify the root directory containing the class subfolders.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
root_directory = '/Users/GraceWang/Desktop/UNLV2023/hearingLossOriginalCodebase/REU-Hearing-Loss-Project/machine learning/data split by subject'

# Class folders; the position in this list is the integer label
# (0 for Healthy, 1 for Hearing Impaired).
subfolder_paths = [os.path.join(root_directory, "Healthy"), os.path.join(root_directory, "Hearing Impaired")]

# One entry per subject: path of the subject folder and its class label.
data_paths = []
labels = []

for label, subfolder_path in enumerate(subfolder_paths):
    # sorted(): os.listdir returns entries in arbitrary order, which would make the
    # seeded fold assignment below differ between machines and runs.
    for subject_folder in sorted(os.listdir(subfolder_path)):
        subject_path = os.path.join(subfolder_path, subject_folder)
        # Only real subject folders count as samples; skip stray files such as
        # macOS ".DS_Store" and hidden folders such as ".ipynb_checkpoints".
        if subject_folder.startswith('.') or not os.path.isdir(subject_path):
            continue
        data_paths.append(subject_path)
        labels.append(label)

# Convert lists to arrays so they can be indexed with the fold index arrays
data_paths = np.array(data_paths)
labels = np.array(labels)

# Number of folds for k-fold cross-validation (subjects, not images, are split)
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# CNN Model
def build_model(input_shape):
    """Create the compiled CNN used for the two-class image classification.

    Layers, in order: Conv2D(32) -> MaxPooling2D -> Conv2D(64) -> MaxPooling2D
    -> Conv2D(128) -> MaxPooling2D -> Flatten -> Dense(512, ReLU)
    -> Dropout(0.5) -> Dense(2, softmax). All convolutions use 3x3 kernels
    with ReLU; all pooling windows are 2x2.

    Args:
        input_shape: Shape given to the first Conv2D layer as `input_shape`.

    Returns:
        A Sequential model compiled with 'adam', categorical cross-entropy
        and the accuracy metric.
    """
    cnn = Sequential()

    # First stage carries the input shape
    cnn.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    cnn.add(MaxPooling2D(pool_size=(2, 2)))

    # Remaining convolution + pooling stages
    for n_filters in (64, 128):
        cnn.add(Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'))
        cnn.add(MaxPooling2D(pool_size=(2, 2)))

    # Dense classification head
    head = (
        Flatten(),
        Dense(units=512, activation='relu'),
        Dropout(rate=0.5),
        Dense(units=2, activation='softmax'),
    )
    for head_layer in head:
        cnn.add(head_layer)

    cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return cnn
# Class folder names in label order (index 0 = Healthy, 1 = Hearing Impaired)
class_names = [os.path.basename(class_dir) for class_dir in subfolder_paths]

# File extensions treated as images when scanning subject folders
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')


def build_image_frame(subject_dirs, subject_labels):
    """List every image under the given subject folders together with its class name.

    Args:
        subject_dirs: Iterable of subject folder paths.
        subject_labels: Integer class labels (indices into `class_names`),
            aligned with `subject_dirs`.

    Returns:
        pandas.DataFrame with the columns 'filename' (full image path) and
        'class' (class folder name), as consumed by `flow_from_dataframe`.
    """
    import pandas as pd  # local import: only needed to feed flow_from_dataframe

    rows = []
    for subject_dir, subject_label in zip(subject_dirs, subject_labels):
        # Assumes the images live somewhere below the subject folder -- TODO confirm layout
        for dirpath, dirnames, filenames in os.walk(subject_dir):
            dirnames.sort()  # deterministic traversal order
            for filename in sorted(filenames):
                if filename.lower().endswith(image_extensions):
                    rows.append((os.path.join(dirpath, filename), class_names[int(subject_label)]))
    return pd.DataFrame(rows, columns=['filename', 'class'])


# Dimensions of the images (224x224 with 3 channels)
img_width, img_height = 224, 224
input_shape = (img_width, img_height, 3)

# Integer label ids, used to pin the confusion matrix / report to both classes
label_ids = list(range(len(class_names)))

# Iterate over K folds
for fold, (train_indices, test_indices) in enumerate(skf.split(data_paths, labels)):
    print(f"\nFold {fold + 1}/{k_folds}")

    # Subject folders for training and testing
    train_paths, test_paths = data_paths[train_indices], data_paths[test_indices]

    # NOTE: the test fold doubles as validation data during training; there is
    # no separate validation split.
    train_frame = build_image_frame(train_paths, labels[train_indices])
    test_frame = build_image_frame(test_paths, labels[test_indices])

    # Create the CNN model (a fresh one for every fold)
    model = build_model(input_shape)
    model.summary()

    # Data generators for training and validation/testing
    train_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
    val_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

    train_generator = train_datagen.flow_from_dataframe(
        train_frame,
        x_col='filename',
        y_col='class',
        classes=class_names,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode='categorical')

    val_generator = val_datagen.flow_from_dataframe(
        test_frame,
        x_col='filename',
        y_col='class',
        classes=class_names,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode='categorical',
        # Keep file order fixed so predictions line up with val_generator.classes
        shuffle=False)

    # Training the model; without explicit step counts Keras iterates over
    # len(generator) batches, which includes the final partial batch.
    history = model.fit(
        train_generator,
        epochs=60,
        validation_data=val_generator)

    # Evaluate the model on test data
    test_loss, test_accuracy = model.evaluate(val_generator)
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    # Plotting the training and validation accuracy
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    # Plotting the confusion matrix
    y_true = np.asarray(val_generator.classes)
    y_pred = np.argmax(model.predict(val_generator), axis=-1)
    # labels=... keeps the matrix 2x2 even if a class is missing from this fold
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    print(cm)

    # Classification Report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))

Number of subjects: 2
List of subjects:
['Healthy', 'Hearing Impaired']


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=2.

In [None]:
# To implement the strategy of holding out different test subjects for each training and averaging the results, 
# you can modify the code within your k-fold cross-validation loop. Here's a modified version of your code to incorporate 
# this approach:


# ... (Your previous code remains unchanged up to this point)

# Number of folds for k-fold cross-validation
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Define the number of test subjects to hold out for evaluation
num_test_subjects = 3

# Dimensions of the images (224x224 with 3 channels); defined here so this cell
# does not depend on variables created inside another cell's loop.
img_width, img_height = 224, 224
input_shape = (img_width, img_height, 3)

# Class folder names in label order (index 0 = Healthy, 1 = Hearing Impaired)
class_names = [os.path.basename(class_dir) for class_dir in subfolder_paths]

# File extensions treated as images when scanning subject folders
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')


def build_image_frame(subject_dirs, subject_labels):
    """List every image under the given subject folders together with its class name.

    Args:
        subject_dirs: Iterable of subject folder paths.
        subject_labels: Integer class labels (indices into `class_names`),
            aligned with `subject_dirs`.

    Returns:
        pandas.DataFrame with the columns 'filename' (full image path) and
        'class' (class folder name), as consumed by `flow_from_dataframe`.
    """
    import pandas as pd  # local import: only needed to feed flow_from_dataframe

    rows = []
    for subject_dir, subject_label in zip(subject_dirs, subject_labels):
        # Assumes the images live somewhere below the subject folder -- TODO confirm layout
        for dirpath, dirnames, filenames in os.walk(subject_dir):
            dirnames.sort()  # deterministic traversal order
            for filename in sorted(filenames):
                if filename.lower().endswith(image_extensions):
                    rows.append((os.path.join(dirpath, filename), class_names[int(subject_label)]))
    return pd.DataFrame(rows, columns=['filename', 'class'])


def make_generator(image_frame, shuffle):
    """Create a batch generator (rescaled to [0, 1], batches of 32) over `image_frame`."""
    return ImageDataGenerator(rescale=1.0 / 255.0).flow_from_dataframe(
        image_frame,
        x_col='filename',
        y_col='class',
        # Fixed class list: the one-hot labels always have 2 columns, even when
        # the frame holds a single subject (and therefore a single class).
        classes=class_names,
        target_size=(img_width, img_height),
        batch_size=32,
        class_mode='categorical',
        shuffle=shuffle)


# Seeded generator so the choice of held-out subjects is reproducible
rng = np.random.RandomState(42)

# Lists to store results for each fold
fold_accuracies = []

# Iterate over K folds
for fold, (train_indices, test_indices) in enumerate(skf.split(data_paths, labels)):
    print(f"\nFold {fold + 1}/{k_folds}")

    # Subject folders (and their labels) for training and testing
    train_paths, test_paths = data_paths[train_indices], data_paths[test_indices]
    train_labels, test_labels = labels[train_indices], labels[test_indices]

    # Pick the held-out subjects of this fold; capped because a fold may hold
    # fewer test subjects than num_test_subjects.
    num_held_out = min(num_test_subjects, len(test_paths))
    test_subjects_indices = rng.choice(len(test_paths), num_held_out, replace=False)

    fold_accuracies_for_test_subjects = []

    # The fold split already keeps every test subject out of the training set, so
    # nothing is appended to train_paths (doing so would leak the test subject).
    train_frame = build_image_frame(train_paths, train_labels)

    for subject_index in test_subjects_indices:
        test_subject_path = test_paths[subject_index]

        test_frame = build_image_frame([test_subject_path], [test_labels[subject_index]])
        if test_frame.empty:
            print(f"No images found for {test_subject_path}; skipping")
            continue

        # Create data generators for the current iteration
        train_generator = make_generator(train_frame, shuffle=True)

        # Build a fresh model so no weights carry over between iterations
        model = build_model(input_shape)

        # Train the model; without steps_per_epoch Keras runs len(train_generator)
        # batches, which includes the final partial batch.
        history = model.fit(train_generator, epochs=60)

        # Evaluate on every image of the held-out subject
        val_generator = make_generator(test_frame, shuffle=False)

        test_loss, test_accuracy = model.evaluate(val_generator)
        print("Test Loss:", test_loss)
        print("Test Accuracy:", test_accuracy)

        fold_accuracies_for_test_subjects.append(test_accuracy)

    # Average the accuracy results for the test subjects
    if fold_accuracies_for_test_subjects:
        fold_accuracy = np.mean(fold_accuracies_for_test_subjects)
        fold_accuracies.append(fold_accuracy)

# Calculate the average accuracy across all folds
average_accuracy = np.mean(fold_accuracies) if fold_accuracies else float('nan')
print(f"\nAverage Accuracy Across All Folds: {average_accuracy}")
