In [4]:
# Standard library imports
import csv
import math
import os
import pickle
import random
import re

# Third-party library imports
import config
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras import layers, models, optimizers, callbacks, regularizers, utils
from keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Dropout, LSTM, Input
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.models import load_model
from keras.utils import to_categorical
from rich.console import Console
from rich.table import Table
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from tqdm import tqdm
from numpy import mean
from collections import Counter



# Configurations
model_path = config.model_path
dataset_path = config.dataset_path
PGB_path = config.PGB_path
RGB_path = config.RGB_path
csv_file = config.csv_file
preprocessor_file = config.preprocessor_file
train_path = config.train_path
val_path = config.val_path
chunk_size = config.chunk_size
csv_directory = config.csv_directory
data_root_folder = config.data_root_folder
sequence_length = config.sequence_length
sequences_directory = config.sequences_directory
num_features = config.num_features
processed_bases = config.processed_bases
batch_size = config.batch_size
epochs = config.epochs
patience = config.patience
learning_rate = config.learning_rate
n_splits = config.n_splits
model_save_directory = config.model_save_directory
reg_value = config.reg_value
num_train_samples = config.num_train_samples
num_test_samples = config.num_test_samples
reg_type = config.reg_type
n_samples = config.n_samples

def reset_random_seeds(seed_value=42):
    tf.random.set_seed(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

# Call this function at the beginning of your script
reset_random_seeds()


# Auxiliary functions

In [5]:
def extract_fault(file_name):
    fault_mapping = {
        '0Health': 'HEA', '1Chipped': 'CTF', '2Miss': 'MTF', 
        '3Root': 'RCF', '4Surface': 'SWF', '5Ball': 'BWF', 
        '6Combination': 'CWF', '7Inner': 'IRF', '8Outer': 'ORF'
    }
    for key, value in fault_mapping.items():
        if key in file_name:
            return value
    return None

def make_csv_writer(csv_file):
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Channel1', 'Channel2', 'Channel3', 'Channel4', 'Channel5', 'Channel6', 'Channel7', 'Channel8', 'Fault'])
    return csv_writer

def generate_csv(output_directory, root_path, speed, experiment, files, num_train_samples, num_test_samples):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    train_filename_suffix = f"{speed}_{experiment}_train" if experiment else f"{speed}_train"
    test_filename_suffix = f"{speed}_{experiment}_test" if experiment else f"{speed}_test"
    
    train_output_file_path = os.path.join(output_directory, f"PGB_{train_filename_suffix}.csv")
    test_output_file_path = os.path.join(output_directory, f"PGB_{test_filename_suffix}.csv")
    
    with open(train_output_file_path, 'w', newline='', encoding='utf-8') as train_csvfile, \
        open(test_output_file_path, 'w', newline='', encoding='utf-8') as test_csvfile:
        train_csv_writer = make_csv_writer(train_csvfile)
        test_csv_writer = make_csv_writer(test_csvfile)
        
        for file in tqdm(files, desc=f"Processing {speed} {experiment}", unit="file"):
            fault_type = extract_fault(file)
            file_path = os.path.join(root_path, file)
            
            total_rows = num_train_samples + num_test_samples
            data = pd.read_csv(file_path, sep='\t', header=None, encoding='ISO-8859-1', skiprows=1, nrows=total_rows)
            train_samples, test_samples = data.iloc[:num_train_samples, :], data.iloc[num_train_samples:total_rows, :]
            
            for index, row in train_samples.iterrows():
                train_csv_writer.writerow(row[:8].tolist() + [fault_type])
            
            for index, row in test_samples.iterrows():
                test_csv_writer.writerow(row[:8].tolist() + [fault_type])

def process_pgb_data(data_root_folder, csv_directory, num_train_samples, num_test_samples):
    for root, dirs, files in os.walk(data_root_folder):
        parts = root.split(os.sep)
        if 'Variable_speed' in parts:
            speed = "Variable_speed"
            experiment_dir = parts[-1]  # Get the last part as the experiment name
            exp_files = [f for f in os.listdir(root) if f.endswith('.txt')]
            generate_csv(csv_directory, root, speed, experiment_dir, exp_files, num_train_samples, num_test_samples)
        elif 'PGB' in parts and files:
            speed = parts[-1]  # Last part of 'root' is the speed directory
            generate_csv(csv_directory, root, speed, '', files, num_train_samples, num_test_samples)
            
            
def overview_csv_files(directory):
    data = []
    all_faults = set()

    for file in os.listdir(directory):
        if file.endswith(".csv"):
            file_path = os.path.join(directory, file)
            df = pd.read_csv(file_path)

            # Check if the CSV is empty (aside from the header)
            if df.shape[0] == 0:
                # Delete the empty CSV file
                os.remove(file_path)
                print(f"Deleted empty file: {file_path}")
                continue  # Skip further processing for this file

            num_samples = len(df)
            fault_distribution = Counter(df['Fault'])
            all_faults.update(fault_distribution.keys())
            data.append({'File Name': file, 'Number of Samples': num_samples, **fault_distribution})

    if not data:  # If no data has been gathered, exit the function
        print("No data found.")
        return

    overview_df = pd.DataFrame(data)
    for fault in all_faults:
        if fault not in overview_df.columns:
            overview_df[fault] = 0

    cols = ['File Name', 'Number of Samples'] + sorted(all_faults)
    overview_df = overview_df[cols]
    overview_df.fillna(0, inplace=True)
    overview_df.loc[:, 'Number of Samples':] = overview_df.loc[:, 'Number of Samples':].astype(int)

    overview_df = overview_df.sort_values(by='File Name')
    print(overview_df.to_string(index=False))
    
def load_and_scale_data(csv_path, scaler=None, save_scaler_path=None):
    """
    Loads data from a CSV file, scales the features (excluding the 'Fault' column), 
    and returns the scaled DataFrame. Optionally saves the scaler model.
    """
    # Load the data
    data = pd.read_csv(csv_path)
    
    # Separate features and target
    features = data.columns[:-1]  # Assuming the last column is the target
    X = data[features]
    y = data['Fault']

    # Apply scaling
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        if save_scaler_path:
            joblib.dump(scaler, save_scaler_path)
    else:
        X_scaled = scaler.transform(X)
    
    # Combine scaled features with target
    scaled_df = pd.DataFrame(X_scaled, columns=features)
    scaled_df['Fault'] = y
    
    return scaled_df

def create_sequences(df, sequence_length):
    sequences = []
    labels = []
    fault_types = df['Fault'].unique()

    for fault in fault_types:
        df_fault = df[df['Fault'] == fault]
        X = df_fault.drop('Fault', axis=1).values
        y = df_fault['Fault'].iloc[0]  # Updated to use iloc for consistency
        
        for i in range(len(df_fault) - sequence_length + 1):
            sequences.append(X[i:i+sequence_length])
            labels.append(fault)  # Keep the fault type as is
    
    return np.array(sequences), np.array(labels)


def save_sequences(input_directory, output_directory, sequence_length):
    """
    Generates sequences and saves them as NumPy files, one for sequences and one for labels.
    
    Parameters:
    - input_directory: The directory with the original, scaled data files.
    - output_directory: The directory where the NumPy sequence files will be saved.
    - sequence_length: The number of consecutive samples in each sequence.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    for file_name in tqdm(os.listdir(input_directory), desc="Generating sequences"):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(input_directory, file_name))
            sequences, labels = create_sequences(df, sequence_length)
            
            # File names for sequences and labels
            base_name = os.path.splitext(file_name)[0]
            sequences_file_path = os.path.join(output_directory, f"{base_name}_sequences.npy")
            labels_file_path = os.path.join(output_directory, f"{base_name}_labels.npy")
            
            # Save sequences and labels
            np.save(sequences_file_path, sequences)
            np.save(labels_file_path, labels)
            
def extract_speed_from_filename(file_name):
    """
    Extracts the speed from the filename.
    Returns the numeric speed for fixed speeds, or -1 for variable speeds.
    """
    fixed_speed_match = re.search(r"PGB_(\d+)_", file_name)
    if fixed_speed_match:
        return int(fixed_speed_match.group(1))
    variable_speed_match = re.search(r"Variable_speed", file_name)
    if variable_speed_match:
        return -1  # Special value for variable speeds
    return None

def add_speed_feature_and_save(input_directory, output_directory, sequence_length):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    for file_name in tqdm(os.listdir(input_directory), desc="Processing files"):
        if file_name.endswith('.csv'):
            speed = extract_speed_from_filename(file_name)
            df = pd.read_csv(os.path.join(input_directory, file_name))
            df['Speed'] = speed  # Add speed as a new column
            
            sequences, labels = create_sequences(df, sequence_length)
            
            base_name = os.path.splitext(file_name)[0]
            sequences_file_path = os.path.join(output_directory, f"{base_name}_sequences.npy")
            labels_file_path = os.path.join(output_directory, f"{base_name}_labels.npy")
            
            np.save(sequences_file_path, sequences)
            np.save(labels_file_path, labels)
            
def display_samples(sequences_file_path, labels_file_path, num_samples=1):
    """
    Displays a specified number of samples from the sequences and labels .npy files.
    
    Parameters:
    - sequences_file_path: Path to the .npy file containing sequences.
    - labels_file_path: Path to the .npy file containing labels.
    - num_samples: Number of samples to display. Default is 5.
    """
    # Load the sequences and labels
    sequences = np.load(sequences_file_path)
    labels = np.load(labels_file_path)
    
    # Determine the number of samples to display (cannot exceed the length of the data)
    num_samples = min(num_samples, len(sequences))
    
    # Display the specified number of samples
    for i in range(num_samples):
        print(f"Sample {i+1}:")
        print("Sequence:")
        print(sequences[i])
        print("Label:")
        print(labels[i])
        print("-" * 50)  # Separator for readability
        
def memmap_append_and_save(input_directory, output_directory, dataset_type, file_type):
    # Update the output file path to indicate pickle format
    output_file_path = os.path.join(output_directory, f"{dataset_type}_merged_{file_type}.pkl")

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    merged_data = None
    current_size = 0

    file_names = [fn for fn in os.listdir(input_directory) if fn.endswith(f'_{file_type}.npy') and dataset_type in fn]
    for file_name in tqdm(file_names, desc=f"Merging {dataset_type} {file_type}"):
        path = os.path.join(input_directory, file_name)
        data = np.load(path)

        # Adjust for both 1D and 2D+ data
        new_shape = (current_size + data.shape[0],) + data.shape[1:] if len(data.shape) > 1 else (current_size + data.shape[0],)
        if merged_data is None:
            # Initially, directly use the loaded data
            merged_data = data.copy()
        else:
            # Concatenate new data
            merged_data = np.concatenate((merged_data, data), axis=0)
        
        current_size += data.shape[0]

    # After processing all files, save the merged data using pickle
    with open(output_file_path, 'wb') as f:
        pickle.dump(merged_data, f, protocol=pickle.HIGHEST_PROTOCOL)
    
    print(f"{dataset_type.capitalize()} {file_type} data merged and saved to {output_file_path} in pickle format")

def merge_npy_files_with_memmap_separated(input_directory, output_directory):
    for dataset_type in ['train', 'test']:
        for file_type in ['sequences', 'labels']:
            memmap_append_and_save(input_directory, output_directory, dataset_type, file_type)
            
def calculate_class_accuracies(predictions, true_labels):
    pred_labels = np.argmax(predictions, axis=1)
    true_labels = np.argmax(true_labels, axis=1)
    
    # Initialize a dictionary to store accuracy for each class present in true_labels
    class_accuracies = {}
    
    for class_index in np.unique(true_labels):  # Loop only through classes present in true_labels
        class_mask = true_labels == class_index
        
        # Calculate accuracy for the current class
        class_accuracies[class_index] = accuracy_score(true_labels[class_mask], pred_labels[class_mask])
    
    return class_accuracies

def permutation_importance_per_class(model, X_val, y_val, n_repeats=10, n_samples=None):
    n_samples = n_samples if n_samples is not None else X_val.shape[0]
    random_indices = np.random.choice(X_val.shape[0], size=n_samples, replace=False)
    X_val_subset = X_val[random_indices]
    y_val_subset = y_val[random_indices]
    
    # Get baseline class-specific accuracies
    baseline_predictions = model.predict(X_val_subset, verbose = 0)
    baseline_class_accuracies = calculate_class_accuracies(baseline_predictions, y_val_subset)
    
    # Prepare storage for importances, using a dictionary to accommodate variable class presence
    feature_importances = {class_index: np.zeros((X_val.shape[2], n_repeats)) for class_index in baseline_class_accuracies.keys()}
    
    for feature_index in tqdm(range(X_val.shape[2]), desc='Calculating Feature Importance'):
        for n in range(n_repeats):
            saved_feature = X_val_subset[:, :, feature_index].copy()
            np.random.shuffle(X_val_subset[:, :, feature_index])
            
            permuted_predictions = model.predict(X_val_subset, verbose = 0)
            permuted_class_accuracies = calculate_class_accuracies(permuted_predictions, y_val_subset)
            
            for class_index in baseline_class_accuracies.keys():
                feature_importances[class_index][feature_index, n] = baseline_class_accuracies[class_index] - permuted_class_accuracies.get(class_index, 0)
            
            X_val_subset[:, :, feature_index] = saved_feature
    
    # Average the importance scores across repeats and prepare formatted output
    average_importances = {class_index: importances.mean(axis=1) for class_index, importances in feature_importances.items()}
    
    # Format the output
    formatted_importances = {f"Class {class_index}": importance.tolist() for class_index, importance in average_importances.items()}
    return formatted_importances


            
def kfold_cross_validation(X, y, num_folds=5):
    input_shape = X.shape[1:]  # Assuming X is (num_samples, time_steps, features)
    num_classes = y.shape[1]
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    fold_no = 1
    for train, test in kfold.split(X, y):
        print(f"Training on fold {fold_no}...")
        
        model = create_model(input_shape, num_classes, l2_reg=reg_value)
        lr_scheduler = LearningRateScheduler(lr_schedule, verbose=0)
        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
        
        model.fit(X[train], y[train], validation_data=(X[test], y[test]),
                epochs=epochs, batch_size=batch_size, callbacks=[early_stopping, lr_scheduler], verbose=1)
        
        fold_no += 1
        
        
        
# version that takes speed by speed

def lr_schedule(epoch, lr):
    if epoch % 10 == 0 and epoch > 0:
        return lr * 0.1
    return lr

def load_sequences(sequence_file_path, label_file_path):
    sequences = np.load(sequence_file_path)
    labels = np.load(label_file_path)
    encoder = LabelEncoder()
    labels_encoded = encoder.fit_transform(labels)
    labels_onehot = to_categorical(labels_encoded)
    return sequences, labels_onehot

# def create_model(input_shape, num_classes):
#     model = Sequential([
#         LSTM(100, return_sequences=True, input_shape=input_shape),
#         Dropout(0.2),
#         LSTM(100),
#         Dropout(0.2),
#         Dense(100, activation='relu'),
#         Dense(num_classes, activation='softmax')
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Input
from keras.regularizers import l2
from keras.optimizers import Adam
from keras import regularizers

def create_model(input_shape, num_classes, reg_type='l2', reg_value=0.001, return_logits=False):
    if reg_type == 'l2':
        regularizer = regularizers.l2(reg_value)
    elif reg_type == 'l1':
        regularizer = regularizers.l1(reg_value)
    else:
        raise ValueError("Invalid regularizer type. Choose 'l1' or 'l2'.")

    model = Sequential([
        Input(shape=input_shape),
        LSTM(256, return_sequences=True, kernel_regularizer=regularizer),
        Dropout(0.3),
        LSTM(128, return_sequences=True, kernel_regularizer=regularizer),
        Dropout(0.3),
        LSTM(64, kernel_regularizer=regularizer),
        Dropout(0.3),
        Dense(128, activation='relu', kernel_regularizer=regularizer),
        Dropout(0.3),
        Dense(64, activation='relu', kernel_regularizer=regularizer),
        Dropout(0.3)
    ])
    
    if return_logits:
        # If we want the model to return logits, add a Dense layer without activation
        model.add(Dense(num_classes))
    else:
        # Otherwise, add a Dense layer with softmax activation to return probabilities
        model.add(Dense(num_classes, activation='softmax'))
    
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [3]:
process_pgb_data(data_root_folder, csv_directory, num_train_samples, num_test_samples)
overview_csv_files(csv_directory)

Processing Variable_speed Variable_speed: 0file [00:00, ?file/s]
Processing Variable_speed Experiment8: 100%|██████████| 9/9 [00:19<00:00,  2.14s/file]
Processing Variable_speed Experiment4: 100%|██████████| 9/9 [00:20<00:00,  2.25s/file]
Processing Variable_speed Experiment6: 100%|██████████| 9/9 [00:19<00:00,  2.19s/file]
Processing Variable_speed Experiment2: 100%|██████████| 9/9 [00:17<00:00,  1.95s/file]
Processing Variable_speed Experiment5: 100%|██████████| 9/9 [00:17<00:00,  1.91s/file]
Processing Variable_speed Experiment1: 100%|██████████| 9/9 [00:15<00:00,  1.69s/file]
Processing Variable_speed Experiment9: 100%|██████████| 9/9 [00:17<00:00,  1.92s/file]
Processing Variable_speed Experiment7: 100%|██████████| 9/9 [00:17<00:00,  1.96s/file]
Processing Variable_speed Experiment3: 100%|██████████| 9/9 [00:19<00:00,  2.19s/file]
Processing Variable_speed Experiment10: 100%|██████████| 9/9 [00:18<00:00,  2.05s/file]
Processing 40_0 : 100%|██████████| 9/9 [00:17<00:00,  1.97s/file

Deleted empty file: /home/ubuntu/dds_paper/DDS_Paper/data/DDS_Data_SEU/data/csvs/PGB_Variable_speed_Variable_speed_train.csv
Deleted empty file: /home/ubuntu/dds_paper/DDS_Paper/data/DDS_Data_SEU/data/csvs/PGB_Variable_speed_Variable_speed_test.csv
                                File Name  Number of Samples   BWF   CTF   CWF   HEA   IRF   MTF   ORF   RCF   SWF
                        PGB_20_0_test.csv              36000  4000  4000  4000  4000  4000  4000  4000  4000  4000
                       PGB_20_0_train.csv             180000 20000 20000 20000 20000 20000 20000 20000 20000 20000
                        PGB_30_0_test.csv              36000  4000  4000  4000  4000  4000  4000  4000  4000  4000
                       PGB_30_0_train.csv             180000 20000 20000 20000 20000 20000 20000 20000 20000 20000
                        PGB_30_1_test.csv              36000  4000  4000  4000  4000  4000  4000  4000  4000  4000
                       PGB_30_1_train.csv             180000 

In [4]:
# scale data

# Iterate over your dataset files
for root, dirs, files in os.walk(csv_directory):
    for file in sorted(files):
        if file.endswith('.csv') and not file.endswith('_scaled.csv'):  # Process only unscaled .csv files
            csv_path = os.path.join(root, file)
            if 'train' in file:
                # Handle training data
                scaler_path = os.path.join(root, 'scaler_' + file.replace('.csv', '.joblib'))
                scaled_train_df = load_and_scale_data(csv_path, save_scaler_path=scaler_path)
                # Save the scaled training data
                scaled_csv_path = csv_path.replace('.csv', '_scaled.csv')
                scaled_train_df.to_csv(scaled_csv_path, index=False)
            elif 'test' in file:
                # Handle testing data
                scaler_path = os.path.join(root, 'scaler_' + file.replace('_test.csv', '_train.joblib'))
                scaler = joblib.load(scaler_path) if os.path.exists(scaler_path) else None
                scaled_test_df = load_and_scale_data(csv_path, scaler=scaler)
                # Save the scaled testing data
                scaled_csv_path = csv_path.replace('.csv', '_scaled.csv')
                scaled_test_df.to_csv(scaled_csv_path, index=False)

            # Delete the original unscaled .csv file
            os.remove(csv_path)
            
#create sequences
save_sequences(csv_directory, sequences_directory, sequence_length)

#add_speed_feature_and_save(csv_directory, sequences_directory, sequence_length)

#merge_npy_files_with_memmap_separated(sequences_directory, sequences_directory)

Generating sequences: 100%|██████████| 58/58 [00:31<00:00,  1.83it/s]


# Training the model

In [6]:

kf = KFold(n_splits=n_splits, shuffle=False)
counter = 0
console = Console()

# Placeholder for processed base names and metrics
processed_bases = set()
metrics_summary = []

for file in sorted(os.listdir(sequences_directory)):
    if "_train_scaled_sequences.npy" in file:
        base_name = file.replace("_train_scaled_sequences.npy", "")
        if base_name in processed_bases:
            continue
        counter+=1
        
        model_filepath = os.path.join(model_save_directory, f"saved.h5")
        
            
            
        # Load sequences and labels
        train_sequence_file_path = os.path.join(sequences_directory, f"{base_name}_train_scaled_sequences.npy")
        train_label_file_path = os.path.join(sequences_directory, f"{base_name}_train_scaled_labels.npy")
        X_train, y_train = load_sequences(train_sequence_file_path, train_label_file_path)

        # Assuming the existence of a test set (adjust if necessary)
        test_sequence_file_path = os.path.join(sequences_directory, f"{base_name}_test_scaled_sequences.npy")
        test_label_file_path = os.path.join(sequences_directory, f"{base_name}_test_scaled_labels.npy")
        X_test, y_test = load_sequences(test_sequence_file_path, test_label_file_path)

        # Merge for cross-validation
        X = np.concatenate((X_train, X_test), axis=0)
        y = np.concatenate((y_train, y_test), axis=0)

        num_classes = y_train.shape[1]
        input_shape = (sequence_length, num_features)
        fold_metrics = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
            console.print(f"[bold green]Training fold {fold + 1}/{n_splits} for {base_name}[/]")
            X_train_fold, X_val_fold = X[train_idx], X[val_idx]
            y_train_fold, y_val_fold = y[train_idx], y[val_idx]

            model = create_model(input_shape, num_classes, reg_type = reg_type, reg_value=reg_value)
            lr_scheduler = LearningRateScheduler(lr_schedule)
            early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

            # Add ModelCheckpoint and TensorBoard for improved monitoring and model saving
            model_filepath = os.path.join(model_save_directory, f"model_{base_name}_fold_{fold+1}.h5")
            checkpoint = ModelCheckpoint(model_filepath, save_best_only=True, monitor='val_loss')
            tensorboard = TensorBoard(log_dir=f"./logs/{base_name}_fold_{fold+1}")



            history = model.fit(X_train_fold, y_train_fold, validation_data=(X_val_fold, y_val_fold),
                                epochs=epochs, batch_size=batch_size, callbacks=[early_stopping, lr_scheduler, checkpoint, tensorboard], verbose=1)

            
            # Assuming your model outputs softmax probabilities, adjust as necessary
            y_val_pred = model.predict(X_val_fold)
            y_val_pred_classes = np.argmax(y_val_pred, axis=1)
            y_val_true_classes = np.argmax(y_val_fold, axis=1)

            # Calculate and store metrics for this fold
            accuracy = accuracy_score(y_val_true_classes, y_val_pred_classes)
            precision = precision_score(y_val_true_classes, y_val_pred_classes, average='macro', zero_division=0)
            recall = recall_score(y_val_true_classes, y_val_pred_classes, average='macro', zero_division=0)
            f1 = f1_score(y_val_true_classes, y_val_pred_classes, average='macro')
            fold_metrics.append((accuracy, precision, recall, f1))

            console.print(f"Fold {fold+1} Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
            
            class_importances = permutation_importance_per_class(model, X_val_fold, y_val_fold, n_repeats=4, n_samples=n_samples)


            
                # After processing all folds for the current CSV pair
        if fold_metrics:
            # Calculate the average of each metric across all folds
            avg_accuracy = mean([metric[0] for metric in fold_metrics])
            avg_precision = mean([metric[1] for metric in fold_metrics])
            avg_recall = mean([metric[2] for metric in fold_metrics])
            avg_f1 = mean([metric[3] for metric in fold_metrics])

            # Append averaged metrics to the metrics_summary for overall analysis if needed
            metrics_summary.append((base_name, avg_accuracy, avg_precision, avg_recall, avg_f1))

            # Print the averages
            console.print(f"[bold magenta]Average metrics for {base_name} across {n_splits} folds:[/]")
            console.print(f"Average Accuracy: {avg_accuracy:.4f}")
            console.print(f"Average Precision: {avg_precision:.4f}")
            console.print(f"Average Recall: {avg_recall:.4f}")
            console.print(f"Average F1: {avg_f1:.4f}\n")
        

        
    if counter!=0:
        break


console.print(f"[bold blue]Model for {base_name} saved.[/]")
# Optionally, after all file pairs have been processed, print a summary of averages across all file pairs
console.print("[bold blue]Overall Averages Across All File Pairs:[/]")
overall_avg_accuracy = mean([metrics[1] for metrics in metrics_summary])
overall_avg_precision = mean([metrics[2] for metrics in metrics_summary])
overall_avg_recall = mean([metrics[3] for metrics in metrics_summary])
overall_avg_f1 = mean([metrics[4] for metrics in metrics_summary])

console.print(f"Overall Average Accuracy: {overall_avg_accuracy:.4f}")
console.print(f"Overall Average Precision: {overall_avg_precision:.4f}")
console.print(f"Overall Average Recall: {overall_avg_recall:.4f}")
console.print(f"Overall Average F1: {overall_avg_f1:.4f}")




  saving_api.save_model(




Calculating Feature Importance: 100%|██████████| 8/8 [00:24<00:00,  3.09s/it]




  saving_api.save_model(




Calculating Feature Importance: 100%|██████████| 8/8 [00:24<00:00,  3.10s/it]




  saving_api.save_model(




Calculating Feature Importance: 100%|██████████| 8/8 [00:24<00:00,  3.07s/it]




  saving_api.save_model(




Calculating Feature Importance: 100%|██████████| 8/8 [00:25<00:00,  3.20s/it]


In [8]:
# Save the model at the end
model.save(model_path)
console.print(f"[bold blue]Model for {base_name} saved.[/]")

In [26]:
model = load_weights(model_path)

In [3]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

def calculate_class_accuracies(predictions, true_labels):
    pred_labels = np.argmax(predictions, axis=1)
    true_labels = np.argmax(true_labels, axis=1)
    
    # Initialize a dictionary to store accuracy for each class present in true_labels
    class_accuracies = {}
    
    for class_index in np.unique(true_labels):  # Loop only through classes present in true_labels
        class_mask = true_labels == class_index
        
        # Calculate accuracy for the current class
        class_accuracies[class_index] = accuracy_score(true_labels[class_mask], pred_labels[class_mask])
    
    return class_accuracies

def permutation_importance_per_class(model, X_val, y_val, n_repeats=10, n_samples=None):
    n_samples = n_samples if n_samples is not None else X_val.shape[0]
    random_indices = np.random.choice(X_val.shape[0], size=n_samples, replace=False)
    X_val_subset = X_val[random_indices]
    y_val_subset = y_val[random_indices]
    
    # Get baseline class-specific accuracies
    baseline_predictions = model.predict(X_val_subset)
    baseline_class_accuracies = calculate_class_accuracies(baseline_predictions, y_val_subset)
    
    # Prepare storage for importances, using a dictionary to accommodate variable class presence
    feature_importances = {class_index: np.zeros((X_val.shape[2], n_repeats)) for class_index in baseline_class_accuracies.keys()}
    
    for feature_index in tqdm(range(X_val.shape[2]), desc='Calculating Feature Importance'):
        for n in range(n_repeats):
            saved_feature = X_val_subset[:, :, feature_index].copy()
            np.random.shuffle(X_val_subset[:, :, feature_index])
            
            permuted_predictions = model.predict(X_val_subset)
            permuted_class_accuracies = calculate_class_accuracies(permuted_predictions, y_val_subset)
            
            for class_index in baseline_class_accuracies.keys():
                feature_importances[class_index][feature_index, n] = baseline_class_accuracies[class_index] - permuted_class_accuracies.get(class_index, 0)
            
            X_val_subset[:, :, feature_index] = saved_feature
    
    # Average the importance scores across repeats and prepare formatted output
    average_importances = {class_index: importances.mean(axis=1) for class_index, importances in feature_importances.items()}
    
    # Format the output
    formatted_importances = {f"Class {class_index}": importance.tolist() for class_index, importance in average_importances.items()}
    return formatted_importances

# Example usage
n_samples = 500 if X_val_fold.shape[0] > 1000 else X_val_fold.shape[0]
class_importances = permutation_importance_per_class(model, X_val_fold, y_val_fold, n_repeats=4, n_samples=n_samples)

for class_id, importances in class_importances.items():
    print(f"{class_id} Feature Importances:", importances)


NameError: name 'X_val_fold' is not defined