# In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
from typing import Tuple, List

def _find_wav_files(folder_path: str) -> List[str]:
    """Return the sorted paths of all .wav files found recursively under folder_path."""
    # The extension is compared case-insensitively so files such as 'CLIP.WAV'
    # are not silently skipped. The result is sorted because os.walk yields
    # entries in a filesystem-dependent order, which would otherwise make the
    # row order of the resulting matrix irreproducible between runs/machines.
    return sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith('.wav')
    )


def process_wav_files_in_folder(folder_path: str) -> np.ndarray:
    """
    Load every readable .wav file under a folder into one zero-padded matrix.

    The folder is searched recursively. Files that cannot be loaded are
    reported on stdout and skipped; progress and a read summary are printed
    as well. Each file is loaded at its native sampling rate, so a warning is
    printed when the loaded files do not all share the same rate.

    Args:
        folder_path (str): The path to the directory containing .wav files.

    Returns:
        np.ndarray: A 2D float32 array with dimensions
                    (#readfiles, max_length_of_files). Rows follow the sorted
                    order of the file paths, and shorter files are
                    zero-padded at the end.
                    If the folder does not exist, contains no .wav files, or
                    no file could be read, an empty array of shape (0, 0) is
                    returned (so ``result.size == 0``).
    """
    # Every "nothing loaded" path returns this, so callers always receive a
    # 2D float32 array regardless of the outcome.
    empty_matrix = np.zeros((0, 0), dtype=np.float32)

    if not os.path.isdir(folder_path):
        print(f"Error: Folder not found at '{folder_path}'")
        return empty_matrix

    wav_files = _find_wav_files(folder_path)
    total_files = len(wav_files)
    if total_files == 0:
        print("No .wav files found in the specified folder.")
        return empty_matrix

    loaded_audio_data = []
    sample_rates = set()
    print("Starting explorative read of files...")

    for file_path in wav_files:
        try:
            # sr=None preserves the file's original sampling rate instead of
            # resampling; with librosa's default mono=True the result is a
            # 1-D array, which is what the row assignment below relies on.
            audio, sr = librosa.load(file_path, sr=None)
        except Exception as e:
            # Deliberately broad: corrupted or unsupported files can fail in
            # many ways (e.g. sf.LibsndfileError, ValueError), and one bad
            # file must not abort the whole scan.
            print(f"Could not read file (corrupted or unsupported format): {os.path.basename(file_path)}. Error: {e}")
        else:
            loaded_audio_data.append(audio)
            sample_rates.add(sr)
            print(f"Successfully loaded: {os.path.basename(file_path)}")

    read_files_count = len(loaded_audio_data)
    print("\n--- Read Summary ---")
    print(f"Successfully read files / Total .wav files: {read_files_count}/{total_files}")
    print("--------------------\n")

    if not loaded_audio_data:
        print("No valid audio data was loaded. Returning an empty matrix.")
        return empty_matrix

    if len(sample_rates) > 1:
        # Samples are stored as-is, so the same column index represents a
        # different point in time for rows recorded at different rates.
        print(f"Warning: loaded files use different sampling rates: {sorted(sample_rates)}")

    # The longest recording determines the number of columns.
    max_len = max(len(audio) for audio in loaded_audio_data)
    print(f"Maximum sample length found: {max_len}")

    # Start from zeros so that anything past the end of a shorter recording
    # is already padded. Dimensions: (# of successfully read files, max_len)
    data_matrix = np.zeros((read_files_count, max_len), dtype=np.float32)

    for i, audio_data in enumerate(loaded_audio_data):
        data_matrix[i, :len(audio_data)] = audio_data

    print(f"Successfully created matrix with shape: {data_matrix.shape}")

    return data_matrix

# --- Example Usage ---
if __name__ == '__main__':
    # Directory that is scanned (recursively) for .wav files.
    # NOTE: replace this with the location of your own dataset before running.
    target_folder_path = "/home/javastral/GIT/ANE2-GCPDS/Datasets/CommRad_Dataset/"

    # Load everything that can be read into a single zero-padded matrix.
    final_matrix = process_wav_files_in_folder(target_folder_path)

    # Only report the shape when at least one sample was actually loaded;
    # an empty result has already been explained by the loader's own output.
    if final_matrix.size > 0:
        print("\nFinal Matrix Shape:", final_matrix.shape)

# Output: Starting explorative read of files...
