In [1]:
# Import necessary libraries
import os
import re
import pandas as pd
import librosa

In [2]:
# Directory layout, resolved relative to the folder above the working directory:
#   <parent>/data/source    -> audio data
#   <parent>/data/metadata  -> metadata
parent_dir = os.path.abspath(os.pardir)  # parent directory
data_dir, metadata_dir = (
    os.path.join(parent_dir, 'data', leaf) for leaf in ('source', 'metadata')
)

In [3]:
# List of paths, relative to the data directory, for all audio files
# (with extension .wav) found in the data directory and its subdirectories.
# The pattern is a raw string so the regex escapes are not read as string
# escapes, and the dot is escaped so only a literal ".wav" suffix is accepted.
compiled = re.compile(r'[\w()]+\.wav$')  # naming format for the audio file

audio_files_paths = []
for path, dirs, files in os.walk(data_dir):  # search all the directories and subdirectories
    # os.walk yields entries in an unspecified order; sorting in place (top-down
    # walk honours the modified `dirs`) keeps the resulting list reproducible.
    dirs.sort()
    for filename in sorted(files):
        if compiled.search(filename) is not None:
            # relpath drops the data_dir prefix with whichever separator the OS uses
            audio_files_paths.append(os.path.relpath(os.path.join(path, filename), data_dir))

In [4]:
# Extract metadata from list of file paths.
# The directory part of each relative path is expected to look like
#   <SNR>_dB_<machine>/<machine>/id_<model>/<condition>
# where the separator may be either "\" or "/". The pattern is a raw string so
# the regex escapes are not interpreted as string escapes.
dir_pattern = re.compile(r"^(-?\d+)_dB_(\w+)[\\/](\w+)[\\/]id_(\w+)[\\/](\w+)")

metadata_list = []
for audio_file_path in audio_files_paths:
    # File name and path
    file_rel_path = audio_file_path
    file_dir_path = os.path.dirname(audio_file_path)
    file_name = os.path.basename(audio_file_path)

    # File information from the directory name
    dir_match = dir_pattern.match(file_dir_path)
    if dir_match is None:
        # Report and skip instead of failing on an unexpected directory layout
        print("Directory name does not match the expected format: " + file_dir_path)
        continue
    snr_text, machine, machine_check, model_text, condition = dir_match.groups()
    if machine != machine_check:  # check directory name consistency
        print("Directory name is not consistent")
        continue
    # SNR
    SNR_dB = int(snr_text)
    # Machine type
    model = int(model_text)
    # Anomaly (0 if the sound is normal, 1 if it is abnormal)
    anomaly = 0 if condition == 'normal' else 1

    # duration_s and fs_Hz are placeholders here; they are read from the audio files later
    metadata_list.append([machine, model, anomaly, file_name, file_rel_path, None, None, SNR_dB])

# Create a DataFrame from the list
metadata_df = pd.DataFrame(metadata_list,
                  columns=['machine', 'model', 'anomaly', 'file_name', 'file_rel_path', 'duration_s', 'fs_Hz', 'SNR_dB'])
metadata_df.index.name = 'file_no'  # index name, unique file number

In [5]:
# Extract audio-related information for each file
def _audio_duration_s(audio_path):
    """Return the duration in seconds of the audio file at ``audio_path``.

    librosa 0.10 renamed the ``filename`` keyword of ``get_duration`` to
    ``path`` and deprecated the old spelling. Older releases reject ``path``
    with a TypeError, in which case the old keyword is used instead.
    """
    try:
        return librosa.get_duration(path=audio_path)
    except TypeError:
        return librosa.get_duration(filename=audio_path)


# Collect the values first and assign whole columns afterwards. This avoids
# pairing positional `.iloc` reads with label-based `.loc` writes (which only
# agree while the index is the default 0..n-1 range) and per-cell assignment.
durations_s = []
sampling_rates_Hz = []
for file_rel_path in metadata_df['file_rel_path']:
    file_path = os.path.join(data_dir, file_rel_path)
    # Duration and sampling frequency
    sampling_rates_Hz.append(librosa.get_samplerate(path=file_path))
    durations_s.append(_audio_duration_s(file_path))  # in seconds

# Add data to the DataFrame (the columns already exist, so their order is kept)
metadata_df['duration_s'] = durations_s
metadata_df['fs_Hz'] = sampling_rates_Hz

In [6]:
# Export the metadata DataFrame to a csv file
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(metadata_dir, exist_ok=True)
metadata_df.to_csv(os.path.join(metadata_dir, 'metadata.csv'), header=True, index=True)