## Data Preprocessing

1. **Process Annotations**: Read the annotation files for each dataset and convert the event data into a unified format. For example, for the MUSP dataset, annotations are transformed into a list of events where each event is represented as a tuple containing the event type ('m' for music or 's' for speech), start time, and end time.

2. **Save Processed Data**: Save the processed data, including the audio file paths and their corresponding events, into a new CSV file. This structured data will serve as the input for the data generator.

In [None]:
import os
import csv

### MUSP

In [42]:
# Directory that is scanned below for the MUSP annotation files (*.csv).
# The trailing slash matters: file names are appended to it by plain string concatenation.
DATA_PATH = '../data/musp/'

def get_files(data_path, extensions):
  """
  Get a sorted list of the regular files in a directory that have the given extensions.

  Parameters:
  - data_path (str): The path to the directory containing the files.
  - extensions (str or iterable of str): The file name suffix(es) to filter by,
    e.g. '.csv' or ('.csv', '.ann').

  Returns:
  - files (list): The matching file names (not full paths), sorted alphabetically.
  """
  # str.endswith only accepts a str or a tuple, so normalise lists/sets here.
  if not isinstance(extensions, str):
    extensions = tuple(extensions)

  # os.listdir returns entries in arbitrary order; sorting keeps the generated
  # CSVs reproducible across runs and platforms. Directories are skipped because
  # callers open every returned name as a file.
  return sorted(
    name for name in os.listdir(data_path)
    if name.endswith(extensions) and os.path.isfile(os.path.join(data_path, name))
  )

def write_data_to_csv(data, output_path):
  """
  Write a {filepath: events} mapping to a two-column CSV file.

  The file starts with the header row ``filepath,events``. Each following row
  holds one key of ``data`` and its value; the csv writer stores the value as
  its string form. Entries whose value is empty (falsy) are skipped.

  Parameters:
  - data (dict): A dictionary containing the data to be written to the CSV file.
  - output_path (str): The path to the output CSV file. Missing parent
    directories are created.

  Returns:
  - None
  """
  # Create the target directory on first use instead of failing with
  # FileNotFoundError; a bare file name has no parent to create.
  parent_dir = os.path.dirname(output_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # newline='' lets the csv module control line endings itself.
  with open(output_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['filepath', 'events'])
    writer.writerows([key, value] for key, value in data.items() if value)

# Get list of files in the data path with the .csv extension:
files = get_files(DATA_PATH, extensions='.csv')

# Maps each audio file name to its list of (label, start, end) events.
musp_data = {}
for f in files:
    # newline='' is what the csv module expects for files it reads.
    with open(DATA_PATH + f, 'r', newline='') as file:
        # The annotation "<name>.csv" describes the audio file "<name>.mp3".
        # splitext strips only the last extension, so names that contain
        # additional dots are not truncated.
        file_name = os.path.splitext(f)[0] + '.mp3'
        musp_data[file_name] = []       # Initialize the list of events for this file

        print(f'Processing {file_name}...')
        for row in csv.reader(file):
            if not row:  # Skip empty lines
                continue

            # Columns read from each row: start (s), duration (s), label.
            start = float(row[0])
            duration = float(row[1])
            label = str(row[2])

            # Tuple with the event label, start (seconds) and end time (seconds),
            # i.e. ('s', 20, 22) means that the event 's' starts at 20s and ends at 22s.
            musp_data[file_name].append((label, start, start + duration))

# Write the data to a CSV file:
write_data_to_csv(musp_data, '../data/processed/musp.csv')

Processing ConscinciasParalelasN11-OEspelhoEOReflexoFantasiasEPerplexidadesParte413-12-1994.mp3...
Processing ConscinciasParalelasN3-OsSentidosOSentirEAsNormasParte318-10-1994.mp3...
Processing ConscinciasParalelasN7-OsSentidosOSentirEAsNormasParte715-1-1994.mp3...
Processing eatmycountry1609.mp3...
Processing theconcert16.mp3...
Processing theconcert2.mp3...
Processing theconcert2_v2.mp3...
Processing UTMA-26.mp3...
Processing UTMA-26_v2.mp3...


### TUT

In [55]:
DATA_PATH = '../data/tut/'

# Layout of the TUT Sound Events 2017 development set under DATA_PATH
# (kept as comments: as a plain string literal the Windows-style backslashes
# were read as escape sequences):
#
#   TUT-sound-events-2017-development.audio.1/.../audio/street  -> 20 audio files (wav)
#   TUT-sound-events-2017-development.audio.2/.../audio/street  ->  4 audio files (wav)
#                                                         total -> 24 audio files
#   TUT-sound-events-2017-development.meta/.../meta/street      -> 24 annotation files
#                                                                  named like the audio files

ANNOTATION_PATH = '../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.meta/TUT-sound-events-2017-development/meta/street/'
# Original (misspelled) name, kept as an alias for code that still refers to it.
ANNOTETION_PATH = ANNOTATION_PATH
AUDIO_1_PATH = '../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/'
AUDIO_2_PATH = '../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.2/TUT-sound-events-2017-development/audio/street/'


files = get_files(ANNOTETION_PATH, extensions='.ann')

# Recordings stored under AUDIO_2_PATH; every other recording is under AUDIO_1_PATH.
AUDIO_2_FILES = {'a128.wav', 'a131.wav', 'b007.wav', 'b093.wav'}

# Maps each audio file path to its list of (label, start, end) events.
tut_data = {}
for f in files:
    # The annotation "<name>.ann" describes the audio file "<name>.wav".
    f_name = os.path.splitext(f)[0] + '.wav'

    audio_dir = AUDIO_2_PATH if f_name in AUDIO_2_FILES else AUDIO_1_PATH
    f_path = audio_dir + f_name

    tut_data[f_path] = []

    # The annotation file is opened exactly once; newline='' is what the csv
    # module expects for files it reads.
    with open(ANNOTETION_PATH + f, 'r', newline='') as file:
        print(f'Processing {f_path}...')

        # The rows are tab-separated, so let the csv module split on tabs
        # directly; a comma inside a field no longer cuts the row short.
        for row in csv.reader(file, delimiter='\t'):
            if not row:  # Skip empty lines
                continue

            # Columns used from each row: 2 = start time, 3 = end time, 4 = label.
            start = float(row[2])
            end = float(row[3])
            label = row[4]
            tut_data[f_path].append((label, start, end))

write_data_to_csv(tut_data, '../data/processed/tut.csv')

Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a001.wav...
Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a003.wav...
Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a008.wav...
Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a010.wav...
Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a012.wav...
Processing ../data/tut/TUT-sound-events-2017-development/TUT-sound-events-2017-development.audio.1/TUT-sound-events-2017-development/audio/street/a013.wav...
Processing ../data/tut/TUT-sound-events-2017-develop

### Urban-SED