# Building the dataset

In this notebook, the dataset of world traditional music is built. Here are the steps:

- From the existing music files in the `data/audio/` directory, each file is cut into 10-second clips.
- The clips are saved as MP3 files in the `data/clips_audio/` directory.
- The clips are then converted to spectrograms using the Librosa library.
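- The spectrograms are saved as PNG images: the full spectrogram in the `data/clips_spectro_full/` directory, and the mel spectrograms in the `data/clips_spectro_256/` and `data/clips_spectro_128/` directories.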
- The metadata is saved in the `data/metadata.csv` file and contains, for each original audio file:
    - the name of the audio file
    - the genre
    - the region
    - the country
    - the language of the song (if available)
    - the instruments
    - the duration in seconds
    - the sample rate in Hz
    - the index_start, first clip index for the audio file
    - the index_end, last clip index for the audio file


## Imports

In [1]:
from convertions import mp3_to_signal, signal_to_spectro, spectro_to_image, spectro_to_mel_spectro, signal_to_mp3, signal_batch_maker

import os
import pandas as pd
from tqdm import tqdm

## Initialisation

In [2]:
# --- Settings shared by the cells below ---
CLIP_DURATION = 10  # length of one clip, in seconds
N_FFT = 2048        # passed as n_fft to the spectrogram helpers
HOP_LENGTH = 512    # passed as hop_length to the spectrogram helpers

# Column layout of data/metadata.csv.
metadata_columns = [
    'filename', 'genre', 'region', 'country', 'language', 'instruments',
    'duration', 'sample_rate', 'index_start', 'index_end',
]

# On the first run there is no metadata file yet: write a header-only CSV so
# that the read below always has something to load.
metadata_exists = os.path.exists('data/metadata.csv')
if not metadata_exists:
    metadata_df = pd.DataFrame(columns=metadata_columns)
    metadata_df.to_csv('data/metadata.csv', index=False)

# Always work from what is on disk, whether it was just created or already there.
metadata_df = pd.read_csv('data/metadata.csv', header=0)

# Per-clip table; it starts empty and is filled in STEP 3.
clips_df = pd.DataFrame(
    columns=['min_spectro_db', 'max_spectro_db', 'filename'],
)

## STEP 1: Cut the audio files into 10-second clips

In [3]:
# Process the files in sorted order: os.listdir() returns entries in arbitrary
# order, and the clip numbering built below must be the same on every run.
audio_files = sorted(f for f in os.listdir('data/audio') if f.endswith('.mp3'))

clips = []              # every clip of every audio file, in processing order
current_clip_index = 0  # position the next clip will take in `clips`

for audio_file in tqdm(audio_files):
    signal, sample_rate = mp3_to_signal('data/audio/' + audio_file)

    audio_duration = len(signal) / sample_rate  # seconds

    # Cut the signal into clips; signal_batch_maker receives the clip length
    # expressed in samples (CLIP_DURATION seconds * sample_rate).
    lower_index = current_clip_index
    clips.extend(signal_batch_maker(signal, CLIP_DURATION * sample_rate))
    current_clip_index = len(clips)

    # Update the metadata row of this file. The row is looked up by exact
    # filename equality: a substring/regex test (str.contains) would also
    # match other files whose name merely contains this one.
    existing_rows = metadata_df.index[metadata_df['filename'] == audio_file]
    if len(existing_rows) > 0:
        metadata_index = existing_rows[0]
    else:
        metadata_index = len(metadata_df)  # append a new row

    metadata_df.loc[metadata_index, 'filename'] = audio_file
    metadata_df.loc[metadata_index, 'sample_rate'] = int(sample_rate)
    metadata_df.loc[metadata_index, 'duration'] = audio_duration
    # index_start / index_end are the inclusive range of this file's clips.
    metadata_df.loc[metadata_index, 'index_start'] = int(lower_index)
    metadata_df.loc[metadata_index, 'index_end'] = int(current_clip_index - 1)

# Rows of files that are no longer in data/audio still carry clip indices from
# an earlier run. Blank them so they cannot be mistaken for current clips;
# the other (manually filled) columns of those rows are left untouched.
stale_rows = ~metadata_df['filename'].isin(audio_files)
if stale_rows.any():
    for index_column in ('index_start', 'index_end'):
        metadata_df[index_column] = metadata_df[index_column].where(~stale_rows)

metadata_df = metadata_df.sort_values(by='index_start')

metadata_df.to_csv('data/metadata.csv', index=False)

100%|██████████| 12/12 [00:09<00:00,  1.32it/s]


## STEP 2: Save the clips in the `data/clips_audio/` directory

In [4]:
# Make sure the output directory exists before writing into it.
os.makedirs('data/clips_audio', exist_ok=True)

# Last clip index covered by the metadata row currently in use; -1 forces a
# lookup on the first clip.
current_index_end = -1

# Wrapping the iterable lets tqdm close the progress bar when the loop ends.
for i, clip in enumerate(tqdm(clips)):
    if i > current_index_end:
        # Clip i belongs to the audio file whose inclusive
        # [index_start, index_end] range contains i. The bound on index_end
        # must be >=, otherwise a file with a single clip
        # (index_start == index_end == i) would be skipped.
        owner_rows = metadata_df[
            (metadata_df['index_start'] <= i) & (metadata_df['index_end'] >= i)
        ]
        current_index_end = int(owner_rows['index_end'].iloc[0])
        current_sample_rate = int(owner_rows['sample_rate'].iloc[0])
    signal_to_mp3(clip, current_sample_rate, 'data/clips_audio/' + str(i) + '.mp3')

100%|██████████| 473/473 [00:20<00:00, 23.23it/s]

## STEP 3: Convert the clips to spectrograms and save them in the `data/clips_spectro_full/`, `data/clips_spectro_256/` and `data/clips_spectro_128/` directories

In [5]:
# Sizes passed as the third argument of spectro_to_mel_spectro; each size has
# its own output directory named after it.
MEL_SIZES = (256, 128)

# Make sure every output directory exists before writing into it.
os.makedirs('data/clips_spectro_full', exist_ok=True)
for mel_size in MEL_SIZES:
    os.makedirs('data/clips_spectro_' + str(mel_size), exist_ok=True)

# Last clip index covered by the metadata row currently in use; -1 forces a
# lookup on the first clip.
current_index_end = -1

# Wrapping the iterable lets tqdm close the progress bar when the loop ends.
for i, clip in enumerate(tqdm(clips)):
    if i > current_index_end:
        # Clip i belongs to the audio file whose inclusive
        # [index_start, index_end] range contains i. The bound on index_end
        # must be >=, otherwise a file with a single clip
        # (index_start == index_end == i) would be skipped.
        owner_rows = metadata_df[
            (metadata_df['index_start'] <= i) & (metadata_df['index_end'] >= i)
        ]
        current_index_end = int(owner_rows['index_end'].iloc[0])
        current_sample_rate = int(owner_rows['sample_rate'].iloc[0])
        current_audio_file = owner_rows['filename'].iloc[0]

    # Full spectrogram of the clip, saved as an image.
    spectro = signal_to_spectro(clip, n_fft=N_FFT, hop_length=HOP_LENGTH)
    spectro_to_image(spectro, 'data/clips_spectro_full/' + str(i) + '.png')

    # Mel spectrograms derived from the full one, one image per size.
    for mel_size in MEL_SIZES:
        mel_spectro = spectro_to_mel_spectro(
            spectro, current_sample_rate, mel_size, n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        spectro_to_image(
            mel_spectro, 'data/clips_spectro_' + str(mel_size) + '/' + str(i) + '.png'
        )

    # Record the value range of the full spectrogram and the source file.
    clips_df.loc[i, 'min_spectro_db'] = spectro.min()
    clips_df.loc[i, 'max_spectro_db'] = spectro.max()
    clips_df.loc[i, 'filename'] = current_audio_file

clips_df.to_csv('data/clips.csv', index=True)

100%|██████████| 473/473 [00:20<00:00, 23.21it/s]
100%|██████████| 473/473 [00:59<00:00,  8.16it/s]