# Building the dataset

This notebook builds the dataset of traditional world music. The steps are:

1. From the existing music files in the `data/audio/` directory, each file is cut into 10-second clips.
2. The clips are saved in the `data/clips/` directory.
3. The clips are then converted to spectrograms using the Librosa library.
4. The spectrograms are saved in the `data/spectrograms/` directory.
5. The metadata is saved in the `data/metadata.csv` file and contains, for each original audio file:
    - the name of the audio file
    - the genre
    - the region
    - the country
    - the language
    - the duration
    - the sample rate
    - the range of clip indices produced from the file (`index_start`, `index_end`, both inclusive)


## Imports

In [1]:
from convertions import mp3_to_signal, signal_to_spectro, spectro_to_image
from convertions import image_to_spectro, spectro_to_signal, signal_to_mp3
from convertions import signal_batch_maker, signal_batch_joiner

import os
import numpy as np
import pandas as pd
from tqdm import tqdm

## Initialisation

In [7]:
# Length of each clip cut from the original recordings.
CLIP_DURATION = 10 # seconds
# Spectrogram parameters, passed to signal_to_spectro as n_fft / hop_length.
N_FFT = 2048
HOP_LENGTH = 512

# One metadata row per original audio file; index_start / index_end give the
# inclusive range of clip indices produced from that file.
metadata_columns = ['filename', 'genre', 'region', 'country', 'language', 'duration', 'sample_rate', 'index_start', 'index_end']

# If the metadata file is not present, create an empty one (header only).
if not os.path.exists('data/metadata.csv'):
    # to_csv does not create missing parent directories, so make sure data/ exists.
    os.makedirs('data', exist_ok=True)
    metadata_df = pd.DataFrame(columns=metadata_columns)
    metadata_df.to_csv('data/metadata.csv', index=False)

# Always load from disk so a fresh and an existing file go through the same path.
metadata_df = pd.read_csv('data/metadata.csv', header=0)

## STEP 1: Cut the audio files into 10-second clips

In [3]:
# os.listdir returns entries in arbitrary order; sort so that clip numbering
# (and therefore index_start / index_end) is reproducible between runs.
audio_files = sorted(f for f in os.listdir('data/audio') if f.endswith('.mp3'))

# All clips from all files, in processing order; a clip's position in this
# list is the index used for its output filename in the later steps.
clips = []
current_clip_index = 0

for audio_file in tqdm(audio_files):
    signal, sample_rate = mp3_to_signal('data/audio/' + audio_file)

    audio_duration = len(signal) / sample_rate # seconds

    # create audio clips
    new_clips = signal_batch_maker(signal, CLIP_DURATION*sample_rate)
    clips.extend(new_clips)

    # Clips of this file occupy indices [lower_index, current_clip_index - 1].
    lower_index = current_clip_index
    current_clip_index = len(clips)

    # update metadata
    # Reuse the row of this file if it already exists, otherwise append a new
    # one. The match must be an exact comparison: a substring/regex match
    # would confuse e.g. 'song.mp3' with 'my_song.mp3' and misread filenames
    # containing regex metacharacters.
    existing_rows = metadata_df.index[metadata_df['filename'] == audio_file]
    metadata_index = existing_rows[0] if len(existing_rows) > 0 else len(metadata_df)
    metadata_df.loc[metadata_index, 'filename'] = audio_file
    metadata_df.loc[metadata_index, 'sample_rate'] = sample_rate
    metadata_df.loc[metadata_index, 'duration'] = audio_duration
    metadata_df.loc[metadata_index, 'index_start'] = lower_index
    metadata_df.loc[metadata_index, 'index_end'] = current_clip_index - 1

# NOTE(review): rows for files that are no longer in data/audio are not touched
# by the loop above and keep their previous index range - confirm whether they
# should be cleared.

# Keep the rows ordered by clip index.
metadata_df = metadata_df.sort_values(by='index_start')

metadata_df.to_csv('data/metadata.csv', index=False)

100%|██████████| 7/7 [00:08<00:00,  1.23s/it]


## STEP 2: Save the clips in the `data/clips/` directory

In [6]:
# Make sure the output directory exists before writing into it.
os.makedirs('data/clips', exist_ok=True)

# Last clip index of the source file currently being written; -1 forces a
# metadata lookup on the first clip.
current_index_end = -1

# The context manager closes the progress bar when the loop finishes.
with tqdm(total=len(clips)) as pbar:
    for i, clip in enumerate(clips):
        if i > current_index_end:
            # Clip i starts a new source file: take the first row (metadata_df
            # is ordered by index_start) whose range still contains i.
            # The comparison must be >=, otherwise a file whose only clip is
            # clip i is skipped and the next file's sample rate is used.
            sub_df = metadata_df[metadata_df['index_end'] >= i]
            current_index_end = int(sub_df['index_end'].iloc[0])
            current_sample_rate = int(sub_df['sample_rate'].iloc[0])
        signal_to_mp3(clip, current_sample_rate, 'data/clips/' + str(i) + '.mp3')
        pbar.update(1)

100%|██████████| 174/174 [00:55<00:00,  3.15it/s]




## STEP 3: Convert the clips to spectrograms

In [10]:
# Make sure the output directory exists before writing into it.
os.makedirs('data/spectrograms', exist_ok=True)

# Last clip index of the source file currently being processed; -1 forces a
# metadata lookup on the first clip.
current_index_end = -1

# The context manager closes the progress bar when the loop finishes.
with tqdm(total=len(clips)) as pbar:
    for i, clip in enumerate(clips):
        if i > current_index_end:
            # Track which source file clip i belongs to. The comparison must
            # be >=, otherwise a file whose only clip is clip i is skipped
            # (and the lookup raises IndexError if it is the last file).
            # The sample rate is not passed to the conversion calls below; it
            # is only kept up to date here.
            sub_df = metadata_df[metadata_df['index_end'] >= i]
            current_index_end = int(sub_df['index_end'].iloc[0])
            current_sample_rate = int(sub_df['sample_rate'].iloc[0])
        # One spectrogram image per clip, named after the clip index.
        spectro = signal_to_spectro(clip, n_fft=N_FFT, hop_length=HOP_LENGTH)
        spectro_to_image(spectro, 'data/spectrograms/' + str(i) + '.png')
        pbar.update(1)

  0%|          | 0/174 [00:25<?, ?it/s]
100%|██████████| 174/174 [00:17<00:00, 10.67it/s]

100%|██████████| 174/174 [00:31<00:00, 10.67it/s]

## STEP 4: Save the spectrograms in the `data/spectrograms/` directory