# PREPARING DATASET FOR ML

In [None]:
import os
import librosa
import math
import json

# Placeholder locations: the dataset root directory and the JSON file to create.
dataset_path = "path\\to\\dataset"
json_path = "path\\to\\create\\json"

# Audio settings used to derive the nominal number of samples in one track.
sample_rate = 22050
duration = 30  # seconds
samples_per_track = duration * sample_rate

def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract MFCC features from an audio dataset and write them to a JSON file.

    Every directory below ``dataset_path`` is treated as one class (genre): its
    base name is appended to ``"mapping"``, and the position of that name in
    ``"mapping"`` is the integer label stored for each of its segments. Each
    file is loaded with librosa at the module-level ``sample_rate``, cut into
    ``num_segments`` equal slices (sized from the module-level
    ``samples_per_track``), and the MFCC matrix of a slice is stored, transposed
    so that frames are rows, only when it has the expected number of frames.

    Directories and files are visited in sorted order so that class indices and
    sample order are reproducible across machines and filesystems.

    Args:
        dataset_path: Root directory of the dataset (``str`` or path-like).
        json_path: Path of the JSON file to create or overwrite.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size, in samples.
        hop_length: Hop between successive frames, in samples.
        num_segments: Number of segments each track is divided into.
    """
    # dictionary to store data
    data = {
        "mapping": [],  # class names; the index of a name is its integer label
        "mfcc": [],  # model inputs: one MFCC matrix (list of frames) per stored segment
        "labels": [],  # model targets: index into "mapping" for each entry of "mfcc"
    }

    num_samples_per_segment = int(samples_per_track / num_segments)

    # librosa centres its frames by padding n_fft // 2 samples on each side of
    # the signal, so the frame count follows from the padded length. For the
    # usual case (even n_fft, segment length not a multiple of hop_length) this
    # equals ceil(num_samples_per_segment / hop_length); unlike that shortcut it
    # also holds when the segment length is an exact multiple of hop_length.
    padded_length = num_samples_per_segment + 2 * (n_fft // 2)
    expected_num_mfcc_vectors_per_segment = 1 + (padded_length - n_fft) // hop_length

    # os.walk yields exactly this string for the top directory, so an equality
    # test identifies the root even when dataset_path is a path-like object.
    root = os.fspath(dataset_path)

    # loop through all the genre directories
    for dirpath, dirnames, filenames in os.walk(root):

        # sorting in place makes os.walk descend in a deterministic order
        dirnames.sort()

        # ensure that we're not at the root level
        if dirpath == root:
            continue

        # save the semantic label: genre/blues => "blues" (separator-agnostic)
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        label = len(data["mapping"]) - 1
        print("\n Processing {}".format(semantic_label))  # progress indicator

        # process the files of this genre
        for f in sorted(filenames):

            # load audio file
            file_path = os.path.join(dirpath, f)
            signal, sr = librosa.load(file_path, sr=sample_rate)

            # process segments, extracting MFCCs and storing data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s  # s = 0 -> 0
                finish_sample = start_sample + num_samples_per_segment

                mfcc = librosa.feature.mfcc(
                    y=signal[start_sample:finish_sample],  # analyze a slice of the signal
                    sr=sr,
                    n_fft=n_fft,
                    n_mfcc=n_mfcc,
                    hop_length=hop_length,
                )
                mfcc = mfcc.T  # one row per frame

                # store the segment only if it has the expected number of
                # frames; this drops truncated segments of short tracks
                if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment: {}".format(file_path, s))

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    save_mfcc(dataset_path=dataset_path, json_path=json_path, num_segments=10)
    
            