# Convert GTZAN Dataset to .npy files

This script converts the audio files of the GTZAN dataset and their genre labels into NumPy `.npy` files.

### Run this script:
1. Modify `PATH_DATASET` to the path to GTZAN dataset **on your computer**.
2. Run the last cell to generate `X.npy` and `Y.npy` in the folder `./dataset/`, and `gtzan_genre.csv` in the root folder `./`.

[Download GTZAN Dataset](http://marsyasweb.appspot.com/download/data_sets/)

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import time

In [None]:
# PATH TO ORIGIN DATASET, MODIFY THIS TO THE PATH OF DATASET ON YOUR COMPUTER!
PATH_DATASET = '/Users/pengguo/Desktop/coms4995/Project/datasets/gtzan_genre/genres/'
# PATH TO ROOT DIRECTORY
PATH_ROOT = './'
# PATH TO CONVERTED DATASET (the folder is created on first run)
PATH_OUTPUT_DIR = os.path.join(PATH_ROOT, 'dataset')
if not os.path.isdir(PATH_OUTPUT_DIR):
    # makedirs also creates missing parents if PATH_ROOT is changed
    os.makedirs(PATH_OUTPUT_DIR)
PATH_X = os.path.join(PATH_OUTPUT_DIR, 'X.npy')
PATH_Y = os.path.join(PATH_OUTPUT_DIR, 'Y.npy')
# PATH TO CSV FILE
PATH_CSV = os.path.join(PATH_ROOT, 'gtzan_genre.csv')

# PARAMETERS
NUM_DATA = 1000 # max number of audio files to convert
SR = 12000 # sample rate: 12KHz
len_src = 29. # duration: 29 secs
NUM_JOBS = 9 # Num of jobs in multiprocessing
# Number of samples per clip. Derived from SR and len_src so that the three
# values cannot drift apart when one of them is edited.
ref_n_src = int(SR * len_src)

In [None]:
def get_rows_from_folders(path_dataset=None):
    '''
    Collect one metadata row per audio file found in the genre folders.

    Inputs:
        - path_dataset: directory containing one sub-folder per genre.
          Defaults to PATH_DATASET when omitted.
    Return:
        - list of [file_id, file_path, file_label] rows, where file_path is
          relative to the dataset directory and file_label is the index of
          the genre in `labels`.
    '''
    if path_dataset is None:
        path_dataset = PATH_DATASET

    labels = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
    folders = [s + '/' for s in labels]

    rows = []
    for label_idx, folder in enumerate(folders): # assumes different labels per folders.
        # os.listdir() returns entries in arbitrary order and also lists
        # hidden files such as '.DS_Store'; sort for a reproducible csv and
        # drop hidden entries so they are never treated as audio files.
        fnames = sorted(
            fname for fname in os.listdir(os.path.join(path_dataset, folder))
            if not fname.startswith('.')
        )
        for fname in fnames:
            file_path = os.path.join(folder, fname)
            # Strip only the extension: splitting on the first '.' would give
            # the same id (the genre name) to every file named like
            # 'genre.NNNNN.ext'.
            file_id = os.path.splitext(fname)[0]
            rows.append([file_id, file_path, label_idx])

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Done - length:{}'.format(len(rows)))
    if rows:
        print("First row: {}".format(rows[0]))
        print("Last row: {}".format(rows[-1]))
    return rows


def get_csv():
    '''
    Build the metadata table and save it as a csv file at PATH_CSV.

    The table holds one row per entry returned by get_rows_from_folders(),
    under the columns 'id', 'filepath' and 'label'. Nothing is returned.
    '''
    column_names = ['id', 'filepath', 'label']
    table = pd.DataFrame(get_rows_from_folders(), columns=column_names)
    table.to_csv(PATH_CSV)

In [None]:
def gen_filepaths(df):
    '''
    Lazily yield the full path of every audio file listed in a dataframe.
    Inputs:
        - df: Dataframe converted from .csv file; its 'filepath' column
              holds paths relative to PATH_DATASET.
    Return:
        - generator over the paths, each joined onto PATH_DATASET
    '''
    relative_paths = df['filepath']
    for relative_path in relative_paths:
        full_path = os.path.join(PATH_DATASET, relative_path)
        yield full_path


def _load_audio(path):
    '''
    Load one audio file as a fixed-length signal.

    The file is read with librosa at sampling rate SR for at most len_src
    seconds, then cut or zero-padded at the end to exactly ref_n_src samples.
    Inputs:
        - path: path to the audio file.
    Return:
        - 1-D numpy array of length ref_n_src (allocated with np.zeros, so it
          has numpy's default float dtype).
    '''
    # The returned sampling rate is not needed: it was requested as SR.
    src, _ = librosa.load(path, sr=SR, duration=len_src)
    # Guard against a decoder returning a few samples more than requested.
    src = src[:ref_n_src]
    # Shorter clips keep trailing zeros, so every result has the same length.
    result = np.zeros(ref_n_src)
    result[:len(src)] = src
    return result


def gen_X():
    '''
    Convert GTZAN audio files to a single .npy file.

    Reads the csv at PATH_CSV, loads at most the first NUM_DATA listed files
    with _load_audio() and saves the stacked result to PATH_X.
    Return:
        - the saved array; shape of X: (n, 1, ref_n_src), i.e.
          (1000, 1, 348000) with the default parameters and a csv of at
          least NUM_DATA rows.
    '''
    print('Start generating X...')
    start_time = time.time()
    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 reads back the index column written by DataFrame.to_csv.
    df = pd.read_csv(PATH_CSV, index_col=0) # load csv
    # Cap at NUM_DATA rows. A shorter csv simply yields fewer rows instead of
    # failing with StopIteration in the middle of the conversion.
    filepaths = gen_filepaths(df.iloc[:NUM_DATA])  # filepaths: generator

    raw_data = np.array([_load_audio(filepath) for filepath in filepaths])
    data = raw_data[:, np.newaxis, :] # "Insert" one dimension
    np.save(PATH_X, data)
    print('DONE! in {:6.4f} sec'.format(time.time() - start_time))
    return data

def gen_Y():
    '''
    Generate labels for GTZAN dataset.

    Reads the 'label' column of the csv at PATH_CSV, keeps at most the first
    NUM_DATA entries (the same cap gen_X() applies to the audio files) and
    saves them to PATH_Y.
    Return:
        - 1-D numpy array of label indices.
    '''
    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 reads back the index column written by DataFrame.to_csv.
    df = pd.read_csv(PATH_CSV, index_col=0) # load csv
    # .iloc slices by position, independent of the index values.
    data = np.array(df['label'].iloc[:NUM_DATA])
    np.save(PATH_Y, data)
    return data

In [None]:
# Run the whole conversion: write the csv first, because gen_X() and gen_Y()
# both read it back from PATH_CSV.
get_csv()
X = gen_X()
Y = gen_Y()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(X.shape)
print(Y.shape)