# 說明
將全部資料打亂後切成 5 份 (batch)，每一份再依比例分為 training / testing sets。

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import json
from sklearn.utils import shuffle

In [2]:
# Folder locations, all relative to the notebook's working directory.
# NOTE(review): GTZAN_FOLDER, TRACKS_FOLDER and SAVE_FOLDER are not referenced by the
# cells visible in this part of the notebook — presumably used elsewhere or left over;
# confirm before removing.
GTZAN_FOLDER = './gtzan_genre'
TRACKS_FOLDER = './track_by_genre'
SAVE_FOLDER = './genre_set'
# Root directory that is walked with os.walk to collect the per-genre track files.
SRC_FOLDER = '../fma_preprocessing'

In [3]:
# Load the label-id -> genre-name table (a JSON object keyed by the label id as a string).
# The encoding is pinned to UTF-8 so the read does not depend on the platform's locale default.
with open('./category_label_mapping.json', 'r', encoding='utf-8') as f:
    categoryMapping = json.load(f)
# Bare expression: shows the loaded mapping as the cell output.
categoryMapping

{'0': 'blues',
 '1': 'classical',
 '2': 'country',
 '3': 'disco',
 '4': 'hiphop',
 '5': 'jazz',
 '6': 'metal',
 '7': 'pop',
 '8': 'reggae',
 '9': 'rock'}

In [4]:
# Collect every file below SRC_FOLDER together with its genre label.
#   iptPath  - '../data/<source folder name>/<genre>/.../<file>' for each file found
#   category - the label id of that file's genre, taken from the inverted categoryMapping
# The genre is the first directory level below SRC_FOLDER. A genre folder whose name is
# not a value of categoryMapping raises KeyError, which flags an unexpected data layout.
iptPath = []
category = []
reverseCategoryMapping = {v: k for k, v in categoryMapping.items()}
srcRoot = os.path.normpath(SRC_FOLDER)
srcName = os.path.basename(srcRoot)
for dirpath, dirnames, filenames in os.walk(srcRoot):
    # Editing dirnames in place steers os.walk: hidden folders (e.g. .ipynb_checkpoints)
    # are not descended into, and the visiting order no longer depends on the file system.
    dirnames[:] = sorted(d for d in dirnames if not d.startswith('.'))
    relParts = os.path.relpath(dirpath, srcRoot).split(os.sep)
    if relParts[0] == os.curdir:
        # Files lying directly in SRC_FOLDER belong to no genre folder; skip them
        # instead of failing while trying to read a genre out of the path.
        continue
    cat = relParts[0]
    relDir = '/'.join(relParts)
    for f in sorted(filenames):
        category.append(reverseCategoryMapping[cat])
        iptPath.append(f'../data/{srcName}/{relDir}/{f}')

In [5]:
# Pair each label with its file path, then shuffle the rows once so that the positional
# batch and training/testing slices taken from dfAll are random samples.
# The seed is fixed so that re-running the notebook reproduces the same row order and
# therefore the same batches and the same training/testing membership.
SHUFFLE_SEED = 42
dfNew = pd.DataFrame({'category': category, 'path': iptPath})
dfAll = shuffle(dfNew, random_state=SHUFFLE_SEED)
# Bare expression: shows the shuffled frame as the cell output.
dfAll

Unnamed: 0,category,path
2665,6,../data/fma_preprocessing/metal/31-metal/00101...
1929,0,../data/fma_preprocessing/blues/3-blues/079533...
2159,0,../data/fma_preprocessing/blues/3-blues/086940...
1551,0,../data/fma_preprocessing/blues/3-blues/039161...
1722,0,../data/fma_preprocessing/blues/3-blues/039163...
2418,2,../data/fma_preprocessing/country/9-country/00...
104,8,../data/fma_preprocessing/reggae/79-reggae___d...
3125,7,../data/fma_preprocessing/pop/10-pop/016254.npy
1626,0,../data/fma_preprocessing/blues/3-blues/054342...
2058,0,../data/fma_preprocessing/blues/3-blues/061581...


In [6]:
# Persist the full shuffled table; index=False leaves the row index out of the CSV.
dfAll.to_csv('./all.csv', index=False)

## split into sets

In [7]:
# Cut the shuffled table into N_SPLIT consecutive batches and write each one to
# BATCH_FOLDER/raw/raw-<i>.csv.
BATCH_FOLDER = './input_set'
N_SPLIT = 5
# Nominal number of rows per batch (kept for reference; the real boundaries are
# computed per batch below so that no row is lost to rounding).
BATCH_SIZE = round(dfAll.shape[0] / N_SPLIT)
# Create each output folder on its own: exist_ok=True makes the cell re-runnable and
# also covers the case where BATCH_FOLDER exists but one of its sub-folders does not.
for subFolder in ('raw', 'training', 'testing'):
    os.makedirs(f'{BATCH_FOLDER}/{subFolder}', mode=0o777, exist_ok=True)
nRows = dfAll.shape[0]
for i in range(N_SPLIT):
    # Integer boundaries i * nRows // N_SPLIT: every row lands in exactly one batch and
    # batch sizes differ by at most one row, whatever nRows is.
    start = i * nRows // N_SPLIT
    stop = (i + 1) * nRows // N_SPLIT
    dfBatch = dfAll.iloc[start:stop]
    dfBatch.to_csv(f'{BATCH_FOLDER}/raw/raw-{i}.csv', index=False)
    print(dfBatch.head())
    print('batch ', i + 1, ' done')

     category                                               path
2665        6  ../data/fma_preprocessing/metal/31-metal/00101...
1929        0  ../data/fma_preprocessing/blues/3-blues/079533...
2159        0  ../data/fma_preprocessing/blues/3-blues/086940...
1551        0  ../data/fma_preprocessing/blues/3-blues/039161...
1722        0  ../data/fma_preprocessing/blues/3-blues/039163...
batch  1  done
     category                                               path
2798        6  ../data/fma_preprocessing/metal/31-metal/07393...
1023        0  ../data/fma_preprocessing/blues/3-blues/040046...
1213        0  ../data/fma_preprocessing/blues/3-blues/062937...
941         1  ../data/fma_preprocessing/classical/5-classica...
659         4  ../data/fma_preprocessing/hiphop/21-hip_hop/01...
batch  2  done
     category                                               path
2511        3  ../data/fma_preprocessing/disco/11-disco/01784...
8           8  ../data/fma_preprocessing/reggae/602-reggae__

## split training & testing sets

In [8]:
# Split every raw batch file into a training part (the first 1 - TESTING_RATIO share of
# its rows) and a testing part (the remaining rows). The rows were shuffled before the
# batches were written, so a positional cut is a random split.
TESTING_RATIO = 0.1
# Take the batch number from the file name (raw-<n>.csv) rather than from the position
# in the directory listing: os.listdir returns entries in arbitrary order, and entries
# such as .ipynb_checkpoints would otherwise shift the numbering.
rawBatches = []
for name in os.listdir(f'{BATCH_FOLDER}/raw'):
    batchId = name[len('raw-'):-len('.csv')]
    if name.startswith('raw-') and name.endswith('.csv') and batchId.isdigit():
        rawBatches.append((int(batchId), name))
for i, f in sorted(rawBatches):
    print(i)
    df = pd.read_csv(f'{BATCH_FOLDER}/raw/{f}')
    trainingLen = round((1-TESTING_RATIO) * df.shape[0])
    testingLen = df.shape[0] - trainingLen
    # training-<i> / testing-<i> are always the two parts of raw-<i>.
    df.iloc[:trainingLen].to_csv(f'{BATCH_FOLDER}/training/training-{i}.csv', index=False)
    df.iloc[trainingLen:].to_csv(f'{BATCH_FOLDER}/testing/testing-{i}.csv', index=False)

0
1
2
3
4
