# 說明
將資料整理後平均切成 5 個批次，再將每個批次分為 training / testing sets

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import json

In [2]:
# Folder locations, all relative to the notebook's working directory.
# NOTE(review): the first three are not referenced by the visible cells;
# presumably they are kept for other steps of the pipeline - confirm.
GTZAN_FOLDER = "./gtzan_genre"
TRACKS_FOLDER = "./track_by_genre"
SAVE_FOLDER = "./genre_set"
# Root directory that is walked to collect the input file paths.
SRC_FOLDER = "../fma_preprocessing"

In [3]:
# Load the lookup table that maps a label (string key) to its genre name.
with open('./category_label_mapping.json', mode='r') as mappingFile:
    categoryMapping = json.load(mappingFile)
# Bare expression: lets the notebook display the loaded mapping.
categoryMapping

{'0': 'blues',
 '1': 'classical',
 '2': 'country',
 '3': 'disco',
 '4': 'hiphop',
 '5': 'jazz',
 '6': 'metal',
 '7': 'pop',
 '8': 'reggae',
 '9': 'rock'}

In [5]:
# Collect one (label, path) pair for every file found below SRC_FOLDER.
iptPath, category = [], []
# Invert the mapping so a genre name can be turned back into its label.
reverseCategoryMapping = dict(
    (genre, label) for label, genre in categoryMapping.items()
)
for dirpath, _, filenames in os.walk(SRC_FOLDER):
    if not filenames:
        # Nothing to record here; the genre is only resolved for
        # directories that actually contain files.
        continue
    # The third '/'-separated component of the directory path is the genre
    # folder name, so files are expected at least one level below SRC_FOLDER.
    cat = dirpath.split('/')[2]
    label = reverseCategoryMapping[cat]
    # Drop the leading '..' so the path can be re-rooted under '../data'.
    relativeDir = dirpath[2:]
    for f in filenames:
        category.append(label)
        iptPath.append(f'../data{relativeDir}/{f}')

In [5]:
# Put the collected labels and paths side by side in a two-column table;
# the Series names become the column labels.
categoryColumn = pd.Series(category, name='category')
pathColumn = pd.Series(iptPath, name='path')
dfNew = pd.concat([categoryColumn, pathColumn], axis=1)
# Bare expression: lets the notebook preview the first rows.
dfNew.head()

Unnamed: 0,category,path
0,8,../data/fma_preprocessing/reggae/602-reggae___...
1,8,../data/fma_preprocessing/reggae/602-reggae___...
2,8,../data/fma_preprocessing/reggae/602-reggae___...
3,8,../data/fma_preprocessing/reggae/602-reggae___...
4,8,../data/fma_preprocessing/reggae/602-reggae___...


In [6]:
# Save the complete category/path table as one CSV, without the row index.
dfNew.to_csv(path_or_buf='./all.csv', index=False)

## split into sets

In [7]:
# Cut the full table into N_SPLIT consecutive batches and write each one to
# '<BATCH_FOLDER>/raw/raw-<i>.csv'.
BATCH_FOLDER = './input_set'
N_SPLIT = 5
# Nominal rows per batch; derived from N_SPLIT so the two cannot drift apart.
BATCH_SIZE = round(dfNew.shape[0] / N_SPLIT)
# Create each output folder individually so a partially existing
# BATCH_FOLDER (for example from an interrupted run) is completed
# instead of being skipped.
for subFolder in ('raw', 'training', 'testing'):
    os.makedirs(f'{BATCH_FOLDER}/{subFolder}', mode=0o777, exist_ok=True)
for i in range(N_SPLIT):
    start = i * BATCH_SIZE
    # The last batch runs to the end of the table so that rows left over by
    # the rounding of BATCH_SIZE are not dropped.
    stop = (i + 1) * BATCH_SIZE if i < N_SPLIT - 1 else dfNew.shape[0]
    batch = dfNew[start:stop]
    batch.to_csv(f'{BATCH_FOLDER}/raw/raw-{i}.csv', index=False)
    print(batch.head())
    print('batch ', i + 1, ' done')

Unnamed: 0,category,path
0,8,../data/fma_preprocessing/reggae/602-reggae___...
1,8,../data/fma_preprocessing/reggae/602-reggae___...
2,8,../data/fma_preprocessing/reggae/602-reggae___...
3,8,../data/fma_preprocessing/reggae/602-reggae___...
4,8,../data/fma_preprocessing/reggae/602-reggae___...


batch  1  done


Unnamed: 0,category,path
638,4,../data/fma_preprocessing/hiphop/21-hip_hop/00...
639,4,../data/fma_preprocessing/hiphop/21-hip_hop/01...
640,4,../data/fma_preprocessing/hiphop/21-hip_hop/00...
641,4,../data/fma_preprocessing/hiphop/21-hip_hop/00...
642,4,../data/fma_preprocessing/hiphop/21-hip_hop/01...


batch  2  done


Unnamed: 0,category,path
1276,0,../data/fma_preprocessing/blues/3-blues/100487...
1277,0,../data/fma_preprocessing/blues/3-blues/057799...
1278,0,../data/fma_preprocessing/blues/3-blues/020576...
1279,0,../data/fma_preprocessing/blues/3-blues/039502...
1280,0,../data/fma_preprocessing/blues/3-blues/105093...


batch  3  done


Unnamed: 0,category,path
1914,0,../data/fma_preprocessing/blues/3-blues/113364...
1915,0,../data/fma_preprocessing/blues/3-blues/112030...
1916,0,../data/fma_preprocessing/blues/3-blues/045262...
1917,0,../data/fma_preprocessing/blues/3-blues/114513...
1918,0,../data/fma_preprocessing/blues/3-blues/048073...


batch  4  done


Unnamed: 0,category,path
2552,3,../data/fma_preprocessing/disco/11-disco/00076...
2553,3,../data/fma_preprocessing/disco/11-disco/01622...
2554,3,../data/fma_preprocessing/disco/11-disco/01610...
2555,3,../data/fma_preprocessing/disco/11-disco/00074...
2556,3,../data/fma_preprocessing/disco/11-disco/00077...


batch  5  done


## split training & testing sets

In [8]:
# Split every raw batch CSV into a training file and a testing file.
TESTING_RATIO = 0.1  # fraction of each batch held out for testing
for f in os.listdir(f'{BATCH_FOLDER}/raw'):
    # Skip the checkpoint entry Jupyter may leave inside the folder.
    if '.ipynb_checkpoints' in f:
        continue
    print(f)
    df = pd.read_csv(f'{BATCH_FOLDER}/raw/{f}')
    # Take the batch number from the file name ('raw-3.csv' -> '3') so the
    # outputs keep the numbering of their source file; the listing order
    # returned by os.listdir is arbitrary.
    batchId = os.path.splitext(f)[0].split('-')[-1]
    trainingLen = round((1 - TESTING_RATIO) * df.shape[0])
    # Each df is a single, independent batch, so the cut point must not be
    # offset by the loop position: the leading rows go to training and the
    # remaining rows to testing, which keeps the two parts disjoint and
    # together covering the whole batch.
    # NOTE(review): rows are split in stored order; nothing here shuffles
    # them - confirm that is intended.
    df[:trainingLen].to_csv(f'{BATCH_FOLDER}/training/training-{batchId}.csv', index=False)
    df[trainingLen:].to_csv(f'{BATCH_FOLDER}/testing/testing-{batchId}.csv', index=False)

raw-3.csv
raw-2.csv
raw-1.csv
raw-4.csv
raw-0.csv
