# Data Preprocessing
## MFCC 추출 및 레이블 작업
- 기존에 사용했던 Chromagram, Tempogram은 제외하고 오직 MFCC만을 특징(feature)으로 사용!
- 대신 MFCC의 계수를 20개에서 40개로 늘렸음!

In [32]:
import librosa
import numpy as np
import os

def ext_mfcc(file_path, n_mfcc=40, max_frames=1200):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        file_path: Path of the audio file; passed straight to ``librosa.load``
            (library defaults are used for sample rate and channel mixing).
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40.
        max_frames: Number of leading time frames to keep. Defaults to 1200,
            chosen to bound the data size.

    Returns:
        The MFCC array with its frame axis (axis 1) cut or zero-padded to
        exactly ``max_frames`` columns.
    """
    y, sr = librosa.load(file_path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Keep only the leading frames to bound the data size.
    mfcc = mfcc[:, :max_frames]

    # A short clip yields fewer frames than requested; zero-pad on the right so
    # every clip has the same shape and the results can be concatenated later.
    missing = max_frames - mfcc.shape[1]
    if missing > 0:
        mfcc = np.pad(mfcc, ((0, 0), (0, missing)), mode="constant")
    return mfcc

In [33]:
from IPython.display import clear_output # for clear prints

# Build the dataset: walk <data_folder>/<genre>/<audio file>, extract the MFCCs of
# every file, pair each with a one-hot genre label, and save both arrays to disk.
data_folder = 'music_raw_data'
extracted_mfccs = []
labels = []  # one-hot encoded, one row per audio file

# os.listdir returns entries in arbitrary order, so sort to make the
# genre -> label index mapping reproducible. Hidden entries (e.g.
# .ipynb_checkpoints) and stray files must not consume a label index.
genre_names = sorted(
    name for name in os.listdir(data_folder)
    if not name.startswith('.') and os.path.isdir(os.path.join(data_folder, name))
)
num_genres = len(genre_names)  # width of the one-hot vectors

for genre_count, subfolder in enumerate(genre_names):
    subfolder_path = os.path.join(data_folder, subfolder)
    for file in sorted(os.listdir(subfolder_path)):
        # Skip hidden entries such as .ipynb_checkpoints
        if file.startswith('.'):
            continue
        print(f'now_file: {file}')
        file_path = os.path.join(subfolder_path, file)

        # Labeling: one-hot vector with a 1 at this genre's index
        label = np.zeros(num_genres)
        label[genre_count] = 1
        print(f'label: {label}')
        label = np.expand_dims(label, axis=0)  # add a leading axis for concatenation
        print(f'label.shape: {label.shape}')
        labels.append(label)

        # MFCC extraction
        mfcc = ext_mfcc(file_path)
        mfcc = np.expand_dims(mfcc, axis=0)  # add a leading axis for concatenation
        print(f'mfcc.shape: {mfcc.shape}')
        extracted_mfccs.append(mfcc)

        # Clear the per-file progress prints so the cell output does not grow
        clear_output()

# np.concatenate fails with an opaque message on an empty list; be explicit.
if not extracted_mfccs:
    raise ValueError(f'no audio files found under {data_folder!r}')

# Concatenate the per-file arrays along the new leading (sample) axis
extracted_mfccs = np.concatenate(extracted_mfccs, axis=0)
labels = np.concatenate(labels, axis=0)

print(f'extracted_mfccs.shape: {extracted_mfccs.shape}')
print(f'labels.shape: {labels.shape}')

# Save
np.savez("extracted_mfccs_and_labels.npz", mfccs=extracted_mfccs, labels=labels)

extracted_mfccs.shape: (1000, 40, 1200)
labels.shape: (1000, 10)
