In [1]:
# feature extractoring and preprocessing data
# 음원 데이터를 분석
import librosa

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# notebook을 실행한 브라우저에서 바로 그림을 볼 수 있게 해주는 것
%matplotlib inline

# 운영체제와의 상호작용을 돕는 다양한 기능을 제공
# 1. 현재 디렉토리 확인하기
# 2. 디렉토리 변경
# 3. 현재 디렉토리의 파일 목록 확인하기
# 4. csv 파일 호출
import os

# 파이썬에서의 이미지 처리
from PIL import Image

import pathlib
import csv

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error

#Keras
import keras

# 경고 메시지를 무시하고 숨기거나  -> warnings.filterwarnings(action='ignore')
# 일치하는 경고를 인쇄하지 않습니다 = ('ignore')
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# 원하는 종류의 색깔만 넘겨주는 것
cmap = plt.get_cmap('inferno')

plt.figure(figsize=(10,10))
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)     
    for filename in os.listdir(f'./MIR/genres/{g}'):
        songname = f'./MIR/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB');
        plt.axis('off');
        plt.savefig(f'img_data/{g}/{filename[:-3].replace(".", "")}.png')
        plt.clf()

<Figure size 720x720 with 0 Axes>

In [3]:
header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
for i in range(1, 21):
    header += f' mfcc{i}'
header += ' label'
header = header.split()

In [15]:
file = open('data.csv', 'w', newline='')
with file:
    writer = csv.writer(file)
    writer.writerow(header)
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    for filename in os.listdir(f'./MIR/genres/{g}'):
        songname = f'./MIR/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=30)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        #rmse = mean_squared_error(y, y_pred=sr)**0.5
        rmse = librosa.feature.rms(y=y)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'    
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append += f' {g}'
        file = open('data.csv', 'a', newline='')
        with file:
            writer = csv.writer(file)
            writer.writerow(to_append.split())

In [None]:
# mfcc = 오디오 신호에서 추출할 수 있는 feature로, 소리의 고유한 특징을 나타내는 수치
#      = 등록된 음성과 현재 입력된 음성의 유사도를 판별하는 근거의 일부로 쓰입니다.
#      = MFCC(Mel-Frequency Cepstral Coefficient)는
#        Mel Spectrum(멜 스펙트럼)에서 Cepstral(켑스트럴) 분석을 통해 추출된 값
#      
# 이해하기 위해 먼저 
# -  Spectrum(스펙트럼)
# -  Cepstrum(켑스트럼)
# -  Mel Spectrum(멜 스펙트럼)  들을 알아야 한다.

In [16]:
data = pd.read_csv('data.csv')
data.head()

# chroma_stft = 채도_? , 크로마 표준
# spectral_centroid = 스펙트럼 중심
# spectral_bandwidth = 스펙트럼 대역폭
# rolloff = 롤 오프
# zero_crossing_rate = 제로 크로싱 비율
#        
# mfcc[n] = 

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,blues.00061.wav,0.451905,0.141766,2504.026852,2180.438691,5077.152632,0.167584,-82.454712,94.147758,-27.120918,...,8.139763,-8.494857,8.28336,-10.346549,-3.462061,-5.223508,-2.595848,-8.363733,-6.978243,blues
1,blues.00020.wav,0.302732,0.075387,1389.95551,1911.485152,3005.773491,0.05256,-230.412994,127.094185,7.1511,...,2.561243,-4.329453,6.81776,-6.15766,-6.214757,-4.515433,-1.850599,-0.539056,1.508026,blues
2,blues.00051.wav,0.393756,0.196723,1977.172377,1927.803692,3942.834492,0.106627,-55.579243,114.935852,-37.05283,...,12.782317,-16.528681,3.793787,-7.890871,8.477611,-4.06521,3.207442,-5.178251,-1.279523,blues
3,blues.00077.wav,0.408876,0.243217,2206.771246,2191.473506,4657.388504,0.111526,-29.01099,104.532921,-30.974205,...,10.786453,-10.558812,6.877709,-10.294858,6.967846,-10.256099,0.705014,-6.000722,1.348955,blues
4,blues.00087.wav,0.336773,0.158098,1442.190271,1870.534155,3083.414688,0.050889,-155.504929,125.638863,1.596553,...,-0.792893,-7.748057,0.413548,-7.030263,3.997679,-6.256611,0.958227,2.019821,-5.742188,blues


In [17]:
data.shape

(1000, 28)

In [18]:
# Dropping unneccesary columns
data = data.drop(['filename'],axis=1)

In [19]:
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)

In [20]:
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
len(y_train)

800

In [23]:
len(y_test)

200

In [24]:
X_train[10]

array([ 0.84236989,  0.92287085,  2.35855648,  1.95610118,  2.21430833,
        2.09035242,  0.91790935, -1.98182487,  1.06244477, -1.0872832 ,
        1.4675165 , -0.73282042,  1.16466683, -0.70750656,  0.79220495,
       -0.56810447,  0.3069385 , -0.42743785,  0.82600079, -0.3880957 ,
        0.68975655, -0.58861142,  1.32844375,  0.18445546,  1.84079664,
        0.71231544])

In [25]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))

model.add(layers.Dense(128, activation='relu'))

model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(10, activation='softmax'))






In [26]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])





In [27]:
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=128)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
test_loss, test_acc = model.evaluate(X_test,y_test)



In [29]:
print('test_acc: ',test_acc)

test_acc:  0.73


In [30]:
x_val = X_train[:200]
partial_x_train = X_train[200:]

y_val = y_train[:200]
partial_y_train = y_train[200:]

In [31]:

model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(partial_x_train,
          partial_y_train,
          epochs=30,
          batch_size=512,
          validation_data=(x_val, y_val))
results = model.evaluate(X_test, y_test)

Train on 600 samples, validate on 200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [32]:
results

[0.857143030166626, 0.67]

In [33]:
predictions = model.predict(X_test)

In [34]:
predictions[0].shape

(10,)

In [35]:
np.sum(predictions[0])

0.99999994

In [36]:
np.argmax(predictions[0])

5