In [None]:
import pandas as pd
import numpy as np
import os
import sys
import librosa
import librosa.display
import struct
import numpy as np 
import struct
import matplotlib.pyplot as plt

class WavFileHelper():
    """Reads basic format properties straight from a WAV file header."""

    def read_file_properties(self, filename):
        """Return ``(num_channels, sample_rate, bit_depth)`` for a WAV file.

        The values are decoded from fixed byte offsets: the first 12 bytes
        (RIFF descriptor) are skipped and the fields are read from the 36
        bytes that follow. This only gives meaningful values when the
        ``fmt `` chunk immediately follows the RIFF descriptor.

        Args:
            filename: path of the WAV file to inspect.

        Returns:
            Tuple ``(num_channels, sample_rate, bit_depth)`` of ints.

        Raises:
            OSError: if the file cannot be opened.
            struct.error: if the file is too short to contain the fields.
        """
        # The context manager closes the handle even when parsing fails;
        # previously one file handle was leaked per call.
        with open(filename, "rb") as wave_file:
            wave_file.read(12)         # skip the RIFF descriptor
            fmt = wave_file.read(36)   # bytes holding the format fields

        # All header fields are little-endian.
        num_channels = struct.unpack_from('<H', fmt, 10)[0]
        sample_rate = struct.unpack_from('<I', fmt, 12)[0]
        bit_depth = struct.unpack_from('<H', fmt, 22)[0]

        return (num_channels, sample_rate, bit_depth)
#from helpers.wavfilehelper import WavFileHelper

def extract_features(file_name, max_pad=174, n=40):
    """Load an audio file and compute MFCC features with librosa.

    Args:
        file_name: path of the audio file to load.
        max_pad: number of columns (frames) of the fixed-width
            ``mfccs_pad`` output.
        n: number of MFCC coefficients requested (``n_mfcc``).

    Returns:
        ``(mfccs, mfccs_pad, mfccsscaled)`` where ``mfccs`` is the matrix
        returned by ``librosa.feature.mfcc``, ``mfccs_pad`` is that matrix
        truncated or zero-padded on the right to ``max_pad`` columns, and
        ``mfccsscaled`` is the mean of each coefficient over all frames.
        Returns ``None`` if loading or feature extraction fails.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n)

        # np.pad rejects negative widths, so a clip with more than max_pad
        # frames used to fall into the except branch and be dropped.
        # Truncate first so the pad width is never negative.
        mfccs_fixed = mfccs[:, :max_pad]
        pad_width = max_pad - mfccs_fixed.shape[1]
        mfccs_pad = np.pad(mfccs_fixed, pad_width=((0, 0), (0, pad_width)), mode='constant')

        mfccsscaled = np.mean(mfccs.T, axis=0)

    except Exception as e:
        # Best-effort: report which file failed and why, then signal failure.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccs, mfccs_pad, mfccsscaled

# Drawing

In [None]:
# Default (width, height) handed to plt.figure(figsize=...) by the chart helpers.
FIG_SIZE = (15, 10)

def draw_chart(signal, sample_rate):
    """Plot the waveform of ``signal`` on a new figure.

    Args:
        signal: audio samples, forwarded to ``librosa.display.waveshow``.
        sample_rate: sampling rate of ``signal``, forwarded as ``sr``.
    """
    plt.figure(figsize=FIG_SIZE)
    # `sr` must be passed by keyword: current librosa releases declare it
    # keyword-only, so the positional form raises TypeError.
    librosa.display.waveshow(signal, sr=sample_rate, alpha=0.4)
    # Labels are applied after plotting so they are not replaced by any axis
    # decoration waveshow applies itself (NOTE(review): confirm for the
    # installed librosa version).
    plt.xlabel("Time(s)")
    plt.ylabel("Amplitude")
    plt.title("Wave form")
    
def draw_chart_half(signal, signal_spectrum):
    """Draw ``signal_spectrum`` against ``signal`` as a line plot on a new figure.

    The x-axis is labelled "Frequency" and the y-axis "Magnitude", so
    ``signal`` is presumably the frequency axis -- confirm against the caller.
    """
    plt.figure(figsize=FIG_SIZE)
    plt.plot(signal, signal_spectrum, alpha=0.4)
    # Decorate the axes once the curve is in place.
    plt.title("Power Spectrum")
    plt.ylabel("Magnitude")
    plt.xlabel("Frequency")
    
    
def draw_chart_spectrogram(spectrogram, sample_rate, hop_length):
    """Render ``spectrogram`` with a colour bar on a new figure.

    ``sample_rate`` and ``hop_length`` are forwarded to
    ``librosa.display.specshow`` as ``sr`` and ``hop_length``.
    """
    plt.figure(figsize=FIG_SIZE)
    librosa.display.specshow(
        spectrogram,
        sr=sample_rate,
        hop_length=hop_length,
    )
    plt.colorbar()
    plt.title("Spectrogram")
    plt.ylabel("Frequency")
    plt.xlabel("Time")
    
    
def draw_chart_log_spectrogram(log_spectrogram, sample_rate, hop_length):
    """Render ``log_spectrogram`` on a new figure with a dB-formatted colour bar.

    ``sample_rate`` and ``hop_length`` are forwarded to
    ``librosa.display.specshow`` as ``sr`` and ``hop_length``.
    """
    plt.figure(figsize=FIG_SIZE)
    librosa.display.specshow(
        log_spectrogram,
        sr=sample_rate,
        hop_length=hop_length,
    )
    # Colour bar ticks are printed as signed values followed by " dB".
    plt.colorbar(format="%+2.0f dB")
    plt.title("spectrogram * dB")
    plt.ylabel("Frequency")
    plt.xlabel("Time")
    
def draw_chart_mfccs(mfccs, signal, sample_rate, hop_length):
    """Render the ``mfccs`` matrix with a colour bar on a new figure.

    ``signal`` is accepted but not used by this function. ``sample_rate``
    and ``hop_length`` are forwarded to ``librosa.display.specshow``.
    """
    plt.figure(figsize=FIG_SIZE)
    librosa.display.specshow(
        mfccs,
        sr=sample_rate,
        hop_length=hop_length,
    )
    plt.colorbar()
    plt.title("MFCCS")
    plt.ylabel("MFCC Coefficients")
    plt.xlabel("Time")
    
def draw_chart_input_data(sound_sample):
    """Show the ``'mfccs'`` entry of the first element of ``sound_sample``.

    Draws on the current figure (no new figure is created) with a time
    x-axis, a log y-axis and a dB-formatted colour bar.
    """
    plt.title("input sound")
    first_mfccs = sound_sample[0]['mfccs']
    librosa.display.specshow(first_mfccs, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()



In [None]:
# Preprocessing Feature Extract
'''
        mfccs parameters 
        1. sample rate
            sample rate = Number of Samples / time
            ex) 90000 samples in 3 seconds -> sr = 30kHz
            
        2. n_mfcc (optional)
            n_mfcc is the number of MFCCs to return, Default = 20. 
            Up to a point, a larger n_mfcc represents the features better.
            
        3. n_fft (requires the Sample Rate)
            n_fft determines the Frame Length. The window size also defaults to n_fft. 
            When n_fft > window size, the window is zero-padded up to n_fft before the computation. 
            Therefore n_fft >= window size must hold. 
            A 25ms frame is used, based on the audible frequency range. 
            n_fft = Frame length x Sample Rate
            ex) 1. Sample rate = 4kHz, n_fft = 100 
                   Frame length = 100 / 4000 = 0.025
                2. Sample rate = 8kHz, Frame length = 0.025
                   n_fft = 200
            
        4. hop_length (requires the Sample Rate)
           hop_length is the stride between successive frames. A frame stride of 10ms is the usual default. 
           hop_length = Sample Rate x Frame stride 
           ex) 8kHz x 10ms = 8000 x 0.01 = 80 (hop_length)
'''
        # num_channels : 2, sample_rate : 44100, bit_depth : 16
def pre_extract_features(file_name):
    """Load an audio file and derive cepstral features from its log STFT spectrogram.

    The STFT uses a window of 25 ms and a stride of 10 ms, both converted to
    samples from the sample rate reported by ``librosa.load``.

    Args:
        file_name: path of the audio file to load.

    Returns:
        ``(mfccs, mfccs_pad, mfccsscaled)`` where ``mfccs`` is the matrix
        returned by ``librosa.feature.mfcc`` (``n_mfcc=50``), ``mfccs_pad``
        is a copy of it (the pad width is 0), and ``mfccsscaled`` is the
        mean of each coefficient over all frames. Returns ``None`` if
        loading or feature extraction fails.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # 25 ms analysis window and 10 ms stride, expressed in samples.
        # (The whole-signal FFT that used to be computed here was never used
        # and has been removed.)
        n_fft = int(sample_rate * 0.025)
        hop_length = int(sample_rate * 0.01)

        stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
        spectrogram = np.abs(stft)
        log_spectrogram = librosa.amplitude_to_db(spectrogram)

        # NOTE(review): librosa documents `S` as a log-power *Mel*
        # spectrogram, but a log-amplitude STFT spectrogram is passed here,
        # so the result is a cepstrum of the linear-frequency spectrum rather
        # than true MFCCs. Left unchanged because the downstream model
        # hyperparameters were chosen for these features -- confirm intent.
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, S=log_spectrogram, n_mfcc=50)

        # No fixed-width padding is applied: with a pad width of 0,
        # mfccs_pad is simply a copy of mfccs.
        pad_width = 0
        mfccs_pad = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        mfccsscaled = np.mean(mfccs.T, axis=0)

    except Exception as e:
        # Best-effort: report which file failed and why, then signal failure.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    return mfccs, mfccs_pad, mfccsscaled

In [None]:
# Load the UrbanSound8K metadata CSV into a DataFrame and preview its first rows.
metadata_csv_path = '/home/park/coding/study/Sound/urban/UrbanSound8K/metadata/UrbanSound8K.csv'
metadata = pd.read_csv(metadata_csv_path)
metadata.head()

In [None]:
# Read the WAV header properties of every clip listed in the metadata and
# extract its features, keeping both result lists aligned row-for-row.
wavfilehelper = WavFileHelper()
audio_data = []
audio_feature = []
failed_files = []
audio_root = os.path.abspath('/home/park/coding/study/Sound/urban/UrbanSound8K/audio')
for index, row in metadata.iterrows():
    file_name = os.path.join(audio_root,'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    # extract the property
    proper = wavfilehelper.read_file_properties(file_name)
    # extract the features
    label = row['classID']
    #mfccs, mfccs_pad, mfccsscaled = extract_features(file_name)
    features = pre_extract_features(file_name)
    if features is None:
        # The extractor returns None on failure; unpacking it directly would
        # raise TypeError and abort the whole run. Skip the file and record
        # nothing for it so properties and features stay aligned.
        failed_files.append(file_name)
        continue
    mfccs, mfccs_pad, mfccsscaled = features
    audio_data.append(proper)
    audio_feature.append([mfccs, mfccs_pad, mfccsscaled, label])

if failed_files:
    print("Skipped", len(failed_files), "file(s) whose features could not be extracted")

# Convert into a Panda dataframe
audio_property = pd.DataFrame(audio_data, columns=['num_channels','sample_rate','bit_depth'])
audio_feature = pd.DataFrame(audio_feature, columns=['mfccs', 'mfccs_pad','feature','class_label'])


In [None]:
# Persist both DataFrames to pickle files in the working directory ...
property_pickle = "audio_property.pkl"
feature_pickle = "audio_feature.pkl"
audio_property.to_pickle(property_pickle)
audio_feature.to_pickle(feature_pickle)

# ... and load them straight back, so later cells work from the saved copies.
audio_property = pd.read_pickle(property_pickle)
audio_feature = pd.read_pickle(feature_pickle)

In [None]:
# Draw the MFCC Spectrogram for each of the class of the sound data, comparing the graph
# https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
import matplotlib.pyplot as plt

sound_class_dic = metadata.groupby(['class', 'classID']).size()
# Look class names up by classID. Indexing the (name-sorted) group index by
# position only works while alphabetical order happens to match the ids.
class_name_by_id = {class_id: class_name for class_name, class_id in sound_class_dic.index}

# choose the first sample of sound from each class (classes without any
# extracted sample are skipped instead of shifting the later ones)
sound_sample = []
for c in range(10):
    rows_of_class = audio_feature[audio_feature['class_label'] == c]
    if not rows_of_class.empty:
        sound_sample.append(rows_of_class.iloc[0])

# Draw the MFCC Spectrogram for each class
fig = plt.figure(figsize=(20, 8))
fig.subplots_adjust(hspace = 0.5, wspace = 0.5)
# The title is attached to an implicit full-figure axes that is then hidden.
plt.title('MFCC for different class of sounds on log Hz', y = 1.05, fontsize = 18)
plt.axis('off')
for i, sample in enumerate(sound_sample):
    fig.add_subplot(2,5,i+1)
    # Title each panel from the sample's own label, not from its position.
    sample_label = sample['class_label']
    plt.title(f'{class_name_by_id.get(sample_label, sample_label)}')
    librosa.display.specshow(sample['mfccs'], x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()

for class_id in sorted(class_name_by_id):
    print(f"label {class_id} : {class_name_by_id[class_id]}")

In [None]:
# encode the categorical text data
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Build the model inputs as numpy arrays: X holds the 'feature' column
# (one vector per row), y holds the matching class labels.
# TODO: decide what the input of a CNN should be (see the 2D variants below).
X = np.array(audio_feature['feature'].tolist())
y = np.array(audio_feature['class_label'].tolist())

#X_2D = np.array(audio_feature.mfccs_pad.tolist())
#y_2D = np.array(audio_feature.class_label.tolist())

# One-hot encode the integer-encoded class labels
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
yy = to_categorical(encoded_labels)
#yy_2D = to_categorical(le.fit_transform(y_2D))

In [None]:
# prepare the input data
# Hold out 20% of the samples as a test set (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of every split.
for _split_name, _split in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(_split_name + " shape", _split.shape)
#x_train_2D, x_test_2D, y_train_2D, y_test_2D = train_test_split(X_2D, yy_2D, test_size=0.2, random_state = 42)

In [None]:
# SVM
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV



In [None]:
# Grid-search the RBF-kernel SVM over gamma and C to find the best model.
rbf_grid = {
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4, 1e-5],
    'C': [1, 10, 20, 30, 40, 50],
}
#linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
tuned_parameters = [rbf_grid]
searchpara = GridSearchCV(svm.SVC(), tuned_parameters)
searchpara.fit(x_train, y_train)
searchpara.best_params_

In [None]:
import pickle
import joblib

# Build the classifier with fixed hyperparameters (presumably the best ones
# reported by the grid search -- confirm) and train it.
SVM = svm.SVC(C=40.0, gamma=0.0001)
SVM.fit(x_train, y_train)

# Serialise the trained model: once in memory, once to disk.
save_model = pickle.dumps(SVM)
model_path = '/home/park/coding/study/Sound/urban/Sound_classification_urbansound8k-master/model/svm'
joblib.dump(SVM, model_path)

# Predict on the held-out split and report accuracy on both splits.
y_pred = SVM.predict(x_test)

train_accu = SVM.score(x_train, y_train)
test_accu = SVM.score(x_test, y_test)
print('Training Accuracy:', train_accu)
print('Test Accuracy:', test_accu)

print("y_pred : ", y_pred)
print("y_pred shape: ", y_pred.shape)

In [None]:
# wav file load 
input_audio_feature = []
input_sound_sample = []

def file_select(num_of_fold):
    """Return the path of ``air.wav`` inside the ``test<num_of_fold>`` directory."""
    FOLD_PATH = "/home/park/coding/study/Sound/urban/UrbanSound8K/audio/test"
    FILE_NAME = FOLD_PATH + str(num_of_fold) + "/air.wav"
    return FILE_NAME

input_file_name = file_select(1)

#pre_extract_feature
input_signal, input_sample_rate = librosa.load(input_file_name) #7061-6-0-0.wav file
input_features = pre_extract_features(input_file_name)
if input_features is None:
    # The extractor returns None on failure; fail here with a clear message
    # instead of an opaque "cannot unpack NoneType" TypeError.
    raise RuntimeError(f"Feature extraction failed for {input_file_name}")
input_mfccs, input_mfccs_pad, input_mfccsscaled = input_features
input_audio_feature.append([input_mfccs, input_mfccs_pad, input_mfccsscaled])


input_audio_feature = pd.DataFrame(input_audio_feature, columns=['mfccs', 'mfccs_pad','feature'])

input_audio_feature.to_pickle("input_audio_feature.pkl")

# read the variables
input_audio_feature = pd.read_pickle("input_audio_feature.pkl")

# 10 ms stride in samples; only needed by the (disabled) MFCC chart below.
hop_length = int(input_sample_rate * 0.01)

#draw_chart_mfccs(mfccs, input_signal, input_sample_rate, hop_length)

# One feature vector per row, in the layout the SVM was trained on.
x_input = np.array(input_audio_feature.feature.tolist())


y_input = SVM.predict(x_input)
print("y_pred : ",y_input[0])
print("y_pred : ",y_input)
print("y_pred shape: ",y_input.shape)

#input_accu = SVM.score(x_input, y_input)
#print('input Accuracy:', input_accu)

'''
label 0 : air_conditioner
label 1 : car_horn
label 2 : children_playing
label 3 : dog_bark
label 4 : drilling
label 5 : engine_idling
label 6 : gun_shot
label 7 : jackhammer
label 8 : siren
label 9 : street_music
'''



In [None]:
# draw confusion matrix
# new ver: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
# old ver: https://scikit-learn.org/0.21/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues,
                          percentage = False):
    """
    Print and plot the confusion matrix of ``y_true`` against ``y_pred``.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: class names, indexed by integer label; any sequence
            (list, tuple, ndarray, Series) is accepted.
        normalize: when True, each row is divided by its total so cells
            show per-true-class fractions instead of counts.
        title: figure title; a default is chosen from ``normalize`` if empty.
        cmap: matplotlib colormap used for the image.
        percentage: currently unused; kept for backward compatibility.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data. np.asarray allows plain
    # lists/tuples to be indexed with the label array as well.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    print("cm[0] : ", cm[0])
    print("y_true :", y_true)
    print("y_pred :", y_pred)
    print("classes :", classes)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that is predicted but never true has an all-zero row.
        # Keep such rows at 0 instead of producing NaN, which would also
        # poison the text-colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots(figsize = (8,8))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations; cells darker
    # than half the maximum get white text for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

# Class names ordered by classID, so that position i holds the name of
# label i (assumes the ids are contiguous from 0 -- true for the ten
# UrbanSound8K classes). This avoids the previous groupby(...).sum(), which
# aggregated every other metadata column just to read the group keys and
# ordered the names alphabetically rather than by id.
class_names = (
    metadata[['classID', 'class']]
    .drop_duplicates()
    .sort_values('classID')['class']
    .reset_index(drop=True)
)
plot_confusion_matrix(y_test, y_pred, classes=class_names, cmap=plt.cm.Blues, normalize= False)
plt.show()