In [1]:
import os
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import librosa
import scipy.signal as sig
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from PIL import Image

In [2]:
def audio_matrix(path):
    """Load every file in ``path`` as mono audio and stack the clips in a matrix.

    Each file is read with ``librosa.load(..., sr=None)`` so its native sample
    rate is kept. Clips shorter than the longest one are right-padded with
    zeros so that all rows share the same length.

    Files are visited in sorted name order, because ``os.listdir`` alone gives
    no ordering guarantee; row ``i`` therefore refers to the same file on every
    run and every machine.
    NOTE(review): plain string sorting places ``x_10_`` before ``x_2_`` --
    confirm this matches the row order of any label table that is paired with
    the matrix by position.

    Parameters
    ----------
    path : str
        Directory holding the audio files. Sub-directories are skipped.

    Returns
    -------
    audio_m_padded : numpy.ndarray
        2-D array with one zero-padded clip per row.
    fs : float
        Mean of the per-file sample rates. It is only a meaningful sample rate
        when every file shares the same one; a warning is issued otherwise.

    Raises
    ------
    ValueError
        If ``path`` contains no files.
    """
    import warnings

    fs_v = []
    audio_m = []

    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            # A nested folder cannot be decoded as audio; ignore it.
            continue

        # mono=True already down-mixes to a 1-D signal, so no extra
        # to_mono / reshape / flatten round trip is required.
        y, sr = librosa.load(file_path, sr=None, mono=True)

        fs_v.append(sr)
        audio_m.append(np.ravel(y))

    if not audio_m:
        raise ValueError("no audio files found in {!r}".format(path))

    if len(set(fs_v)) > 1:
        warnings.warn(
            "audio files have different sample rates {}; the returned mean "
            "is not a valid sample rate for all rows".format(sorted(set(fs_v)))
        )
    fs = np.mean(fs_v)

    max_length = max(len(y) for y in audio_m)
    audio_m_padded = np.array(
        [np.pad(y, (0, max_length - len(y)), 'constant') for y in audio_m]
    )

    return audio_m_padded, fs

def welch_vector(X, fs, window, nperseg, nfft):
    """Estimate the power spectral density of ``X`` with Welch's method.

    Segments are ``nperseg`` samples long and overlap by ``nperseg // 2``
    samples. Returns the ``(frequencies, psd)`` pair produced by
    ``scipy.signal.welch``.
    """
    half_overlap = nperseg // 2
    freqs, power = sig.welch(
        X,
        fs,
        window=window,
        nperseg=nperseg,
        noverlap=half_overlap,
        nfft=nfft,
    )
    return freqs, power

def psd_matrix(audio_m, fs, window):
    """Compute one Welch PSD per row of ``audio_m``.

    For an input with ``N`` columns every row is analysed with segments of
    ``N - 1`` samples and an FFT length of ``2 * N - 1``; the resulting PSD is
    stored in a row of ``N`` columns.

    Returns ``(f, psd)`` where ``f`` is the frequency axis of the last row
    processed and ``psd`` has the same shape as the first two dimensions of
    ``audio_m``.
    """
    n_signals, n_samples = audio_m.shape[0], audio_m.shape[1]
    segment_len = n_samples - 1
    fft_len = (n_samples * 2) - 1

    psd_rows = np.zeros((n_signals, n_samples))
    for row_idx, signal in enumerate(audio_m):
        # The frequency axis is identical for every row; keep the latest one.
        f, psd_rows[row_idx] = welch_vector(signal, fs, window, segment_len, fft_len)
    return f, psd_rows

def audio_classif(classif, audio_m):
    """Split the rows of ``audio_m`` according to a binary label vector.

    Parameters
    ----------
    classif : array-like
        One label per row of ``audio_m``. Rows labelled ``1`` go to the first
        output, rows labelled ``0`` to the second; any other label value is
        dropped from both outputs. Lists and float-typed labels are accepted.
    audio_m : numpy.ndarray
        Matrix whose first axis is indexed by the labels.

    Returns
    -------
    audio_one_m, audio_zero_m : numpy.ndarray
        The rows of ``audio_m`` labelled 1 and 0 respectively, in their
        original relative order.

    Notes
    -----
    NumPy raises ``IndexError`` when the number of labels differs from the
    number of rows, so a misaligned label vector cannot pass silently.
    """
    # Coerce once so that `== 1` is an element-wise mask even for plain lists
    # (a list compared with an int is just the scalar False).
    labels = np.asarray(classif)

    # Boolean masking allocates the outputs itself; no zero-filled
    # placeholders (which also failed for float label counts) are needed.
    audio_one_m = audio_m[labels == 1]
    audio_zero_m = audio_m[labels == 0]

    return audio_one_m, audio_zero_m

def firwin_filter(signal, lowcut, highcut, numtaps, fs):
    """Band-pass ``signal`` with a windowed FIR filter.

    ``scipy.signal.firwin`` designs ``numtaps`` coefficients for the band
    ``[lowcut, highcut]`` (expressed in the same units as ``fs``); the
    coefficients are then applied with ``scipy.signal.lfilter``.
    """
    band_edges = [lowcut, highcut]
    taps = sig.firwin(numtaps, band_edges, pass_zero=False, fs=fs)

    # Denominator 1.0: a pure FIR filter has no feedback terms.
    filtered = sig.lfilter(taps, 1.0, signal)
    return filtered

def audio_filter(audio_m, lowcut, highcut, numtaps, fs):
    """Run ``firwin_filter`` over every row of ``audio_m``.

    Returns a new float array with the same number of rows and columns as
    ``audio_m``; the input matrix is not modified.
    """
    n_rows, n_cols = audio_m.shape[0], audio_m.shape[1]

    filtered_signal_m = np.zeros((n_rows, n_cols))
    for row_idx, row in enumerate(audio_m):
        filtered_signal_m[row_idx] = firwin_filter(row, lowcut, highcut, numtaps, fs)
    return filtered_signal_m


def spectre_to_image(fs, signal_m):
    """Turn every signal into an 8-bit grayscale spectrogram image.

    For each row of ``signal_m`` the power spectrogram is computed with
    ``scipy.signal.spectrogram`` (default settings), converted to decibels and
    min-max scaled to ``0..255`` per image.

    Zero-power bins (for example the zero-padded tail of a short clip) are
    floored at the smallest positive power of the same spectrogram before the
    logarithm. Without that floor ``log10(0) = -inf`` becomes the minimum and
    the min-max scaling turns the whole image into NaN. Spectrograms that
    contain only positive values are scaled exactly as before.

    Parameters
    ----------
    fs : float
        Sample rate passed to ``scipy.signal.spectrogram``.
    signal_m : iterable of 1-D arrays
        One signal per row.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array holding one image per signal, each with a trailing
        channel axis of length 1. A spectrogram with no positive value, or
        with no dynamic range at all, yields an all-zero image.
    """
    image_spec = []

    for signal_v in signal_m:
        _, _, Sxx = sig.spectrogram(signal_v, fs)

        positive = Sxx[Sxx > 0]
        if positive.size == 0:
            # Completely silent signal: nothing to scale.
            Sxx_grayscale = np.zeros(Sxx.shape, dtype=np.uint8)
        else:
            # Floor exact zeros so that the dB values stay finite.
            Sxx_dB = 10 * np.log10(np.maximum(Sxx, positive.min()))

            db_min = np.min(Sxx_dB)
            db_span = np.max(Sxx_dB) - db_min
            if db_span > 0:
                Sxx_normalized = 255 * (Sxx_dB - db_min) / db_span
                Sxx_grayscale = Sxx_normalized.astype(np.uint8)
            else:
                # Flat spectrogram: avoid the 0/0 division.
                Sxx_grayscale = np.zeros(Sxx.shape, dtype=np.uint8)

        # Add the channel axis expected of a single-channel image.
        image_spec.append(np.expand_dims(Sxx_grayscale, axis=-1))

    return np.array(image_spec)

In [3]:
# Load every clip in the audio folder into one zero-padded matrix
# (one row per file) together with the sample rate reported by audio_matrix.
# NOTE(review): absolute, machine-specific path -- presumably this should
# become a configurable data directory.
path = r'C:\Audio_fm'
audio_m, fs = audio_matrix(path)
# Label table. Later cells pair its rows with the rows of audio_m purely by
# position, while audio_m follows the directory-listing order used inside
# audio_matrix.
# NOTE(review): confirm both orders agree (e.g. compare df['audio_names']
# with the file names) before trusting the class split.
# NOTE(review): the displayed head shows an "Unnamed: 0" column, which looks
# like a saved index -- confirm whether it should be read as the index.
df = pd.read_excel('audio_fm_classification.xlsx')
df.head()

Unnamed: 0,audio_names,Classification
0,100.0_0_.wav,0
1,100.0_1_.wav,0
2,100.0_2_.wav,0
3,100.0_3_.wav,1
4,100.0_4_.wav,0


In [4]:
# Band-pass settings forwarded through audio_filter to scipy.signal.firwin:
# the two cutoffs are expressed in the same units as fs, and numtaps is the
# number of FIR coefficients.
# NOTE(review): firwin requires both cutoffs to lie strictly between 0 and
# fs / 2 -- confirm highcut stays below that limit for the loaded files.
lowcut = 30
highcut = 15000
numtaps = 301
# Filter each row of audio_m; the result has the same shape as audio_m.
audio_m_filter = audio_filter(audio_m, lowcut, highcut, numtaps, fs)

In [5]:
# Labels as a NumPy vector, in spreadsheet row order.
classif = df['Classification'].to_numpy()

# One grayscale spectrogram image per filtered clip.
audio_img = spectre_to_image(fs, audio_m_filter)

# Persist the image stack; the file is written to the current working directory.
np.save('audio_img.npy', audio_img)

In [6]:
# Split the filtered clips by label: rows labelled 1 first, rows labelled 0 second.
audio_ones, audio_zeros = audio_classif(classif, audio_m_filter)

# Shapes give the number of clips per class and the common clip length.
print(audio_ones.shape, audio_zeros.shape)

(356, 83886) (849, 83886)
