In [73]:
import pandas as pd
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [78]:
# Dataset selectors: each list names the sub-folders to scan at one level of
# the <snr>/<machine>/<id>/<condition>/ directory tree.
snr = ["+6db"]
machine = ["valve"]  # not included here: 'pump', 'fan', 'slide'
# NOTE: `id` shadows the Python built-in of the same name; the name is kept
# because later cells refer to it.
id = ["id_00", "id_02", "id_04", "id_06"]
condition = ["abnormal", "normal"]

In [79]:
# Build one row per audio file: [snr, machine, id, condition, file name].
# os.listdir() returns entries in arbitrary order, so each folder listing is
# sorted to keep the row order reproducible between runs. Entries that are not
# .wav files (e.g. hidden system files) are skipped so they are never handed
# to the audio loader.
rows = [
    [a, b, c, d, e]
    for a in snr
    for b in machine
    for c in id
    for d in condition
    for e in sorted(os.listdir(a + '/' + b + '/' + c + '/' + d))
    if e.lower().endswith('.wav')
]


In [80]:
# Assemble the file index: one row per audio file, one column per path level.
df = pd.DataFrame(
    data=rows,
    columns=["snr", "machine", "id", "condition", "file"],
)

In [81]:
# Build the relative path of each audio file; this path will be used by
# librosa to load the audio and retrieve features.
df['path'] = (
    df['snr'] + '/' + df['machine'] + '/' + df['id'] + '/'
    + df['condition'] + '/' + df['file']
)


In [93]:
# Display the file index (the notebook renders the cell's last expression).
df

Unnamed: 0,snr,machine,id,condition,file,path
0,+6db,valve,id_00,abnormal,00000000.wav,+6db/valve/id_00/abnormal/00000000.wav
1,+6db,valve,id_00,abnormal,00000001.wav,+6db/valve/id_00/abnormal/00000001.wav
2,+6db,valve,id_00,abnormal,00000002.wav,+6db/valve/id_00/abnormal/00000002.wav
3,+6db,valve,id_00,abnormal,00000003.wav,+6db/valve/id_00/abnormal/00000003.wav
4,+6db,valve,id_00,abnormal,00000004.wav,+6db/valve/id_00/abnormal/00000004.wav
...,...,...,...,...,...,...
4165,+6db,valve,id_06,normal,00000987.wav,+6db/valve/id_06/normal/00000987.wav
4166,+6db,valve,id_06,normal,00000988.wav,+6db/valve/id_06/normal/00000988.wav
4167,+6db,valve,id_06,normal,00000989.wav,+6db/valve/id_06/normal/00000989.wav
4168,+6db,valve,id_06,normal,00000990.wav,+6db/valve/id_06/normal/00000990.wav


In [83]:
def load_sound(p):
    """Load the audio file at path ``p`` and return only its samples.

    ``sr=None`` asks librosa to keep the file's native sampling rate rather
    than resampling. The rate itself is discarded because it is expected to
    always be 16000 (according to the paper).
    """
    samples, _native_rate = librosa.load(p, sr=None)
    return samples

In [84]:
# Fourier-transform magnitudes of the start of a signal.
# Not added to the features_df.
def ftt(y):
    """Return the absolute value of the STFT of the first 2048 samples of ``y``.

    The hop length (2048 + 1) is longer than the 2048-sample excerpt itself.
    """
    excerpt = y[:2048]
    spectrum = librosa.stft(excerpt, hop_length=2048 + 1)
    return np.abs(spectrum)

In [85]:
# Spectral centroid helper.
# Not added to the features_df.
def centroid(y, sr):
    """Return librosa's spectral centroid of signal ``y`` at sampling rate ``sr``."""
    return librosa.feature.spectral_centroid(y=y, sr=sr)

In [91]:
def chromagram(y, sr):
    """Return the mean chroma vector of signal ``y``.

    The chromagram is computed from the STFT magnitude of ``y``, transposed,
    and averaged along axis 0, giving one mean value per chroma bin.
    """
    magnitude = np.abs(librosa.stft(y))
    chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sr).T
    return np.mean(chroma_frames, axis=0)

def melspec(y, sr):
    """Return the mean mel-spectrogram vector of signal ``y``.

    The mel spectrogram is transposed and averaged along axis 0, giving one
    mean value per mel band.
    """
    mel_frames = librosa.feature.melspectrogram(y=y, sr=sr).T
    return np.mean(mel_frames, axis=0)
 
def mfcc(y, sr):
    """Return the mean MFCC vector of signal ``y``.

    The MFCC matrix is transposed and averaged along axis 0, giving one mean
    value per coefficient.
    """
    mfcc_frames = librosa.feature.mfcc(y=y, sr=sr).T
    return np.mean(mfcc_frames, axis=0)

In [88]:
def get_features(y, sr=16000):
    """Build the feature vector of one audio signal.

    Parameters
    ----------
    y : array of audio samples, passed unchanged to ``chromagram``,
        ``melspec`` and ``mfcc``.
    sr : int, default 16000
        Sampling rate of ``y`` in Hz. The recordings are stated to always be
        sampled at 16000 Hz; the earlier hard-coded 160000 was ten times too
        large and skewed every sampling-rate-dependent feature.

    Returns
    -------
    numpy.ndarray
        The chroma, mel and MFCC vectors stacked horizontally, in that order.
    """
    chroma = chromagram(y, sr=sr)
    mel = melspec(y, sr=sr)
    mfc_coef = mfcc(y, sr=sr)
    return np.hstack((chroma, mel, mfc_coef))

In [89]:
def load_features():
    """Extract features and labels for every audio file listed in ``df.path``.

    Each path is loaded with ``load_sound`` and turned into a feature vector by
    ``get_features``. The label comes from the fourth '/'-separated component
    of the path: 1 when it is 'normal', 0 otherwise.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the stacked feature vectors and the matching labels.
    """
    feature_rows = []
    labels = []
    for audio_path in df.path.values:
        feature_rows.append(get_features(load_sound(audio_path)))
        condition_dir = audio_path.split('/')[3]
        labels.append(1 if condition_dir == 'normal' else 0)
    return np.array(feature_rows), np.array(labels)

In [92]:
# Extract the feature matrix and the labels for every file listed in df.
# NOTE: this rebinds `condition` -- previously the list of condition folder
# names -- to the array of 0/1 labels returned by load_features().
features, condition = load_features()


In [99]:
# Wrap the extracted feature matrix and the label array in DataFrames.
features_df = pd.DataFrame(data=features)
target_df = pd.DataFrame(data=condition)

Columns 0–11 → chromagram (12 columns)  <br />
Columns 12–139 → melspec (128 columns)<br />
Columns 140–159 → mfcc (20 columns)

In [100]:
# Persist the features and targets as CSV files in the working directory.
# to_csv() is called with its defaults, so the row index is written as the
# first (unnamed) column of each file.
features_df.to_csv(path_or_buf='features_df.csv')
target_df.to_csv(path_or_buf='target_df.csv')


In [32]:
# Reload the saved features and targets.
# The CSV files are written with the DataFrame index as their first column;
# index_col=0 restores it as the index so it is not read back as a spurious
# "Unnamed: 0" column among the features / targets.
X = pd.read_csv('features_df.csv', index_col=0)
y = pd.read_csv('target_df.csv', index_col=0)