In [167]:
import pandas as pd
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [168]:
#Create database values
snr=['+6b']
machine=['valve'] #, 'pump', 'fan', 'slide'
id=['id_00', 'id_02', 'id_04', 'id_06']
condition=['abnormal', 'normal']

In [169]:
rows=[[a, b, c,d, e ]  for a in snr for b in machine for c in id for d in condition for e in os.listdir(a+'/'+b+'/'+c+'/'+d)]


In [170]:
#Create Df
df = pd.DataFrame(rows, columns=["snr", "machine", "id","condition", "file"])

In [171]:
#Create path for each audio file, path will we use by Librosa to retrieve features. 
df['path']=df.snr+'/'+df.machine+'/'+df.id+'/'+df.condition+'/'+df.file


In [172]:
df

Unnamed: 0,snr,machine,id,condition,file,path
0,+6b,valve,id_00,abnormal,00000000.wav,+6b/valve/id_00/abnormal/00000000.wav
1,+6b,valve,id_00,abnormal,00000001.wav,+6b/valve/id_00/abnormal/00000001.wav
2,+6b,valve,id_00,abnormal,00000002.wav,+6b/valve/id_00/abnormal/00000002.wav
3,+6b,valve,id_00,abnormal,00000003.wav,+6b/valve/id_00/abnormal/00000003.wav
4,+6b,valve,id_00,abnormal,00000004.wav,+6b/valve/id_00/abnormal/00000004.wav
...,...,...,...,...,...,...
4165,+6b,valve,id_06,normal,00000987.wav,+6b/valve/id_06/normal/00000987.wav
4166,+6b,valve,id_06,normal,00000988.wav,+6b/valve/id_06/normal/00000988.wav
4167,+6b,valve,id_06,normal,00000989.wav,+6b/valve/id_06/normal/00000989.wav
4168,+6b,valve,id_06,normal,00000990.wav,+6b/valve/id_06/normal/00000990.wav


In [173]:
#get y values, sr is always 16000 (accordign to paper)
def load_sound(p):
    y, sr = librosa.load(p, sr=None)
    return y

In [174]:
# get fast Fourier transform (FFT) 
#not added to the features_df
def ftt(y):
    ft = np.abs(librosa.stft(y[:2048], hop_length = 2048+1))
    return ft

In [175]:
#get spectral centroid
#not added to the features_df
def centroid(y, sr):
    cent=librosa.feature.spectral_centroid(y=y, sr=sr)
    return cent

In [176]:
def chromagram(y, sr):
    spec=np.abs(librosa.stft(y))
    chroma=np.mean(librosa.feature.chroma_stft(S=spec, sr=sr).T,axis=0)
    return chroma

def melspec(y, sr):
    mel=np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T,axis=0)
    return mel
 
def mfcc(y, sr):
    mfc=np.mean(librosa.feature.mfcc(y=y, sr=sr).T, axis=0)
    return mfc

In [177]:
def get_features(y):
    sr=160000
    chroma=chromagram(y, sr=sr)
    mel=melspec(y, sr=sr)
    mfc_coef=mfcc(y, sr=sr)
    feature_matrix=np.array([])
    feature_matrix=np.hstack((chroma, mel, mfc_coef))
    return feature_matrix

In [178]:
def load_features():
    X, y = [], []
    for i in df.path.values:
        wave=load_sound(i)
        features=get_features(wave)
        X.append(features)
        target=i.split('/')[3]
        y.append(target)
    return np.array(X), np.array(y)

In [179]:
features, condition = load_features()
features_df=pd.DataFrame(features)
target_df=pd.DataFrame(condition)

0:11 -> chromagram ( 12 columns)  <br />
12:number?-> melspec ( 128 columns)<br />
number? : number? -> mfcc ( 20 columns)

In [190]:
features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,150,151,152,153,154,155,156,157,158,159
0,0.861475,0.922338,0.832785,0.885525,0.828158,0.793287,0.824301,0.732386,0.697710,0.660165,...,-10.530272,-9.137860,-8.667274,-1.808080,-6.800573,-6.219642,0.257486,1.497342,-3.951734,-2.955382
1,0.763870,0.852937,0.828722,0.852717,0.848668,0.843809,0.836646,0.762158,0.727656,0.718589,...,-14.294190,-15.456995,-9.299699,-3.360145,-3.474428,-0.281339,-4.229411,-1.966501,4.157502,1.135244
2,0.939885,0.766322,0.597016,0.664217,0.857982,0.649230,0.633266,0.593182,0.655766,0.763900,...,-10.398990,-10.941664,-12.466786,-1.782350,-0.597224,-5.187380,-0.188078,2.209370,-5.275289,-4.154563
3,0.787585,0.778727,0.755230,0.749415,0.757591,0.876939,0.932356,0.875719,0.791082,0.738816,...,-9.592726,-3.938925,-5.946343,-3.800183,-0.003994,1.676360,0.155179,0.732040,-5.744483,-1.820551
4,0.713702,0.668932,0.653729,0.692693,0.793410,0.953053,0.908813,0.807453,0.712799,0.653515,...,-10.074761,-4.548989,-6.987258,-1.930489,-0.286458,4.609173,0.296392,0.992723,-5.606578,-5.110719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4165,0.807396,0.747871,0.689748,0.679681,0.696236,0.855339,0.928516,0.856283,0.736254,0.674512,...,-5.779197,-5.877187,-4.560539,-5.993191,-1.797127,2.737551,-2.211200,-0.603449,-3.642722,-4.773500
4166,0.801957,0.730358,0.695457,0.696980,0.793078,0.851262,0.851622,0.836584,0.755296,0.750253,...,-7.859712,-8.102440,-3.361666,-3.220085,-0.924855,0.911005,-4.517318,-2.695435,-4.084613,-3.829922
4167,0.939088,0.874993,0.812836,0.771866,0.735158,0.785117,0.749053,0.709017,0.680094,0.677016,...,-8.287986,-9.246897,-10.444616,-10.635346,-6.745395,-7.870466,-4.863323,-0.534730,-1.738270,0.182439
4168,0.866860,0.912564,0.759544,0.741769,0.695552,0.736734,0.772516,0.710146,0.703852,0.765094,...,-9.480988,-11.618817,-11.452623,-9.841748,-8.344163,-7.533532,-10.918378,-3.100890,-1.119408,-1.216275


In [180]:
features_df.to_csv('features_df.csv')
target_df.to_csv('target_df.csv')
