In [32]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display as display
import matplotlib.pyplot as plt
import soundfile as sf
import matplotlib.pyplot as plt

### Variable definitions and parameters
___

In [137]:
# Shared parameters for the feature-extraction cells below.
# y_raw (the argument name used by the functions below) = audio sequences from WAV files, raw data
# NOTE(review): hl is 4 * nfft, so consecutive frames do not overlap; librosa's own default hop is a
# quarter of the window. Confirm that "4 times" rather than "one quarter" is intended.
hl = 1024*4     # Hop length in samples to the next frame (4 times the nfft window)
srs = 44100   # Sample rate in Hz (time resolution)
nfft = 1024    # Number of samples per FFT window (frequency resolution); 1024 / 44100 Hz is about 23 ms. Rule of thumb: about 23 ms for speech, 93 ms for music
# Target sample rate in Hz for downsampling.
# NOTE(review): the downsampler call further down passes resr=11025 explicitly, so this value looks unused there -- confirm.
resr = 4410
nmels = 13    # Number of mel bins

### Fast Fourier Transform
___

In [118]:
# Short-time Fourier transform, frequency domain, unbiased

def stfft(y_raw, hl, nfft, window='hann'):
    """Apply ``librosa.stft`` to every audio sequence in ``y_raw``.

    Parameters
    ----------
    y_raw : iterable
        Audio sequences; each entry is handed to ``librosa.stft`` unchanged.
    hl : int
        Hop length, forwarded as ``hop_length``.
    nfft : int
        FFT window size, forwarded as ``n_fft``.
    window : str, optional
        Window specification forwarded to ``librosa.stft`` (default ``'hann'``).

    Returns
    -------
    numpy.ndarray
        The per-sequence STFT results stacked with ``np.array``; each result is
        a (frequency x frames) matrix of complex values.
    """
    spectra = [
        librosa.stft(signal, hop_length=hl, n_fft=nfft, window=window)
        for signal in y_raw
    ]
    return np.array(spectra)

In [119]:
# Get the actual frequencies of the FFT bins produced by stfft; returns an array of frequencies

def get_frequencies(srs, nfft):
    """Return the frequencies of the FFT bins for the given analysis settings.

    Parameters
    ----------
    srs : number
        Sample rate of the audio, forwarded as ``sr``.
    nfft : int
        FFT window size, forwarded as ``n_fft``.

    Returns
    -------
    numpy.ndarray
        Whatever ``librosa.fft_frequencies`` returns for these settings
        (one frequency per FFT bin).
    """
    # Pass the arguments by keyword: recent librosa releases declare sr and
    # n_fft as keyword-only, so the positional form raises TypeError there,
    # while the keyword form works on old and new releases alike.
    return librosa.fft_frequencies(sr=srs, n_fft=nfft)

In [120]:
# Short-time Fourier transform of every sequence in data_new, using the shared hl / nfft parameters.
# NOTE(review): data_new is not defined in the visible part of this notebook -- presumably it is
# loaded in another cell; confirm it exists after Restart Kernel -> Run All.
short_term_fft = stfft(data_new, hl, nfft)
short_term_fft.shape

(3, 513, 33)

In [98]:
# See the FFT bin frequencies: print how many there are, then display every 50th value
frequencies = get_frequencies(srs, nfft)
print(frequencies.shape)
frequencies[::50]

(513,)


array([    0.       ,  2153.3203125,  4306.640625 ,  6459.9609375,
        8613.28125  , 10766.6015625, 12919.921875 , 15073.2421875,
       17226.5625   , 19379.8828125, 21533.203125 ])

### Downsampling
___

In [75]:
# Basic downsampler, time domain, unbiased

def downsampler(y_raw, srs, resr):
    """Resample every audio sequence in ``y_raw`` with ``librosa.resample``.

    Parameters
    ----------
    y_raw : iterable
        Audio sequences; each entry is handed to ``librosa.resample`` unchanged.
    srs : number
        Original sample rate, forwarded as ``orig_sr``.
    resr : number
        Target sample rate, forwarded as ``target_sr``.

    Returns
    -------
    numpy.ndarray
        The resampled sequences stacked with ``np.array``.
    """
    resampled = [
        librosa.resample(signal, orig_sr=srs, target_sr=resr)
        for signal in y_raw
    ]
    return np.array(resampled)

In [101]:
# Downsampled raw data. Note: at the moment the input is an array with a list inside.
# The target rate is passed explicitly as resr=11025 rather than taken from the module-level resr.
down = downsampler(data_new, srs, resr=11025)
print(data_new[1].shape)
print(down.shape)

(132300,)
(3, 33075)


### Normalizing the naive way
___

In [76]:
# Simple normalizer

def normalize_wav(y_raw):
    """Shift and scale every sequence in ``y_raw`` with ``(x + 1) / 2``.

    With this formula a value of -1 becomes 0 and a value of 1 becomes 1.
    No clipping or range check is performed.

    Parameters
    ----------
    y_raw : iterable
        Audio sequences; each entry must support ``+ 1`` and ``/ 2``.

    Returns
    -------
    numpy.ndarray
        The transformed sequences stacked with ``np.array``.
    """
    return np.array([(signal + 1) / 2 for signal in y_raw])

### Mel Spectrogram
___

In [172]:
# Doing the calculation step by step for clarity

def mel_spectogram(y_raw, hl, nfft, nmels, srs, window='hann', fmin=0.0, fmax=11025.0):
    """Compute a mel power spectrogram for every audio sequence in ``y_raw``.

    Each sequence goes through three steps: ``librosa.stft``, squared magnitude
    (power), and ``librosa.feature.melspectrogram`` applied to that power
    spectrogram.

    Parameters
    ----------
    y_raw : iterable
        Audio sequences; each entry is handed to ``librosa.stft`` unchanged.
    hl : int
        Hop length, forwarded as ``hop_length``.
    nfft : int
        FFT window size, forwarded as ``n_fft``.
    nmels : int
        Number of mel bins, forwarded as ``n_mels``.
    srs : number
        Sample rate, forwarded as ``sr``.
    window : str, optional
        Window specification forwarded to ``librosa.stft`` (default ``'hann'``).
    fmin, fmax : float, optional
        Frequency bounds forwarded to ``librosa.feature.melspectrogram``.

    Returns
    -------
    numpy.ndarray
        The per-sequence (mel_frequency x frames) matrices stacked with
        ``np.array``; values are power per mel bin.
    """
    def _mel_power(signal):
        # Complex STFT -> power spectrogram -> mel-scaled power.
        spectrum = librosa.stft(signal, hop_length=hl, n_fft=nfft, window=window)
        power = np.abs(spectrum) ** 2
        return librosa.feature.melspectrogram(
            S=power, sr=srs, n_mels=nmels, fmin=fmin, fmax=fmax
        )

    return np.array([_mel_power(signal) for signal in y_raw])

In [169]:
# See the mel frequencies; note that we use the standard fmax here
def get_mel_frequencies(n_mels, fmin=0.0, fmax=11025.0):
    """Return the mel-scale frequencies from ``librosa.mel_frequencies``.

    Parameters
    ----------
    n_mels : int
        Number of mel bins.
    fmin : float, optional
        Lower frequency bound (default 0.0).
    fmax : float, optional
        Upper frequency bound (default 11025.0).

    Returns
    -------
    numpy.ndarray
        Whatever ``librosa.mel_frequencies`` returns for these settings.
    """
    return librosa.mel_frequencies(n_mels, fmin=fmin, fmax=fmax)

In [170]:
# Mel power spectrogram of every sequence in data_new with 13 mel bins.
# The literal nmels=13 equals the module-level nmels parameter.
mel_s = mel_spectogram(data_new, hl, nfft, nmels=13, srs=srs)
print(mel_s.shape)

(3, 13, 33)


In [171]:
# See the mel frequencies for 13 bins.
# NOTE: this rebinds `frequencies`, which an earlier cell assigned from get_frequencies(srs, nfft).
frequencies = get_mel_frequencies(13)
print(frequencies.shape)
frequencies

(13,)


array([    0.        ,   277.28108045,   554.56216089,   831.84324134,
        1119.11407321,  1489.57504283,  1982.6699184 ,  2638.99427173,
        3512.58204988,  4675.35408823,  6223.03921729,  8283.05543689,
       11025.        ])

### MFCC (Mel-Frequency Cepstral Coefficients)
___

In [None]:
# Doing calculation step wise for clarity
# Returns for every Audio Sequence a matrix of (mfcc x frames) which will contain power per mel-bin