In [None]:
!python3 -m pip install PyMySQL
!python3 -m pip install SQLAlchemy
!python3 -m pip install google-cloud-storage
!python3 -m pip install --upgrade --quiet scikit-sound
!python3 -m pip install --upgrade --quiet pygame
!sudo apt-get -y install ffmpeg
!sudo apt-get -y install python3-pymysql

In [1]:
from urllib.parse import quote_plus

import sqlalchemy
import numpy as np

from google.cloud import storage
from numpy.fft import fft, ifft
from sksound.sounds import Sound

from tensorflow.keras.models import load_model

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Number of spectrogram bins trimmed from every frame.  The DFT window is
# widened by 2 * highpass samples and only the first 256 of the bins returned
# by fwd_spectrogram are kept, which drops the last `highpass` bins of each
# frame (the lowest frequencies, because fwd_spectrogram returns the mirrored
# upper half of the DFT).
highpass = 25


class StreamSpectrogram:
    """Iterate over an audio file as batches of normalised spectrogram windows.

    The file is read completely in the constructor.  Every iteration step
    consumes `sound.rate * 60` samples (one minute when the rate is given in
    samples per second), converts them to a spectrogram and cuts it into
    half-overlapping windows of `win` frames.  Each window is standardised
    with its own mean and `std + 1.0`.

    A trailing batch shorter than a full batch is not yielded.

    filename: path of the audio file, passed to `Sound`
    win: number of spectrogram frames per window; windows advance by win // 2
    """

    def __init__(self, filename, win=128):
        sound = Sound(filename)
        self.batch = sound.rate * 60
        self.data = sound.data
        if len(self.data.shape) > 1:
            # More than one channel: keep only the first one.
            self.data = self.data[:, 0]
        self.win = win
        # Start at the first batch so next() also works without a prior iter().
        self.i = 0

    def __iter__(self):
        """Restart from the first batch and return the iterator itself."""
        self.i = 0
        return self

    def __next__(self):
        """Return the next batch as an array of shape (n_windows, win, bins, 1).

        Raises StopIteration once fewer than `batch` samples remain.
        """
        start = self.i * self.batch
        stop = start + self.batch
        # `<=` semantics: a final batch that is exactly full is still
        # processed; only an incomplete tail is discarded.
        if stop > len(self.data):
            raise StopIteration

        raw = self.data[start:stop]
        spec = fwd_spectrogram(raw, win=512 + 2 * highpass)[:, 0:256]
        t, d = spec.shape

        current = []
        for end in range(self.win, t, self.win // 2):
            x = np.reshape(spec[end - self.win:end], (self.win, d, 1))
            # The +1.0 keeps the division finite for constant windows.
            window = (x - np.mean(x)) / (np.std(x) + 1.0)
            current.append(window)

        self.i += 1
        return np.stack(current)
        
def fwd_spectrogram(audio, win=512, step=64):
    '''
    Compute the magnitude spectrogram of audio data

    Each frame is the absolute value of the DFT of a Hann-weighted window of
    `win` samples; only the bins win // 2 .. win - 1 are kept. Frames end at
    sample positions win, win + step, ... strictly below len(audio).

    audio: one channel audio
    win: window size for dft sliding window
    step: step size for dft sliding window

    returns: array of shape (n_frames, win - win // 2); n_frames is 0 when
             audio is too short for a single frame
    '''
    hanning = np.hanning(win)
    half = win // 2
    frames = [
        np.abs(fft(audio[end - win:end] * hanning))[half:win]
        for end in range(win, len(audio), step)
    ]
    if not frames:
        # Keep the result two-dimensional so callers can still slice columns.
        return np.empty((0, win - half))
    return np.array(frames)

In [3]:
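# Credentials were removed from this cell. Before running the cells below,
# define db_user, db_password, host and db_name here (ideally read from
# environment variables rather than typed into the notebook).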

In [4]:
# Connection settings; the four values come from the credentials cell above.
settings = {
   'user': db_user,
   'pass': db_password,
   'host': host,
     'db': db_name
}
# User name and password are URL-quoted so that characters such as '@', ':'
# or '/' in them cannot corrupt the connection URL.  No port is given, so the
# MySQL default (3306) is used.
url = 'mysql+pymysql://{user}:{password}@{host}/{db}'.format(
    user=quote_plus(settings['user']),
    password=quote_plus(settings['pass']),
    host=settings['host'],
    db=settings['db'],
)
db = sqlalchemy.create_engine(url)

In [5]:
def run_query(query):
    """Run `query` on the module-level `db` engine and return every row.

    query: statement handed unchanged to `conn.execute`
    returns: list with one dict per result row, built from `row.items()`
    """
    with db.connect() as connection:
        result = connection.execute(query)
        return [dict(record.items()) for record in result.fetchall()]

In [6]:
# One record per row of wdp_ds.audio joined with wdp_ds.encoding on the
# encoding id: keys are 'encoding', 'year' and 'filename'.
AUDIO_FILES_QUERY = """
    SELECT 
        x.encoding, y.year, x.filename 
    FROM 
        wdp_ds.audio x 
    JOIN wdp_ds.encoding y ON x.encoding = y.encoding;
"""
files = run_query(AUDIO_FILES_QUERY)

In [7]:
# Object path of every audio file, in the same order as `files`:
# audio_files/<year>/<filename>
paths = [
    "audio_files/{}/{}".format(record['year'], record['filename'])
    for record in files
]

In [8]:
# Largest id currently stored in wdp_ds.not_silent; new rows continue from it.
MAX_ID_QUERY = """
    SELECT 
        max(id)
    FROM 
        wdp_ds.not_silent
    """
max_id_rows = run_query(MAX_ID_QUERY)
# max(id) comes back as None when the table has no rows; start from 0 then.
max_id = max_id_rows[0]['max(id)'] or 0
print(max_id)

2113


In [9]:
# Locations of the trained model, the service-account key and the data bucket.
SILENCE_MODEL_PATH = '../models/lstm_v3/v3.5/sil.h5'
SERVICE_ACCOUNT_KEY = '../secret.json'
BUCKET_NAME = 'wdp-data'

# Keras model used by the labelling loop to classify spectrogram windows.
noise_classifier = load_model(SILENCE_MODEL_PATH)
# Cloud Storage bucket from which the audio files are downloaded.
client = storage.Client.from_service_account_json(SERVICE_ACCOUNT_KEY)
bucket = client.get_bucket(BUCKET_NAME)

In [10]:
# --- Configuration -----------------------------------------------------------
# Files whose position in `paths` is <= skip are not processed (used to resume
# an interrupted run).
skip = 34
# Every download overwrites this scratch file.
TMP_AUDIO_PATH = "/tmp/audio.m4a"
# One classifier window advances by 64 spectrogram frames (half of the
# 128-frame window) and one frame advances by 64 audio samples.
WINDOW_HOP_SAMPLES = 64 * 64
# Regions closer together than this many samples are merged.  Presumably
# 0.1 s at a 48 kHz sample rate; the file's real rate is not consulted.
MERGE_GAP_SAMPLES = 48000 * 0.1

# Bound parameters instead of string formatting, so quotes in a file name
# cannot break (or inject into) the statement.
INSERT_NOT_SILENT = sqlalchemy.text(
    "INSERT INTO wdp_ds.not_silent VALUES (:id, :encoding, :filename, :start, :stop)"
)


def _merge_active_windows(flags, hop=WINDOW_HOP_SAMPLES, max_gap=MERGE_GAP_SAMPLES):
    """Turn per-window flags into merged (start, stop) sample regions.

    flags: one truthy/falsy value per classifier window, in order
    hop: samples by which consecutive windows advance
    max_gap: a flagged window starting less than this many samples after the
             previous region's end is merged into that region

    returns: list of (start, stop) tuples; regions no longer than a single
             hop are discarded
    """
    regions = []
    for index, active in enumerate(flags):
        if not active:
            continue
        start = index * hop
        stop = (index + 1) * hop
        if regions and start - regions[-1][1] < max_gap:
            regions[-1] = (regions[-1][0], stop)
        else:
            regions.append((start, stop))
    return [(start, stop) for start, stop in regions if stop - start > hop]


# --- Labelling loop ----------------------------------------------------------
# NOTE(review): window indices are counted continuously across the one-minute
# batches of StreamSpectrogram, but each batch appears to yield slightly fewer
# windows than batch / WINDOW_HOP_SAMPLES, so offsets may drift relative to
# true sample positions in long files.  Left unchanged to stay consistent with
# rows already stored -- confirm before relying on exact offsets.
next_id = max_id + 1  # primary key for the next inserted row
for c, (path, file_dict) in enumerate(zip(paths, files)):
    if c <= skip:
        continue
    print(path, file_dict, c)

    with open(TMP_AUDIO_PATH, "wb") as file_obj:
        bucket.blob(path).download_to_file(file_obj)
    stream = StreamSpectrogram(TMP_AUDIO_PATH)

    # A window counts as "not noise" when the prediction rounds to 0.
    not_noise = []
    for x in stream:
        y = noise_classifier.predict(x).flatten()
        not_noise.extend([int(np.round(sample)) == 0 for sample in y])

    regions = _merge_active_windows(not_noise)
    if regions:
        # One transaction per file: either all of its regions are stored or
        # none, so a failed file can simply be re-run.
        with db.begin() as conn:
            for start, stop in regions:
                conn.execute(INSERT_NOT_SILENT, {
                    'id': next_id,
                    'encoding': file_dict['encoding'],
                    'filename': file_dict['filename'],
                    'start': start,
                    'stop': stop,
                })
                next_id += 1
    print(file_dict['encoding'], ": ", regions)

7161102 :  [(4587520, 4612096), (4722688, 4747264), (4759552, 4771840), (5214208, 5226496), (5472256, 5492736), (5505024, 5529600), (5824512, 5906432), (9125888, 9134080), (9142272, 9199616), (9822208, 9842688), (9973760, 9998336), (10014720, 10027008), (10035200, 10047488), (10829824, 10838016), (10854400, 10866688), (10899456, 10907648), (18403328, 18411520), (21164032, 21172224), (45977600, 45985792), (50511872, 50520064), (61165568, 61177856), (68014080, 68022272), (80130048, 80138240), (83447808, 83456000), (83795968, 83804160)]
audio_files/2011/07161103.m4a {'encoding': 7161103, 'year': 2011, 'filename': '07161103.m4a'} 51
Infile converted from .m4a to ".wav"
data read in!
7161103 :  [(430080, 438272), (1171456, 1179648), (1318912, 1327104), (5976064, 5984256), (5992448, 6000640), (6524928, 6553600), (6561792, 6594560), (6602752, 6615040), (6647808, 6656000), (7184384, 7196672), (7286784, 7294976), (7311360, 7319552), (8183808, 8192000), (11862016, 11874304), (12804096, 12816384)