In [None]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from pathlib import Path
import tensorflow as tf
import shutil
import librosa
from tqdm import tqdm_notebook
from tensorflow.keras.mixed_precision import experimental as mixed_precision
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.models import load_model

In [None]:
# Root folder of the preprocessed dataset. The cells below read
# '<dataset>/<split>/real' and '<dataset>/<split>/fake' beneath it and reuse
# the same name for the model output folder ('models/<dataset>').
dataset = 'npys_32000'

In [None]:
# Make 'mixed_float16' the global Keras dtype policy. The model built in
# lstm() pins its final activation to float32 explicitly.
# NOTE(review): this uses the `experimental` mixed-precision module imported
# at the top of the notebook; later TensorFlow releases reportedly replaced it
# with a non-experimental API - confirm against the installed TF version.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

In [None]:
def lstm(input_shape = (None, 37), lr = 1e-3):
    """Build and compile the stacked-LSTM binary classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample without the batch axis; the default
        ``(None, 37)`` leaves the first axis unspecified and expects 37
        values along the second.
    lr : float
        Value handed to the Adam optimizer.

    Returns
    -------
    A compiled Keras model with a single sigmoid output named
    ``'predictions'``, trained with binary cross-entropy and reporting
    accuracy.
    """
    drop_rate = 0.5
    # (units, return_sequences, dropout applied right after this layer)
    recurrent_stack = (
        (64, True, False),
        (64, True, True),
        (128, True, False),
        (128, True, True),
        (256, False, True),
    )

    inputs = layers.Input(input_shape)
    hidden = inputs
    for units, keep_sequence, drop_after in recurrent_stack:
        hidden = layers.LSTM(units, return_sequences=keep_sequence)(hidden)
        if drop_after:
            hidden = layers.Dropout(rate=drop_rate)(hidden)

    # Fully connected head on top of the last LSTM's output.
    hidden = layers.Dense(128, activation='relu')(hidden)
    hidden = layers.Dropout(rate=drop_rate)(hidden)
    logit = layers.Dense(1)(hidden)
    # The sigmoid is requested in float32 explicitly, independent of the
    # global dtype policy set earlier in the notebook.
    probability = layers.Activation('sigmoid', dtype='float32', name='predictions')(logit)

    net = models.Model(inputs=inputs, outputs=probability)
    net.compile(optimizer=Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])
    return net

In [None]:
from tensorflow.keras.utils import plot_model

In [None]:
def get_input(filepath, sample_rate = 16000):
    """Load one saved clip and return its standardised feature matrix.

    The clip is read with ``np.load`` and passed through seven librosa
    extractors (RMS, chroma STFT, spectral centroid, spectral bandwidth,
    spectral roll-off, zero-crossing rate, MFCC). Their outputs are stacked
    along axis 0 and transposed, then every column is standardised to zero
    mean and unit standard deviation.

    Parameters
    ----------
    filepath : str or path-like
        Location of a ``.npy`` file holding the clip.
        Presumably a 1-D mono waveform - TODO confirm against the
        preprocessing step that wrote the files.
    sample_rate : int, optional
        Sampling rate forwarded to the librosa extractors that take ``sr``.
        Defaults to 16000, the value that used to be hard-coded here.

    Returns
    -------
    list
        A one-element list holding the standardised 2-D array, so callers
        can extend a batch list with ``+=``. Presumably shaped
        (frames, 37) with librosa's default settings - TODO confirm.
    """
    clip = np.load(filepath)
    feature_blocks = [
        librosa.feature.rms(y=clip),
        librosa.feature.chroma_stft(y=clip, sr=sample_rate),
        librosa.feature.spectral_centroid(y=clip, sr=sample_rate),
        librosa.feature.spectral_bandwidth(y=clip, sr=sample_rate),
        librosa.feature.spectral_rolloff(y=clip, sr=sample_rate),
        librosa.feature.zero_crossing_rate(clip),
        librosa.feature.mfcc(y=clip, sr=sample_rate),
    ]
    arr = np.concatenate(feature_blocks, axis=0).T

    # Scaling: standardise every column. Alternatives that were tried here
    # before: min-max to [0, 1], min-max to [-1, 1], and dividing by the
    # column maximum.
    mean = arr.mean(axis=0)
    std = arr.std(axis=0)
    # A feature that is constant over the whole clip has std == 0; dividing by
    # it would fill the column with NaN/inf and poison the loss. Using 1 as
    # the divisor maps such a column to all zeros instead.
    std[std == 0] = 1
    arr = (arr - mean) / std
    return [arr]

def data_generator(dataset, split = 'train', batch_size = 8):
    """Yield class-balanced (features, labels) batches forever.

    Files are listed once from ``{dataset}/{split}/real`` and
    ``{dataset}/{split}/fake`` (names containing ``'.ipynb'`` are skipped).
    Each batch draws ``batch_size // 2`` paths per class with
    ``np.random.choice`` and interleaves them real, fake, real, fake, ...

    Parameters
    ----------
    dataset : str
        Root folder of the dataset.
    split : str
        Sub-folder to read from, e.g. ``'train'``.
    batch_size : int
        Number of samples per batch; must be even.

    Yields
    ------
    tuple
        ``(batch_x, batch_y)`` as float32 arrays; ``batch_y`` holds ``[0.]``
        for every real clip and ``[1.]`` for every fake clip.
    """
    assert batch_size % 2 == 0

    def _list_clips(label):
        # Collect the file paths of one class, ignoring notebook artefacts.
        folder = f'{dataset}/{split}/{label}'
        return [f'{folder}/{name}' for name in os.listdir(folder) if '.ipynb' not in name]

    real_files = _list_clips('real')
    fake_files = _list_clips('fake')
    per_class = batch_size // 2

    while True:
        real_picks = np.random.choice(a=real_files, size=per_class)
        fake_picks = np.random.choice(a=fake_files, size=per_class)

        features, labels = [], []
        for real_path, fake_path in zip(real_picks, fake_picks):
            features.extend(get_input(real_path))
            features.extend(get_input(fake_path))
            labels.extend([[0.], [1.]])

        # Hand back an (input, output) tuple ready to feed the network.
        yield (np.array(features, dtype=np.float32), np.array(labels, dtype=np.float32))

In [None]:
# Samples per batch, shared by the train/dev/eval generators and by the
# steps-per-epoch computation; data_generator asserts that it is even.
batch_size = 32

In [None]:
# One endless balanced-batch generator per split, all sharing batch_size.
train_gen, dev_gen, eval_gen = (
    data_generator(dataset, split_name, batch_size=batch_size)
    for split_name in ('train', 'dev', 'eval')
)

In [None]:
def _count_clips(folder):
    """Count the entries in `folder`, skipping names that contain '.ipynb'.

    This is the same filter data_generator applies, so notebook artefacts
    such as a checkpoints folder do not inflate the counts.
    """
    return sum('.ipynb' not in name for name in os.listdir(folder))

# Steps per epoch for each split: number of real clips divided by the batch size.
train_spe = _count_clips(f'{dataset}/train/real')//batch_size
dev_spe = _count_clips(f'{dataset}/dev/real')//batch_size
eval_spe = _count_clips(f'{dataset}/eval/real')//batch_size
print(train_spe, dev_spe, eval_spe)

In [None]:
# Pull one training batch and summarise the first sample's feature columns
# to sanity-check the scaling.
x, y = next(train_gen)
pd.set_option('display.float_format', '{:,.6f}'.format)
pd.DataFrame(x[0]).describe()

In [None]:
# Scratch check: rebuild the feature matrix for a single training clip by
# hand, using min-max normalisation instead of the standardisation applied
# in get_input.
clip = np.load('npys_32000/train/real/LA_T_3565683.npy')
sample_rate = 16000
rmse = librosa.feature.rms(y=clip)
chroma_stft = librosa.feature.chroma_stft(y=clip, sr=sample_rate)
spec_cent = librosa.feature.spectral_centroid(y=clip, sr=sample_rate)
spec_bw = librosa.feature.spectral_bandwidth(y=clip, sr=sample_rate)
rolloff = librosa.feature.spectral_rolloff(y=clip, sr=sample_rate)
zcr = librosa.feature.zero_crossing_rate(clip)
mfcc = librosa.feature.mfcc(y=clip, sr=sample_rate)
arr = np.concatenate([rmse,chroma_stft,spec_cent,spec_bw,rolloff,zcr,mfcc],axis=0).T
# Normalise every column to [0, 1]. A column that is constant across the
# clip has zero range; divide by 1 there so it maps to 0 rather than NaN.
col_min = arr.min(axis=0)
col_range = arr.max(axis=0) - col_min
col_range[col_range == 0] = 1
arr = (arr - col_min)/col_range

In [None]:
# Train the LSTM with checkpointing, early stopping and LR decay, all driven
# by the validation loss, then score the result on the eval split.
model_path = f'models/{dataset}'
Path(model_path).mkdir(parents=True, exist_ok = True)
# Keep only checkpoints that improve val_loss; epoch and val_accuracy are
# embedded in the file name.
checkpointer = ModelCheckpoint(
    model_path+f'/{dataset}_lstm_standardised_'+'epochs:{epoch:02d}_acc:{val_accuracy:.4f}.h5',
    monitor = 'val_loss', save_best_only = True, verbose = 1, mode = 'min',
)
# restore_best_weights=True makes the in-memory model roll back to the epoch
# with the lowest val_loss, so the evaluation below does not score weights
# from `patience` epochs after the best one.
# NOTE(review): depending on the Keras version, the rollback may only happen
# when early stopping actually triggers (not when all epochs run) - confirm.
earlystopper = EarlyStopping(
    monitor = 'val_loss', patience = 5, verbose = 1, mode = 'min',
    restore_best_weights = True,
)
# Shrink the learning rate by a factor of 1/sqrt(10) after 3 stagnant epochs.
reduceLR = ReduceLROnPlateau(
    monitor = 'val_loss', factor = 1/np.sqrt(10), patience = 3, cooldown = 1,
    verbose = 1, mode = 'min',
)
model = lstm()
history = model.fit(
    train_gen, steps_per_epoch = train_spe, verbose = 1, epochs = 50,
    callbacks = [checkpointer, earlystopper, reduceLR],
    validation_data=dev_gen, validation_steps = dev_spe,
)
print(model.evaluate(eval_gen, steps = eval_spe))