In [None]:
import os
# Order CUDA devices by PCI bus ID and make only devices 8 and 9 visible
# to this process. Must run before the deep-learning framework is imported.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "8,9",
})

In [None]:
#imports 
import numpy as np
import datetime
import pandas as pd
import librosa
import soundfile as sound

import keras
import tensorflow
from keras.optimizers import SGD
from keras.callbacks import CSVLogger
from keras.utils import multi_gpu_model, to_categorical

import sys, subprocess
sys.path.append(os.path.abspath(os.getcwd()))


from DCASE_training_functions import LR_WarmRestart,MixupGenerator, categorical_focal_loss, ckpt

# Report the versions of the audio and deep-learning libraries in use.
for _lib_name, _lib in (("Librosa", librosa),
                        ("Pysoundfile", sound),
                        ("keras", keras),
                        ("tensorflow", tensorflow)):
    print(_lib_name + " version = ", _lib.__version__)

In [None]:
WhichTask = '1a'
MODE = 'DEV'    # 'DEV' uses the official data fold; 'VAL' uses all data in development set for training

# Dataset location and audio format for the selected task.
# ThisPath must keep its trailing '/' because file names are appended to it
# by plain string concatenation when the wav files are loaded.
if WhichTask == '1a':
    ThisPath = '../../commonData/dcase2020/TAU-urban-acoustic-scenes-2020-mobile-development/'
    num_audio_channels = 1
    sr = 44100
else:
    # Fail here instead of with a NameError on `sr`/`ThisPath` further down.
    raise ValueError("Unsupported WhichTask %r; only '1a' is configured" % (WhichTask,))

# Metadata files listing the clips (and labels) used for training/validation.
if MODE == 'DEV':
    TrainFile = ThisPath + 'evaluation_setup/fold1_train_st.csv'
    ValFile = ThisPath + 'evaluation_setup/fold1_evaluate_st.csv'
elif MODE == 'VAL':
    # ThisPath already ends with '/', so no extra separator is needed.
    TrainFile = ThisPath + 'meta_st.csv'
else:
    raise ValueError("Unsupported MODE %r; expected 'DEV' or 'VAL'" % (MODE,))

# Number of seconds read from each audio clip.
SampleDuration = 10

In [None]:
# Log-mel spectrogram parameters
NumFreqBins = 128                # number of mel bands (passed as n_mels)
NumFFTPoints = 2048              # FFT size (passed as n_fft)
HopLength = NumFFTPoints // 2    # hop of half an FFT window
# Number of spectrogram frames for a clip of SampleDuration seconds at sr Hz,
# rounded up so the final partial hop is counted.
clip_samples = SampleDuration * sr
NumTimeBins = int(np.ceil(clip_samples / float(HopLength)))

multi_gpus = 2    # the number of GPUs used for training

# Model architecture to train:
#   0) ResNet with no freq split
#   1) ResNet with freq split (domain adaptation is not supported)
# NOTE(review): the model-building cell passes `domain_aux` only to option 1,
# so the "not supported" remark may actually belong to option 0 -- confirm.
model_selection = 1

#training parameters
max_lr = 0.1
# Floor division keeps the batch size an integer on both Python 2 and Python 3
# (plain `/` yields the float 16.0 on Python 3).
batch_size = 32 // multi_gpus
num_epochs = 310
mixup_alpha = 0.4
crop_length = 400 #
delta = True      # append delta and delta-delta features to the log-mel input
num_filters = 28
output_num_filters_factor = 1
wd = 1e-3

num_stacks = 4    # number of residual stacks
stacking_frames = None # put None if not applied


# Applying domain adaptation OR using focal loss function
# (Set TRUE for both flags is not supported)

domain_aux = False     # whether to add an auxiliary classifier to apply mild domain adaptation
beta = 0.1            # apply weighting to this new loss

focal_loss = True    # whether to use focal loss
gamma=1.0
alpha=0.3

TEST = 1    #use 1/n data to verify the model before training; put 1 if not applied

assert not (domain_aux and focal_loss), \
    "domain_aux and focal_loss cannot both be enabled"

In [None]:
#load filenames and labels
dev_train_df = pd.read_csv(TrainFile,sep='\t', encoding='ASCII')
wavpaths_train = dev_train_df['filename'].tolist()

# The sorted unique scene names of the training split define the class index
# space. Every split is encoded against this same list so that a given class
# index always refers to the same scene.
ClassNames = np.unique(dev_train_df['scene_label'])
NumClasses = len(ClassNames)
y_train_labels = pd.Categorical(dev_train_df['scene_label'], categories=ClassNames).codes
y_train = keras.utils.to_categorical(y_train_labels, NumClasses)

if MODE == 'DEV':
    dev_val_df = pd.read_csv(ValFile,sep='\t', encoding='ASCII')
    wavpaths_val = dev_val_df['filename'].tolist()
    # Encoding the validation labels independently would shift the indices
    # whenever the validation split does not contain exactly the training classes.
    y_val_labels = pd.Categorical(dev_val_df['scene_label'], categories=ClassNames).codes
    if (y_val_labels < 0).any():
        # pd.Categorical marks values outside `categories` with code -1.
        raise ValueError("validation set contains scene labels that are absent from the training set")
    y_val = keras.utils.to_categorical(y_val_labels, NumClasses)

if domain_aux:
    # Same shared-encoding rule for the domain labels of the auxiliary classifier.
    DomainNames = np.unique(dev_train_df['source_label'])
    y_train_domain_labels = pd.Categorical(dev_train_df['source_label'], categories=DomainNames).codes
    # NOTE(review): the one-hot width is fixed at 2, i.e. 'source_label' is
    # presumably binary in the *_st.csv files -- confirm.
    y_train_domain = keras.utils.to_categorical(y_train_domain_labels, 2)
    if MODE == 'DEV':
        y_val_domain_labels = pd.Categorical(dev_val_df['source_label'], categories=DomainNames).codes
        if (y_val_domain_labels < 0).any():
            raise ValueError("validation set contains source labels that are absent from the training set")
        y_val_domain = keras.utils.to_categorical(y_val_domain_labels, 2)

In [None]:
#load wav files and get log-mel spectrograms, deltas, and delta-deltas
def deltas(X_in):
    """Return regression-style delta features along axis 2 of a 4-D array.

    For every position t on axis 2 that has two neighbours on each side,
    the result is (x[t+1] - x[t-1]) / 10 + (x[t+2] - x[t-2]) / 5.

    Args:
        X_in: 4-D array; the callers in this notebook pass
            (clips, mel bins, time frames, channels).

    Returns:
        Array with the same layout whose axis 2 is 4 entries shorter than
        that of ``X_in`` (two positions are dropped at each end).
    """
    one_step = X_in[:, :, 2:, :] - X_in[:, :, :-2, :]
    two_step = X_in[:, :, 4:, :] - X_in[:, :, :-4, :]
    # Trim the one-step differences so both terms cover the same positions.
    return one_step[:, :, 1:-1, :] / 10.0 + two_step / 5.0

# Optional smoke-test mode: keep 1/TEST of the training clips (and a quarter
# as many validation clips) and shorten the schedule to 30 epochs.
if TEST>1 and MODE == 'DEV':
    train_size = int(len(wavpaths_train)/TEST)
    # Cap the validation subsample: drawing without replacement fails if more
    # items are requested than the validation set contains.
    val_size = min(int(train_size/4), len(wavpaths_val))
    train_idx = np.random.choice(len(wavpaths_train), train_size, replace=False)
    val_idx = np.random.choice(len(wavpaths_val), val_size, replace=False)

    wavpaths_train = np.array(wavpaths_train)[train_idx]
    wavpaths_val = np.array(wavpaths_val)[val_idx]
    y_train = y_train[train_idx]
    y_val = y_val[val_idx]
    num_epochs = 30

    if domain_aux:
        y_train_domain = y_train_domain[train_idx]
        y_val_domain = y_val_domain[val_idx]


def _log_mel_spectrograms(wavpaths, num_channels, report_every, subset_name):
    """Load audio clips and return their log-mel spectrograms.

    Reads the notebook-level settings ThisPath, SampleDuration, sr,
    NumFFTPoints, HopLength, NumFreqBins and NumTimeBins.

    Args:
        wavpaths: sequence of file names, each appended to ``ThisPath``.
        num_channels: number of audio channels taken from every file.
        report_every: a progress line is printed after each multiple of
            this many files.
        subset_name: word used in the progress line (e.g. 'training').

    Returns:
        float32 array of shape
        (len(wavpaths), NumFreqBins, NumTimeBins, num_channels) holding
        log(mel_power + 1e-8).
    """
    num_files = len(wavpaths)
    mel = np.zeros((num_files, NumFreqBins, NumTimeBins, num_channels), 'float32')
    for i in range(num_files):
        # Only the first SampleDuration seconds of each clip are read.
        sig, _ = sound.read(ThisPath + wavpaths[i], stop=SampleDuration*sr)
        if len(sig.shape) == 1:
            # Mono files come back 1-D; add a channel axis so indexing is uniform.
            sig = np.expand_dims(sig, -1)

        for channel in range(num_channels):
            # `y` is passed by keyword: recent librosa releases reject a
            # positional signal argument, older ones accept the keyword too.
            mel[i, :, :, channel] = librosa.feature.melspectrogram(y=sig[:, channel],
                                                                   sr=sr,
                                                                   n_fft=NumFFTPoints,
                                                                   hop_length=HopLength,
                                                                   n_mels=NumFreqBins,
                                                                   fmin=0.0,
                                                                   fmax=sr/2,
                                                                   htk=True,
                                                                   norm=None)

        if i % report_every == report_every - 1:
            print("%i/%i %s samples done" % (i+1, num_files, subset_name))
    print("Done")

    # Small offset avoids log(0) on silent bins.
    return np.log(mel + 1e-8)


def _stack_deltas(log_mel):
    """Concatenate log-mel, delta and delta-delta features on the last axis.

    Each call to ``deltas`` shortens axis 2 by 4, so the log-mel input is
    trimmed by 4 and the first-order deltas by 2 on each side to line up
    with the delta-deltas.
    """
    first_order = deltas(log_mel)
    second_order = deltas(first_order)
    return np.concatenate((log_mel[:, :, 4:-4, :],
                           first_order[:, :, 2:-2, :],
                           second_order), axis=-1)


LM_train = _log_mel_spectrograms(wavpaths_train, num_audio_channels, 1500, 'training')
if delta:
    LM_train = _stack_deltas(LM_train)

if MODE == 'DEV':
    LM_val = _log_mel_spectrograms(wavpaths_val, num_audio_channels, 700, 'val')
    if delta:
        LM_val = _stack_deltas(LM_val)

    if model_selection == 2:
        # Pad the time axis up to a multiple of 8 by repeating the last frame.
        pre_padding_length = LM_val.shape[2] # padding may be required
        remainder = np.mod(pre_padding_length, 8)
        if remainder != 0:
            pad_size = 8 - remainder
            last_frames = np.repeat(LM_val[:, :, -1:, :], pad_size, axis=2)
            LM_val = np.concatenate((LM_val, last_frames), axis=2)

# From here on num_audio_channels is the number of model input channels
# (log-mel + delta + delta-delta per audio channel).
# NOTE: this mutates the configuration value in place; re-run the
# configuration cell before re-running this cell.
if delta:
    num_audio_channels *= 3


print ('training data dimension: ', LM_train.shape)
if MODE == 'DEV':
    print ('validation data dimension: ', LM_val.shape)

print ('training labels dimension: ', y_train.shape)
if MODE == 'DEV':
    print ('validation labels dimension: ', y_val.shape)

In [None]:
# Create a time-stamped output directory (named after task, mode and epoch
# count) in the current working directory for logs and model files.
stamp = datetime.datetime.now().strftime('%y%m%d%H%M')
tag = stamp + '_' + WhichTask + '_' + MODE + '_'+ str(num_epochs)
savedir = os.path.join(os.getcwd(), tag)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("Model path: %s" % savedir)
try:
    os.makedirs(savedir)
except OSError:
    # An already existing directory is fine; anything else is a real error.
    if not os.path.isdir(savedir):
        raise

In [None]:
#create and compile the model

# Keep a copy of the network definition next to the training outputs.
subprocess.check_call(["cp", "DCASE2020_ResNet.py", savedir])

if model_selection == 0: #resnet with no split

    from DCASE2020_ResNet import model_resnet_no_split
    model = model_resnet_no_split(NumClasses,
                         input_shape =[NumFreqBins,None,num_audio_channels],
                         num_filters = num_filters,
                         wd=wd,
                         num_stacks = num_stacks,
                         output_num_filters_factor = output_num_filters_factor)

elif model_selection == 1: #resnet with split

    from DCASE2020_ResNet import model_resnet
    model = model_resnet(NumClasses,
                         input_shape =[NumFreqBins,None,num_audio_channels],
                         num_filters = num_filters,
                         wd=wd,
                         num_stacks = num_stacks,
                         output_num_filters_factor = output_num_filters_factor,
                         stacking_frames = stacking_frames,
                         domain_aux = domain_aux)

else:
    # Without this branch `model` stays undefined and the cell dies with a
    # NameError on model.summary().
    raise ValueError("Unsupported model_selection %r; expected 0 or 1" % (model_selection,))


model.summary()
if multi_gpus > 1:
    # NOTE(review): `model` is rebound to the multi-GPU wrapper, so later
    # saves go through the wrapper rather than the single-device model --
    # confirm this is what the inference code expects.
    model = multi_gpu_model(model,gpus=multi_gpus)
    model.summary()


# Pick the loss configuration; the optimizer and metrics are the same for all.
if domain_aux:
    # Scene classification plus the auxiliary domain classifier, weighted by beta.
    compile_loss = ['categorical_crossentropy','binary_crossentropy']
    compile_loss_weights = [1-beta, beta]
elif focal_loss:
    compile_loss = [categorical_focal_loss(gamma=gamma, alpha=alpha)]
    compile_loss_weights = None
else:
    compile_loss = 'categorical_crossentropy'
    compile_loss_weights = None

model.compile(loss=compile_loss,
              loss_weights=compile_loss_weights,
              optimizer =SGD(lr=max_lr,decay=0, momentum=0.9, nesterov=False),
              metrics=['accuracy'])
    

In [None]:
#set learning rate schedule
# True division before the ceil: with an integer batch size, Python 2's `/`
# floors first, which makes np.ceil a no-op and drops the final partial batch.
steps_per_epoch = np.ceil(LM_train.shape[0] / float(batch_size))
lr_scheduler = LR_WarmRestart(nbatch=steps_per_epoch, Tmult=2, T0 = 10,
                              initial_lr=max_lr, min_lr=max_lr*1e-4,
                              epochs_restart = [11.0, 31.0, 71.0, 151.0, 311.0, 631.0])

log_path = savedir + "/log.csv"
log_cb = CSVLogger(log_path)

ckpt_path=savedir+'/model-{epoch:02d}.h5'
# Bind the callback to its own name: assigning it to `ckpt` would shadow the
# imported factory and make this cell fail when it is run a second time.
ckpt_cb = ckpt(filepath=ckpt_path, ckpts=[2, 70, 150])

callbacks = [lr_scheduler,log_cb,ckpt_cb]


#create data generator
mixup_kwargs = dict(batch_size=int(batch_size),   # guard against a float batch size
                    alpha=mixup_alpha,
                    crop_length=crop_length)
if domain_aux:
    # Second label set for the auxiliary domain classifier.
    mixup_kwargs['y_train_2'] = y_train_domain
TrainDataGen = MixupGenerator(LM_train, y_train, **mixup_kwargs)()


# Validation data is only available in 'DEV' mode; 'VAL' trains on everything.
if MODE == 'DEV':
    if domain_aux:
        validation_data = (LM_val, [y_val, y_val_domain])
    else:
        validation_data = (LM_val, y_val)
elif MODE == 'VAL':
    validation_data = None
else:
    # Previously an unknown mode silently skipped training altogether.
    raise ValueError("Unsupported MODE %r; expected 'DEV' or 'VAL'" % (MODE,))

#train the model
history = model.fit_generator(TrainDataGen,
                              validation_data=validation_data,
                              epochs=num_epochs,
                              verbose=1,
                              workers=4,
                              max_queue_size = 100,
                              callbacks=callbacks,
                              steps_per_epoch=int(steps_per_epoch)
                              )

In [None]:
# Persist the model as it stands after the last training epoch.
final_model_path = savedir + '/model.h5'
model.save(final_model_path)