In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import base as features
import scipy.io.wavfile as wav
import os
import subprocess
import csv
import numpy as np
import pickle
import tensorflow as tf
import time
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error,f1_score
from scipy.stats import pearsonr
from collections import Counter
from imblearn.over_sampling import SMOTE

#***********************************************************
#              PREPROCESSING CONFIGURATION
#***********************************************************
# Input/output locations. Code in this notebook appends file names to these
# strings directly, so keep the trailing '/'.
directory_in_audio = '/home2/plopez/BBDD/AVEC2014/Audio/audio/'
directory_label = '/home2/plopez/BBDD/AVEC2014/Testing/Depression/'
directory_out = '/home/projects/fgnovo/workspace/Tesis/BBDD/AVEC2014/'

# Arguments forwarded to features.logfbank.
# NOTE(review): winlen/winstep are presumably in seconds -- confirm against the `base` module.
winlen, winstep = 0.02, 0.01
nfilt, nfft = 40, 512
lowfreq = 0
preemph = 0.97

# Utterance length; it is multiplied by the WAV sample rate to get a sample count.
time_segment = 3
# Not referenced by the visible cells; equal to nfilt.
log_mel_filterbank_features = 40
# Given to ffmpeg as both the -ab and the -ar value.
rate_mpeg = "16k"



THE NEXT TWO CELLS ONLY NEED TO BE EXECUTED THE FIRST TIME, OR WHENEVER YOU CHANGE THE PREPROCESSING CONFIGURATION

In [2]:
def process_audio_file_logmel(nfile_audio, directory_out,directory_in_audio,label, time_segment,rate_mpeg,total_data_log_coef,data_matrix):
    """Convert one recording to mono WAV, cut it into utterances and extract log filterbank features.

    The recording is transcoded with ffmpeg into a temporary ``salida.wav`` inside
    ``directory_out``; that file is always removed afterwards. The decoded signal is
    split into consecutive, non-overlapping segments of ``time_segment * rate``
    samples (a trailing partial segment is dropped) and ``features.logfbank`` is
    applied to each one using the module-level settings ``winlen``, ``winstep``,
    ``nfilt``, ``nfft``, ``lowfreq`` and ``preemph``.

    Args:
        nfile_audio: file name of the recording inside ``directory_in_audio``.
        directory_out: folder used for the temporary WAV file.
        directory_in_audio: folder containing the recording.
        label: label stored next to every utterance of this recording.
        time_segment: utterance length, multiplied by the WAV sample rate.
        rate_mpeg: value passed to ffmpeg for both ``-ab`` and ``-ar``.
        total_data_log_coef: list of ``[features, label]`` pairs collected so far
            (a falsy value starts a new list).
        data_matrix: array with all feature rows collected so far; an array whose
            first dimension is 0 means "nothing collected yet".

    Returns:
        ``(total_data_log_coef, data_matrix)`` extended with this recording.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    path_in = os.path.join(directory_in_audio, nfile_audio)
    path_wav = os.path.join(directory_out, "salida.wav")
    # Argument list instead of a shell string: file names with spaces or shell
    # metacharacters are passed through untouched. "-y" overwrites a stale
    # salida.wav left by an interrupted run instead of waiting for confirmation.
    command = ["ffmpeg", "-y", "-i", path_in,
               "-ab", rate_mpeg, "-ac", "1", "-ar", rate_mpeg,
               "-vn", path_wav]
    try:
        subprocess.check_call(command)
        (rate, data) = wav.read(path_wav)
    finally:
        # Remove the temporary file even when conversion or reading failed.
        if os.path.exists(path_wav):
            os.remove(path_wav)

    len_utterance = int(time_segment*rate)
    num_utterance = len(data)//len_utterance
    print ("Num utterances: {0}".format(num_utterance))

    if not total_data_log_coef:
        total_data_log_coef = []
    new_blocks = []
    for i in range(num_utterance):
        data_utterance = data[i*len_utterance:(i+1)*len_utterance]
        data_log_coef  = features.logfbank(data_utterance,samplerate = rate,winlen = winlen,winstep = winstep,nfilt = nfilt,nfft = nfft,lowfreq = lowfreq,highfreq = rate/2,preemph = preemph)
        total_data_log_coef.append([data_log_coef,label])
        new_blocks.append(data_log_coef)

    # Stack once per recording rather than once per utterance to avoid
    # re-copying the accumulated matrix on every iteration.
    if new_blocks:
        if data_matrix.shape[0] != 0:
            new_blocks.insert(0, data_matrix)
        data_matrix = np.vstack(new_blocks)
    return(total_data_log_coef,data_matrix)

def process_labels(directory_label):
    """Read the integer label stored in every file of ``directory_label``.

    Each file name is split on '_' and its first two fields are concatenated
    (without separator) to form the dictionary key, e.g. ``'218_3_x.csv'`` ->
    ``'2183'``. The value is the integer found on the first line of the file.

    Args:
        directory_label: folder containing one label file per recording; a
            trailing path separator is optional.

    Returns:
        dict mapping the concatenated key to the integer label.

    Raises:
        IndexError: if a file name contains no '_'.
        ValueError: if the first line of a file is not an integer.
    """
    list_labels = {}
    for nfile in os.listdir(directory_label):
        lname = nfile.split('_')
        name = lname[0]+lname[1]
        # Context manager so the handle is released for every file, even on a parse error.
        with open(os.path.join(directory_label, nfile)) as f:
            list_labels[name] = int(f.readline())
    return(list_labels)
    

In [6]:
# Extract features for every recording, standardise them with the global
# per-coefficient mean / standard deviation and cache the result on disk.
files = os.listdir(directory_in_audio)
labels = process_labels(directory_label)
total_data_log_coef = list()
data_matrix = np.array([])
for i, nfile in enumerate(files, 1):
    # Same key scheme as process_labels: first two '_'-separated fields of the name.
    lname = nfile.split('_')
    name = lname[0]+lname[1]
    label = labels[name]
    print ('Processing file {0} with label {1}, number {2}'.format(nfile,label,i))
    (total_data_log_coef,data_matrix) = process_audio_file_logmel(nfile, directory_out,directory_in_audio,label, time_segment,rate_mpeg,total_data_log_coef,data_matrix)
print ('Procesado de archivos finalizados')
print ('Standarization......')
mean = np.mean(data_matrix,axis = 0)
standard_deviation = np.std(data_matrix,axis=0)
# A constant coefficient has zero deviation; divide by 1 there instead of
# producing inf/NaN features.
standard_deviation = np.where(standard_deviation == 0, 1.0, standard_deviation)
# Release the stacked matrix; only the statistics are needed from here on.
data_matrix = []
for utterance in total_data_log_coef:
    utterance[0] = (utterance[0]-mean)/standard_deviation
print('Saving data')
# Context manager guarantees the cache file is flushed and closed even if pickling fails.
with open(directory_out+'datos_procesados','wb') as output:
    pickle.dump(total_data_log_coef,output)
total_data_log_coef=[]
print('Finished')



Processing file 218_3_Northwind_audio.mp4 with label 22, number 1
Num utterances: 13
Processing file 246_2_Freeform_audio.mp4 with label 29, number 2
Num utterances: 36
Processing file 241_2_Freeform_audio.mp4 with label 44, number 3
Num utterances: 2
Processing file 217_2_Northwind_audio.mp4 with label 30, number 4
Num utterances: 22
Processing file 368_1_Northwind_audio.mp4 with label 11, number 5
Num utterances: 13
Processing file 358_1_Freeform_audio.mp4 with label 22, number 6
Num utterances: 37
Processing file 219_3_Northwind_audio.mp4 with label 19, number 7
Num utterances: 13
Processing file 218_2_Northwind_audio.mp4 with label 24, number 8
Num utterances: 13
Processing file 236_1_Northwind_audio.mp4 with label 23, number 9
Num utterances: 14
Processing file 242_3_Northwind_audio.mp4 with label 9, number 10
Num utterances: 13
Processing file 310_4_Northwind_audio.mp4 with label 19, number 11
Num utterances: 13
Processing file 364_1_Northwind_audio.mp4 with label 0, number 12
Nu

Execute the next cell to load the data:

In [2]:
# Fractions of the shuffled data set assigned to each split.
# por_train is not referenced by the visible code: the training split is
# whatever remains after the test and validation indices are taken.
por_test, por_validation, por_train = 0.1, 0.1, 0.8
# Width of the one-hot label vectors.
num_labels = 46

def ohenc(num_labels,a):
    """One-hot encode the integer class indices in ``a``.

    Returns a float array of shape ``(a.shape[0], num_labels)`` that is zero
    everywhere except for a 1 at column ``a[i]`` of row ``i``.
    """
    n_rows = a.shape[0]
    one_hot = np.zeros((n_rows, num_labels))
    row_index = np.arange(n_rows)
    one_hot[row_index, a] = 1
    return one_hot
# Load the cached features, shuffle them and build the test / validation /
# train arrays fed to the network.
def _to_cnn_input(samples, dims):
    """Stack per-utterance feature matrices into one 4-D array.

    ``samples`` is a 1-D object array of matrices of shape ``dims``; the result
    has shape ``(len(samples), dims[1], dims[0], 1)``.
    """
    n_samples = len(samples)
    flat = np.concatenate(np.concatenate(samples))
    # Fortran-order reshape of the flattened data followed by the transpose
    # puts the sample index first and swaps the two feature axes.
    cube = flat.reshape(dims[1], dims[0], n_samples, order='F')
    return np.expand_dims(np.transpose(cube, (2,0,1)), axis=3)

# Pickle data is binary: open with 'rb' (text mode breaks on Python 3 and on
# platforms that translate line endings) and close the handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load caches written by this notebook.
with open(directory_out+'datos_procesados','rb') as f:
    # Rows are [feature_matrix, label]; dtype=object keeps them as an (N, 2) object array.
    datos = np.array(pickle.load(f), dtype=object)
length = len(datos)
print('Labels')
print(Counter(datos[:,1]))
print ('Number of samples: {0}'.format(length))
# NOTE(review): the permutation is unseeded, so the split changes on every run,
# and it is drawn over individual utterances, so segments of one recording can
# land in different splits.
index = np.random.permutation(length)
index_test =index[:int(por_test*length)]
data_test = datos[index_test][:,0]
label_test = datos[index_test][:,1]
print ('Number of test samples: {0}'.format(data_test.shape))
index_validation = index[index_test.shape[0]:(index_test.shape[0]+int(por_validation*length))]
data_validation = datos[index_validation][:,0]
label_validation = datos[index_validation][:,1]
print ('Number of validation samples: {0}'.format(data_validation.shape))
# Everything not used for test or validation becomes training data.
index_train = index[index_test.shape[0]+index_validation.shape[0]:]
data_train = datos[index_train][:,0]
label_train = datos[index_train][:,1]
print ('Number of train samples: {0}'.format(data_train.shape))
data = []
dimensions = (data_test[0].shape[0],data_test[0].shape[1])
lentest=len(data_test)
lentrain=len(data_train)
lenvalidation=len(data_validation)
data_test = _to_cnn_input(data_test, dimensions)
data_train = _to_cnn_input(data_train, dimensions)
data_validation = _to_cnn_input(data_validation, dimensions)
label_train= ohenc(num_labels,label_train.astype(int))
label_test= ohenc(num_labels,label_test.astype(int))
label_validation= ohenc(num_labels,label_validation.astype(int))
print('Dimension of data_test: {}'.format(data_test.shape))
print('Dimension of data_train: {}'.format(data_train.shape))
print('Dimension of data_validation: {}'.format(data_validation.shape))
print('Dimension of label_test: {}'.format(label_test.shape))
print('Dimension of label_train: {}'.format(label_train.shape))
print('Dimension of label_validation: {}'.format(label_validation.shape))

Labels
Counter({0: 671, 3: 319, 9: 279, 6: 259, 1: 241, 5: 206, 11: 195, 12: 189, 16: 169, 17: 155, 14: 150, 4: 148, 7: 144, 25: 141, 29: 136, 21: 134, 19: 133, 33: 123, 15: 121, 22: 120, 24: 120, 27: 113, 10: 110, 23: 93, 43: 93, 30: 83, 34: 77, 32: 76, 8: 74, 18: 72, 2: 68, 41: 54, 37: 47, 13: 46, 35: 43, 31: 34, 39: 32, 26: 27, 20: 17, 28: 17, 45: 16, 44: 15, 40: 14})
Number of samples: 5374
Number of test samples: (537,)
Number of validation samples: (537,)
Number of train samples: (4300,)
Dimension of data_test: (537, 40, 299, 1)
Dimension of data_train: (4300, 40, 299, 1)
Dimension of data_validation: (537, 40, 299, 1)
Dimension of label_test: (537, 46)
Dimension of label_train: (4300, 46)
Dimension of label_validation: (537, 46)


{'ldepression': 800, 'ndepression': 2949, 'mdepression': 782, 'hdepression': 843}


## DNN configuration

In [3]:
# Mini-batch size used by the training loop.
batch_size = 21
# Forwarded as the second argument of tf.nn.dropout for the training tower.
dropout = 0.7

#CNN CONFIGURATION
# Shape of a single input example, taken from the first training sample.
input_size = data_train[0].shape
num_cnn_nodes = 1024          # output channels of the first convolution
num_cnn_1x1 = 400             # output channels of the 1x1 convolution
fl_kernel_shape = (40, 5)     # kernel of the first convolution
f2_kernel_shape = (40, 3)     # not referenced by the visible graph code
num_labels = 46
num_layers = 2                # not referenced by the visible graph code
num_pooling = 2               # not referenced by the visible graph code

#MULTILAYER PERCEPTRON
num_layersp = 2               # not referenced by the visible graph code
# NOTE(review): 59200 == 1 * 148 * 400, which looks like the flattened 1x1-conv
# output for a (40, 299) input (VALID 40x5 conv -> 1x295, stride-2 SAME pooling
# -> 1x148, 400 channels); it has to be recomputed if input or kernel sizes change.
dimension_l1 = (59200, 5024)
dimension_l2 = (5024, 512)
# Intended optimizer step size.
learning_rate = 1e-4

#TENSORBOARD
logs_path = "/home/projects/fgnovo/workspace/Tesis/cnn-log"

print(input_size)

def accuracy(predictions, labels):
  """Percentage of samples whose arg-max class equals the arg-max of the label row.

  Classes are read along axis 2 of ``predictions`` and axis 1 of ``labels``;
  the count of matches is divided by ``predictions.shape[1]``.
  """
  hits = np.argmax(predictions, axis=2) == np.argmax(labels, axis=1)
  n_samples = predictions.shape[1]
  return 100.0 * np.sum(hits) / n_samples
def MAE(predictions,labels):
    """Mean absolute difference between predicted and true class indices.

    Predicted indices are the arg-max along axis 2 of ``predictions`` (first
    entry of axis 0); true indices are the arg-max along axis 1 of ``labels``.
    """
    predicted = np.argmax(predictions, axis=2)[0]
    expected = np.argmax(labels, axis=1)
    total_error = np.sum(np.abs(predicted - expected))
    return total_error / predictions.shape[1]
def RMSE(predictions,labels):
    """Root of sklearn's mean squared error between true and predicted class indices.

    True indices are the arg-max along axis 1 of ``labels``; predicted indices
    are the arg-max along axis 2 of ``predictions`` (first entry of axis 0).
    """
    expected = np.argmax(labels, axis=1)
    predicted = np.argmax(predictions, axis=2)[0]
    mse = mean_squared_error(expected, predicted)
    return mse ** 0.5
def pearson(predictions,labels):
    """Result of scipy's ``pearsonr`` between predicted and true class indices.

    Predicted indices are the arg-max along axis 2 of ``predictions`` (first
    entry of axis 0); true indices are the arg-max along axis 1 of ``labels``.
    """
    predicted = np.argmax(predictions, axis=2)[0]
    expected = np.argmax(labels, axis=1)
    return pearsonr(predicted, expected)
def _activation_summary(x):
    """Register two summaries for tensor ``x``, tagged with the name of its op.

    ``<op name>/activations`` is a histogram of ``x`` and ``<op name>/sparsity``
    is the scalar returned by ``tf.nn.zero_fraction(x)``.
    """
    tag_prefix = x.op.name
    tf.histogram_summary(tag_prefix + '/activations', x)
    tf.scalar_summary(tag_prefix + '/sparsity', tf.nn.zero_fraction(x))

(40, 299, 1)


# DNN MODEL

## DNN GRAPH

In [4]:
#*****************************************
#           DNN GRAPH
#*****************************************
# Builds the training tower (with dropout), its loss and optimizer, the merged
# TensorBoard summary op, and two evaluation towers that reuse the same variables.
graph = tf.Graph()
with graph.as_default():
    #********* Input data***************************
    #***********************************************
    # One pair of placeholders is fed with training, validation or test batches.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(None, input_size[0], input_size[1], 1))
    tf_train_labels = tf.placeholder(tf.float32, shape=(None,num_labels))

    def build_cnn_graph(data,dropout):
        """Build one tower: conv + ReLU -> max-pool -> 1x1 conv -> two ReLU dense layers -> logits.

        Args:
            data: input tensor for the first convolution.
            dropout: value passed as the second argument of every tf.nn.dropout
                call (the keep probability in the TensorFlow API this notebook
                targets); the evaluation towers pass 1.0.

        Returns:
            Un-normalised logits with ``num_labels`` columns.

        Variables are obtained with tf.get_variable, so a second call only works
        after ``reuse_variables()`` has been set on the variable scope.
        """
        #CNN
        with tf.variable_scope("conv1"):
            layer1_weights = tf.get_variable("weights",[fl_kernel_shape[0],fl_kernel_shape[1],1, num_cnn_nodes],
                                                 initializer=tf.random_normal_initializer(0,0.1))
            layer1_biases = tf.get_variable("biases",[num_cnn_nodes],
                                            initializer = tf.constant_initializer(0.0))
        #CNN1x1
        with tf.variable_scope("conv1x1"):
            layer1_conv1x1 = tf.get_variable("weights",[1,1 ,num_cnn_nodes,num_cnn_1x1],
                                             initializer=tf.random_normal_initializer(0,0.01))
        #MLP
        with tf.variable_scope("mlp1"):
            w1mlp = tf.get_variable("weights",[dimension_l1[0],dimension_l1[1]],
                                        initializer=tf.random_normal_initializer(0,0.006))
            b1mlp = tf.get_variable("biases",[dimension_l1[1]],
                                       initializer = tf.constant_initializer(0.0))
        with tf.variable_scope("mlp2"):
            w2mlp = tf.get_variable("weights",[dimension_l2[0],dimension_l2[1]],
                                        initializer=tf.random_normal_initializer(0,0.014))
            b2mlp = tf.get_variable("biases",[dimension_l2[1]],
                                       initializer = tf.constant_initializer(0.0))
        #SOFTMAX
        with tf.variable_scope("softmax"):
            weight_sm = tf.get_variable("weights",[dimension_l2[1], num_labels],
                                            initializer=tf.random_normal_initializer(0,0.044))
            bias_sm = tf.get_variable("biases",[num_labels],
                                       initializer = tf.constant_initializer(0.0))

        #********* Model *******************************
        #***********************************************
        #CNN
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='VALID')
        hidden = tf.nn.relu(conv + layer1_biases,name="conv")
        _activation_summary(hidden)
        # Pooling only along the third axis (window and stride of 2).
        hidden_pool = tf.nn.max_pool(hidden, ksize=[1, 1, 2, 1], strides=[1, 1, 2, 1], padding='SAME')
        hidden_d = tf.nn.dropout(hidden_pool,dropout)
        #CNN 1x1: channel reduction from num_cnn_nodes to num_cnn_1x1
        conv1x1= tf.nn.conv2d(hidden_d,layer1_conv1x1,[1,1,1,1],padding='SAME')
        # Flatten everything except the batch axis; the product must equal dimension_l1[0].
        shape = conv1x1.get_shape().as_list()
        reshape = tf.reshape(conv1x1,[-1, shape[1] *shape[2]*shape[3]])
        #MLP
        omlp1 = tf.nn.relu(tf.matmul(reshape,w1mlp)+b1mlp,name="mlp1")
        _activation_summary(omlp1)
        domlp1 = tf.nn.dropout(omlp1,dropout)
        omlp2 = tf.nn.relu(tf.matmul(domlp1,w2mlp)+b2mlp,name="mlp2")
        _activation_summary(omlp2)
        domlp2 = tf.nn.dropout(omlp2,dropout)
        #SOFTMAX
        return(tf.matmul(domlp2, weight_sm) + bias_sm)


    logits = build_cnn_graph(tf_train_dataset,dropout)
    # The variables are created inside build_cnn_graph, so their histograms can
    # only be registered after the first tower exists (before that call
    # tf.trainable_variables() is empty and the loop does nothing).
    # NOTE(review): mlp1/weights holds dimension_l1[0] * dimension_l1[1] values,
    # so this histogram is costly every time `merged` is evaluated.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)
    #TENSORBOARD
    # Merge now, before the evaluation towers register their own activation
    # summaries; otherwise running `merged` on a training batch would also
    # execute both evaluation towers.
    merged = tf.merge_all_summaries()
    #ERROR
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    #TRAINING
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    #PREDICTIONS
    # From here on tf.get_variable returns the existing variables instead of creating new ones.
    tf.get_variable_scope().reuse_variables()
    valid_prediction = tf.nn.softmax(build_cnn_graph(tf_train_dataset,1.0))
    test_prediction = tf.nn.softmax(build_cnn_graph(tf_train_dataset,1.0))
    

        
        
        
        
        

In [5]:
def train_network(num_steps=2301):
    """Train the network for ``num_steps`` mini-batches and print test metrics.

    Every step runs the optimizer, the loss and the merged summaries on one
    mini-batch taken from ``data_train`` / ``label_train`` and writes the
    summaries to ``logs_path``. Every 50 steps the minibatch loss and the
    validation accuracy are printed. After training, accuracy, MAE, RMSE,
    Pearson correlation and weighted F1 are printed for the test set.

    Args:
        num_steps: number of optimisation steps to run.

    Returns:
        None; all results are printed.
    """
    print('vamos')
    print('sigue')

    with tf.Session(graph=graph) as session:
        print('Inicializando variables')
        tf.initialize_all_variables().run()
        print('Inicializando summaries')
        summary_writer = tf.train.SummaryWriter(logs_path, graph=session.graph)
        print('Initialized')
        try:
            for step in range(num_steps):
                # Slide a window of batch_size samples over the training set, wrapping around.
                offset = (step * batch_size) % (label_train.shape[0] - batch_size)
                batch_data = data_train[offset:(offset + batch_size), :, :, :]
                batch_labels = label_train[offset:(offset + batch_size), :]
                feed_dict = {tf_train_dataset : batch_data, tf_train_labels: batch_labels}
                _, l,summary = session.run([optimizer, loss,merged], feed_dict=feed_dict)
                summary_writer.add_summary(summary, step )
                if (step % 50 == 0):
                    print('Minibatch loss at step %d: %f' % (step, l))
                    feed_dict={tf_train_dataset:data_validation.astype(np.float32)}
                    # Fetching a one-element list gives the metrics the leading axis they index with [0] / axis=2.
                    v_prediction = session.run([valid_prediction],feed_dict=feed_dict)
                    print('Validation accuracy: %.1f%%' % accuracy(np.array(v_prediction), label_validation))
        finally:
            # Flush and release the event file even if training is interrupted.
            summary_writer.close()

        feed_dict={tf_train_dataset:data_test.astype(np.float32)}
        t_prediction = np.array(session.run([test_prediction],feed_dict=feed_dict))
        predicted_2d = np.argmax(t_prediction,2)
        predicted = predicted_2d[0]
        expected = np.argmax(label_test,1)
        print('Test accuracy: %.1f%%' % accuracy(t_prediction, label_test))
        print('Test MAE: %.2f' % MAE(t_prediction,label_test))
        print(predicted.shape)
        print(expected.shape)
        # The value comes from RMSE(); the label used to read "RSE".
        print('Test RMSE: %.2f' % RMSE(t_prediction,label_test))
        p = pearson(t_prediction,label_test)
        print('Test Pearson Correlation:')
        print(p)
        print(expected)
        print(predicted_2d)
        f1=f1_score(expected,predicted,average='weighted')
        print("F1: {}".format(f1))
         

        
            

In [6]:
print('Empezando')
train_network()   

Empezando
vamos
sigue
Inicializando variables
Inicializando summaries
Initialized
Minibatch loss at step 0: 3.850353
Validation accuracy: 4.5%
Minibatch loss at step 50: 3.400596
Validation accuracy: 12.8%
Minibatch loss at step 100: 3.355978
Validation accuracy: 15.3%
Minibatch loss at step 150: 3.228923
Validation accuracy: 19.7%
Minibatch loss at step 200: 3.340618
Validation accuracy: 23.1%
Minibatch loss at step 250: 2.276955
Validation accuracy: 19.9%
Minibatch loss at step 300: 1.604687
Validation accuracy: 28.1%
Minibatch loss at step 350: 2.041882
Validation accuracy: 28.9%
Minibatch loss at step 400: 1.864711
Validation accuracy: 30.7%
Minibatch loss at step 450: 1.895155
Validation accuracy: 29.2%
Minibatch loss at step 500: 0.930715
Validation accuracy: 30.4%
Minibatch loss at step 550: 1.028863
Validation accuracy: 26.8%
Minibatch loss at step 600: 0.943340
Validation accuracy: 33.1%
Minibatch loss at step 650: 0.710616
Validation accuracy: 32.0%
Minibatch loss at step 700

  'precision', 'predicted', average, warn_for)
