In [1]:
import numpy as np
import pandas as pd
import csv
import keras.backend as K
import matplotlib.pyplot as plt
import os
import fnmatch

from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score



In [5]:
# Commented-out alternative partition sizes (presumably from a different data
# split -- not verifiable from this notebook):
# train_len = 25253
# validation_len = 9471
# test_len = 13794
# Number of sequences in each partition. The loading cell reshapes every
# partition to (<partition>_length, seq_len, ...) using these values.
train_length = 2684
val_length = 726
test_length = 1498

def load_features(filename,
                  skip_header=True,
                  skip_instname=True,
                  delim=' ',
                  num_lines=0,
                  num_cols=50):
    """Read a delimited text file of numeric rows into a 2-D float array.

    Args:
        filename: Path of the text file to read.
        skip_header: If true, the first line of the file is discarded.
        skip_instname: If true, everything up to and including the first
            ``delim`` on each line is dropped before parsing.
        delim: Separator between the values of a row.
        num_lines: Number of data rows to read. ``0`` means "count the rows
            of the file first" (via ``get_num_lines``).
        num_cols: Number of values expected on every row. Defaults to 50,
            the width this function has always used.

    Returns:
        Array of shape ``(rows_read, num_cols)``. ``rows_read`` equals
        ``num_lines`` unless the file holds fewer rows, in which case only
        the rows actually read are returned (never uninitialised memory).

    Raises:
        ValueError: If a row does not parse to exactly ``num_cols`` values
            (NumPy refuses the row assignment).
    """
    if num_lines == 0:
        num_lines = get_num_lines(filename, skip_header)

    data = np.empty((num_lines, num_cols), float)

    with open(filename, 'r') as csv_file:
        if skip_header:
            # A default keeps an empty file from raising StopIteration here.
            next(csv_file, None)
        filled = 0
        for line in tqdm(csv_file):
            if filled >= num_lines:
                # The caller asked for fewer rows than the file contains.
                break
            offset = line.find(delim) + 1 if skip_instname else 0
            data[filled, :] = np.fromstring(line[offset:], dtype=float, sep=delim)
            filled += 1

    # Drop rows that were allocated but never written.
    return data[:filled]

def load_batch_features(filename, start_index=0, amount=0):
    """Read ``amount`` consecutive rows of a space-separated feature file.

    Rows are taken from zero-based line ``start_index`` onwards and each row
    is parsed into 25 floats. Rows of the returned ``(amount, 25)`` array
    that the file does not reach are left as allocated by ``np.empty``.
    """
    stop_index = start_index + amount
    batch = np.empty((amount, 25), float)

    with open(filename, 'r') as handle:
        for line_no, row in tqdm(enumerate(handle)):
            if line_no >= stop_index:
                break
            if line_no >= start_index:
                batch[line_no - start_index, :] = np.fromstring(row, dtype=float, sep=' ')
    return batch
    
def load_batch_labels(filename, start_index=1, amount=0):
    """Read ``amount`` consecutive rows of a comma-separated label file.

    Rows are taken from zero-based line ``start_index`` onwards. The first
    parsed value of every row is discarded and the remaining values are
    stored as one row of the returned ``(amount, 3)`` array.
    """
    stop_index = start_index + amount
    batch = np.empty((amount, 3), float)

    with open(filename, 'r') as handle:
        for line_no, row in tqdm(enumerate(handle)):
            if line_no >= stop_index:
                break
            if line_no < start_index:
                continue
            parsed = np.fromstring(row, dtype=float, sep=',')
            batch[line_no - start_index, :] = parsed[1:]
    return batch
    
def get_num_lines(filename, skip_header):
    """Count the lines of a text file, optionally excluding one header line.

    Args:
        filename: Path of the file to count.
        skip_header: If true, the first line is not counted.

    Returns:
        Number of (remaining) lines; ``0`` for an empty file.
    """
    with open(filename, 'r') as csv_file:
        if skip_header:
            # The default stops an empty file from raising StopIteration.
            next(csv_file, None)
        return sum(1 for _ in csv_file)

def load_labels(filename,
                  skip_header=True,
                  skip_instname=True,
                  delim=' ',
                  num_lines=0,
                  num_cols=3):
    """Read a delimited text file of label rows into a 2-D float array.

    Args:
        filename: Path of the text file to read.
        skip_header: If true, the first line of the file is discarded.
        skip_instname: If true, everything up to and including the first
            ``delim`` on each line is dropped before parsing.
        delim: Separator between the values of a row.
        num_lines: Number of data rows to read. ``0`` means "count the rows
            of the file first" (via ``get_num_lines``).
        num_cols: Number of values expected on every row. Defaults to 3,
            the width this function has always used.

    Returns:
        Array of shape ``(rows_read, num_cols)``. ``rows_read`` equals
        ``num_lines`` unless the file holds fewer rows, in which case only
        the rows actually read are returned (never uninitialised memory).

    Raises:
        ValueError: If a row does not parse to exactly ``num_cols`` values
            (NumPy refuses the row assignment).
    """
    if num_lines == 0:
        num_lines = get_num_lines(filename, skip_header)

    data = np.empty((num_lines, num_cols), float)

    with open(filename, 'r') as csv_file:
        if skip_header:
            # A default keeps an empty file from raising StopIteration here.
            next(csv_file, None)
        filled = 0
        for line in tqdm(csv_file):
            if filled >= num_lines:
                # The caller asked for fewer rows than the file contains.
                break
            offset = line.find(delim) + 1 if skip_instname else 0
            data[filled, :] = np.fromstring(line[offset:], dtype=float, sep=delim)
            filled += 1

    # Drop rows that were allocated but never written.
    return data[:filled]
    
    
def get_scaler(x):
    """Fit a new ``StandardScaler`` on ``x`` and return the fitted scaler."""
    return StandardScaler().fit(x)

def scale_data(scaler, data):
    """Apply ``scaler.transform`` to ``data`` and return the result.

    Arrays with more than two dimensions are first reshaped to
    ``(-1, data.shape[2])``; the result is returned in that 2-D layout.
    """
    flat = data.reshape(-1, data.shape[2]) if data.ndim > 2 else data
    return scaler.transform(flat)

def ccc(gold, pred):
    """Concordance-correlation style score between ``gold`` and ``pred``.

    Both tensors have their last axis squeezed away; means and variances are
    then taken over the new last axis (with ``keepdims=True``).

    Note: the covariance term is the element-wise product of the centred
    signals and is *not* averaged over the last axis here, so the returned
    tensor keeps the shape of the squeezed inputs rather than holding one
    value per sequence.
    """
    y_true = K.squeeze(gold, axis=-1)
    y_pred = K.squeeze(pred, axis=-1)

    true_mean = K.mean(y_true, axis=-1, keepdims=True)
    pred_mean = K.mean(y_pred, axis=-1, keepdims=True)
    true_centred = y_true - true_mean
    pred_centred = y_pred - pred_mean

    numerator = K.constant(2.) * (true_centred * pred_centred)
    denominator = (
        K.mean(K.square(true_centred), axis=-1, keepdims=True)
        + K.mean(K.square(pred_centred), axis=-1, keepdims=True)
        + K.square(true_mean - pred_mean)
        + K.epsilon()  # guards against division by zero
    )
    return numerator / denominator

def ccc_loss(gold, pred):
    """Loss form of the concordance score: ``1 - ccc(gold, pred)``."""
    return K.constant(1.) - ccc(gold, pred)


### Batch loading to train LSTM-RNN

- First, load all of the data and fit a scaler on the training partition; the same scaler is then applied to every partition
- Batch the training data for training
- Batch the validation data for validation

In [7]:
data_path = './Functional_features/'

seq_len = 388
# Width of one feature row. load_features reads 50 values per row and the model
# cells build their Input layers from this same name, so the real width is set
# here instead of 25 with a "* 2" repeated at every reshape. Re-running this
# cell therefore no longer leaves n_features at a value the models cannot use.
n_features = 50


def scale_labels(labels):
    """Scale labels from [-100, 100] to [-1, 1]."""
    return labels * 0.01


def load_partition(file_stem, n_sequences):
    """Load the raw features and labels of one partition.

    Reads ``<file_stem>.txt`` and ``<file_stem>_labels.txt`` from
    ``data_path`` and returns ``(features, labels, labels_3d)``, where
    ``labels_3d`` is ``labels`` reshaped to ``(n_sequences, seq_len, 3)``.
    """
    features = load_features(data_path + file_stem + '.txt', skip_header=False, skip_instname=False)
    labels = load_labels(data_path + file_stem + '_labels.txt', skip_header=False, skip_instname=False)
    return features, labels, labels.reshape((n_sequences, seq_len, 3))


def scale_partition(scaler, features, labels_3d, n_sequences):
    """Scale one partition and shape its features for the sequence models.

    Returns ``(x_scaled, y_scaled)`` where ``x_scaled`` has shape
    ``(n_sequences, seq_len, n_features)``.
    """
    x_scaled = scale_data(scaler, features).reshape((n_sequences, seq_len, n_features))
    return x_scaled, scale_labels(labels_3d)


# The scaler is fitted on the training features only and then reused, unchanged,
# for the validation and test partitions.
print("Loading training samples...")
x_train, train_labels, y_train = load_partition('train', train_length)
print("Loading finished, Scaling...")
x_scaler = get_scaler(x_train)
x_train_scaled, y_train_scaled = scale_partition(x_scaler, x_train, y_train, train_length)
print('x_train shape:', x_train_scaled.shape)
print('y_train shape:', y_train_scaled.shape)
print("End of loading and preprocessing training samples")

print("Loading validation samples...")
x_validation, val_labels, y_validation = load_partition('validation', val_length)
print("Loading finished, Scaling...")
x_val_scaled, y_val_scaled = scale_partition(x_scaler, x_validation, y_validation, val_length)
print('x_validation shape:', x_val_scaled.shape)
print('y_validation shape:', y_val_scaled.shape)
print("End of loading and preprocessing validation samples")

print("Loading testing samples...")
x_test, test_labels, y_test = load_partition('test', test_length)
print("Loading finished, Scaling...")
x_test_scaled, y_test_scaled = scale_partition(x_scaler, x_test, y_test, test_length)
print('x_test shape:', x_test_scaled.shape)
print('y_test shape:', y_test_scaled.shape)
print("End of loading and preprocessing test samples")

Loading training samples...


1041392it [00:09, 107926.84it/s]
1041392it [00:02, 384258.72it/s]


Loading finished, Scaling...
x_train shape: (2684, 388, 50)
y_train shape: (2684, 388, 3)
End of loading and preprocessing training samples
Loading validation samples...


281688it [00:02, 108125.03it/s]
281688it [00:00, 339885.09it/s]


Loading finished, Scaling...
x_validation shape: (726, 388, 50)
y_validation shape: (726, 388, 3)
End of loading and preprocessing validation samples
Loading testing samples...


581224it [00:05, 110949.91it/s]
581224it [00:01, 374315.75it/s]


Loading finished, Scaling...
x_test shape: (1498, 388, 50)
y_test shape: (1498, 388, 3)
End of loading and preprocessing test samples


In [8]:
# headers = y_train.columns
# print(headers)
# Names of the three label dimensions. The model-building cells create one
# output layer per entry and use the entry as that layer's name.
headers = ['Arousal', 'Valence', 'Dominance']

In [9]:
# Sanity check: number of training sequences after label scaling.
len(y_train_scaled)

2684

### Building RNN-LSTM model


In [10]:
import keras.backend as K
from keras.models import Model, save_model, load_model, Sequential
from keras.layers import Input, Dense, Masking, LSTM, Dropout, TimeDistributed, Bidirectional, Flatten, Embedding, Conv1D, BatchNormalization, MaxPool1D
from tensorflow.keras.optimizers import RMSprop, Adam

from numpy.random import seed
from tensorflow.keras.utils import set_random_seed
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

### Multi-task learning

Three models are integrated into a single network, each covering one of Arousal, Valence and Dominance, respectively.

In [11]:
# train_length = 25253
# val_length = 9471
# test_length = 13794
# train_length = 41815
# val_length = 13451
# test_length = 22633

# Number of values per time step fed to the acoustic input layer.
n_features = 50
random_seed = 42
# Apply the seed (it was defined but never used before). set_random_seed seeds
# Python's random module, NumPy and TensorFlow in one call, so weight
# initialisation, dropout and shuffling repeat across runs as far as the
# backend allows.
set_random_seed(random_seed)

# Upper bound on training epochs (early stopping may end training sooner) and
# the mini-batch size passed to model.fit.
epochs = 50
batch_size = 32


def create_model(n_units1=64, n_units2=32):
    """Build and compile the acoustic multi-output LSTM model.

    The network takes ``(seq_len, n_features)`` inputs, applies a ``Masking``
    layer, two stacked sequence-returning LSTMs and dropout, and then one
    ``TimeDistributed(Dense(1))`` output per entry of ``headers`` (each output
    layer is named after its entry).

    Args:
        n_units1: Units of the first LSTM layer.
        n_units2: Units of the second LSTM layer.

    Returns:
        The compiled Keras ``Model`` (RMSprop, ``ccc_loss`` loss, ``ccc`` metric).
    """
    inputs = Input(shape=(seq_len, n_features), dtype=float)
    mask = Masking()(inputs)
    lstm_1 = LSTM(n_units1, return_sequences=True)(mask)
    lstm_2 = LSTM(n_units2, return_sequences=True)(lstm_1)
    shared = Dropout(0.3)(lstm_2)
    # Every output head reads the same shared representation.
    outputs = [TimeDistributed(Dense(1), name=name)(shared) for name in headers]

    model = Model(inputs=inputs, outputs=outputs)

    # `learning_rate` replaces the deprecated `lr` keyword.
    rmsprop = RMSprop(learning_rate=0.0001)
    model.compile(optimizer=rmsprop, loss=ccc_loss, metrics=[ccc])
    return model
#
# def create_arousal(n_units1=64, n_units2=32, dropout, bidirection=False):
#     a_input = Input(shape=(time_step, n_features), dtype=float, name='arousal_model_input')
#     mask = Masking()(a_input)
#     if bidirection:
#         a_lstm1 = Bidirectional(LSTM(n_units1, return_sequences=True))(mask)
#     else:
#         a_lstm1 = LSTM(n_units1, return_sequences=True)(mask)
#     a_lstm1 = Dropout(dropout)(a_lstm1)
#     if bidirection:
#         a_lstm2 = Bidirectional(LSTM(n_units2, return_sequences=False)(a_lstm1))
#     else:
#         a_lstm2 = LSTM(n_units2, return_sequences=False)(a_lstm1)
#     a_lstm2 = Dropout(dropout)(a_lstm2)
#     a_dense = Dense(
    
    
    

In [12]:
model = create_model(n_units1=256, n_units2=256)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

2022-07-20 22:40:28.647772: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-20 22:40:28.710258: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-20 22:40:28.710522: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-20 22:40:28.711331: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 388, 50)]    0           []                               
                                                                                                  
 masking (Masking)              (None, 388, 50)      0           ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 388, 256)     314368      ['masking[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 388, 256)     525312      ['lstm[0][0]']                   
                                                                                              

  super(RMSprop, self).__init__(name, **kwargs)


In [13]:
def transform_mtl(y, n_targets=3):
    """Split a stacked label array into one array per target.

    Args:
        y: Array indexable as ``y[:, :, i]``, i.e. with the targets on the
            third axis.
        n_targets: How many leading targets to extract. Defaults to 3, the
            number this function has always produced.

    Returns:
        List of ``n_targets`` float arrays, each shaped
        ``(y.shape[0], y.shape[1], 1)``. The arrays are copies, so writing to
        them does not modify ``y``.

    Raises:
        IndexError: If ``y`` has fewer than ``n_targets`` entries on its
            third axis.
    """
    y = np.asarray(y, dtype=float)
    # Indexing with a one-element list keeps the third axis (length 1) and
    # yields a copy; the sequence length comes from y itself, not a global.
    return [y[:, :, [target]] for target in range(n_targets)]

# One list of per-target label arrays for each partition.
y_train_dta, y_val_dta, y_test_dta = [
    transform_mtl(scaled_labels)
    for scaled_labels in (y_train_scaled, y_val_scaled, y_test_scaled)
]

In [14]:
# Stop training once val_loss has not improved (decreased) for 5 epochs in a row.
callback = EarlyStopping(monitor='val_loss', patience=5, mode='min')

# Train on the training partition; the validation partition supplies val_loss.
history = model.fit(x_train_scaled, y_train_dta, 
                    epochs=epochs, 
                    batch_size=batch_size,                     
                    validation_data=(x_val_scaled, y_val_dta), 
                    callbacks=[callback])
# plot_learningCurve(history, epochs)

Epoch 1/50


2022-07-20 22:41:27.086131: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_LEGACY_VARIANT
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_24'
2022-07-20 22:41:27.935758: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8200


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


In [15]:
# Stack the training and validation partitions (training rows first), for the
# features and for each of the three per-target label arrays.
x_combined = np.concatenate([x_train_scaled, x_val_scaled])
y_combined = [
    np.concatenate([y_train_dta[target], y_val_dta[target]])
    for target in range(3)
]
print("Combined features shape %s" % (str(x_combined.shape)))
print("Combined labels shape and target num %s and %s" % (str(y_combined[0].shape), str(len(y_combined))))

Combined features shape (3410, 388, 50)
Combined labels shape and target num (3410, 388, 1) and 3


In [85]:
# Fresh model, trained on the combined training + validation data.
model = create_model(n_units1=256, n_units2=256)

# NOTE(review): the test partition is passed as validation_data and `callback`
# is the EarlyStopping instance monitoring val_loss, so the stopping epoch is
# chosen using test data -- confirm this is intended before reporting test scores.
history = model.fit(x_combined, y_combined, 
                    epochs=epochs, 
                    batch_size=batch_size,                     
                    validation_data=(x_test_scaled, y_test_dta), 
                    callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


### Linguistic model

In [18]:
import nltk
import ssl
# If the ssl module exposes _create_unverified_context, install it as the
# default HTTPS context factory (presumably so nltk.download works where
# certificate verification fails -- TODO confirm).
# NOTE(review): this replaces ssl._create_default_https_context for the whole
# process, not only for the download below, so later HTTPS requests made via
# that default context are unverified as well.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The attribute is missing: leave the default context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Fetch the stop-word lists used by clean_text.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/changhyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
def clean_text(text):
    """Normalise one transcript string.

    Steps, in order: remove every occurrence of ``'s``, replace each listed
    punctuation character with a space, lower-case the text, and drop English
    stop words. The surviving tokens are joined with single spaces.

    Note: ``?``, ``<``, ``>`` and ``|`` are not in the replaced character set.
    """
    punctuation_chars = ',./\\;:\'@#~[{]}=+-_)(*&^%$£"!`)]'
    stop_words = set(stopwords.words('english'))

    # Map every listed punctuation character to a single space.
    to_space = str.maketrans(dict.fromkeys(punctuation_chars, " "))
    lowered = text.replace("'s", "").translate(to_space).lower()

    kept_tokens = [token for token in lowered.split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [42]:
import json

transcripts_path = './transcripts/'
sections = ['train/', 'validation/', 'test/']
segment_path = os.path.relpath('./MSP Data/Time Labels/segments.json')
# Read the timing metadata inside a context manager so the file is closed.
with open(segment_path, 'r') as segments_file:
    timing_data = json.load(segments_file)
# Segments shorter than this (End_Time - Start_Time) are skipped.
min_time = 1.0

# Transcripts whose name contains one of these ids are skipped.
excluded_ids = ('MSP-PODCAST_0153', 'MSP-PODCAST_1188_0020')


def load_transcripts(section):
    """Collect the transcripts of one section into a DataFrame.

    Every ``*.txt`` file in ``transcripts_path + section`` is visited in
    sorted order. A file is skipped when its name contains an excluded id or
    when its segment is shorter than ``min_time``.

    Returns:
        DataFrame with columns ``Filename`` (file name up to the first ``.``)
        and ``txt`` (the first line of the file, or ``''`` if it is empty).
    """
    names = []
    texts = []
    for file in sorted(fnmatch.filter(os.listdir(transcripts_path + section), '*.txt')):
        inst = file.split('.')[0]
        if any(excluded in inst for excluded in excluded_ids):
            continue
        timing = timing_data[inst]
        if timing['End_Time'] - timing['Start_Time'] < min_time:
            continue
        with open(transcripts_path + section + file) as transcript:
            # readline() returns '' for an empty file, otherwise the first line.
            first_line = transcript.readline()
        names.append(inst)
        texts.append(first_line)
    return pd.DataFrame({'Filename': names, 'txt': texts})


# `sections` is ordered train, validation, test.
df_train, df_val, df_test = [load_transcripts(section) for section in sections]
         

In [43]:
# Run the same cleaning over the transcript column of every partition.
train_cleaned, val_cleaned, test_cleaned = [
    frame['txt'].apply(clean_text) for frame in (df_train, df_val, df_test)
]
# All cleaned transcripts in train, validation, test order with a fresh 0..n-1 index.
combined_cleaned = pd.concat([train_cleaned, val_cleaned, test_cleaned], ignore_index=True)

print("Cleaned train text: %d" % (len(train_cleaned)))
print("Cleaned validation text: %d" % (len(val_cleaned)))
print("Cleaned test text: %d" % (len(test_cleaned)))
print("Cleaned all text: %d" % (len(combined_cleaned)))

Cleaned train text: 2684
Cleaned validation text: 726
Cleaned test text: 1498
Cleaned all text: 4908


In [44]:
# Every distinct whitespace-separated token across all cleaned transcripts.
vocab = {token for sentence in combined_cleaned for token in sentence.split()}
vocab_size = len(vocab)
vocab_size

8196

In [69]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Longest cleaned transcript, in tokens (reported for information only).
sentence_len = [len(sent.split()) for sent in combined_cleaned.tolist()]
sent_len = max(sentence_len)
print('Max sentence length: %d' % (sent_len))
# Token sequences are padded or truncated to seq_len (instead of a repeated
# literal) because the bimodal model concatenates the text branch with the
# acoustic branch, whose inputs have seq_len time steps; transcripts with more
# tokens than that are cut off.
text_vec = TextVectorization(max_tokens=vocab_size, 
                             pad_to_max_tokens=True, 
                             output_sequence_length=seq_len,
                             output_mode='int')
# The vocabulary is learned from the training transcripts only.
text_vec.adapt(train_cleaned)

Max sentence length: 395


In [70]:
from numpy import array

# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
embeddings_index = dict()
embedding_dim = 300
# The context manager closes the file even if parsing fails. The encoding is
# given explicitly (GloVe text files are assumed to be UTF-8) so that reading
# does not depend on the platform's default encoding.
with open('embeddings/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is "<word> <v1> <v2> ...".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [71]:
# Row r holds the GloVe vector of the r-th entry of the vectorizer vocabulary.
# Rows start out as zeros and stay zero for entries without a GloVe vector.
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for row, token in enumerate(text_vec.get_vocabulary()):
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [72]:
# Embedding layer initialised from embedding_matrix (the GloVe vectors);
# trainable=False keeps those weights fixed during training.
embedding_layer = Embedding(vocab_size + 1, 
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=seq_len,
                            trainable=False,
                            name='GloVe')

In [79]:
from keras.layers import Concatenate

def create_bimodal_model(n_units1=64, n_units2=32):
    """Build and compile the text + acoustic multi-output model.

    Two branches are built and concatenated:

    * linguistic: string input -> ``text_vec`` -> ``embedding_layer`` ->
      two sequence-returning LSTMs -> dropout;
    * acoustic: ``(seq_len, n_features)`` input -> ``Masking`` ->
      two sequence-returning LSTMs -> dropout.

    The concatenation feeds a 32-unit LSTM followed by one
    ``TimeDistributed(Dense(1))`` output per entry of ``headers``.

    Args:
        n_units1: Units of the first LSTM in each branch.
        n_units2: Units of the second LSTM in each branch.

    Returns:
        The compiled Keras ``Model`` taking ``[text, acoustic]`` inputs
        (RMSprop, ``ccc_loss`` loss, ``ccc`` metric).
    """
    # Linguistic branch.
    ling_inputs = Input(shape=(1,), dtype=tf.string)
    vec = text_vec(ling_inputs)
    embed = embedding_layer(vec)
    ling_lstm1 = LSTM(n_units1, return_sequences=True)(embed)
    ling_lstm2 = LSTM(n_units2, return_sequences=True)(ling_lstm1)
    ling_features = Dropout(0.3)(ling_lstm2)

    # Acoustic branch.
    inputs = Input(shape=(seq_len, n_features), dtype=float)
    mask = Masking()(inputs)
    lstm_1 = LSTM(n_units1, return_sequences=True)(mask)
    lstm_2 = LSTM(n_units2, return_sequences=True)(lstm_1)
    acoustic_features = Dropout(0.3)(lstm_2)

    # Fuse the branches and add one output head per label dimension. The
    # branch tensors are used directly; no intermediate Model objects are needed.
    concat = Concatenate()([ling_features, acoustic_features])
    lstm_last = LSTM(32, return_sequences=True)(concat)
    output = [TimeDistributed(Dense(1), name=name)(lstm_last) for name in headers]

    bimodal_model = Model(inputs=[ling_inputs, inputs], outputs=output)

    # `learning_rate` replaces the deprecated `lr` keyword.
    rmsprop = RMSprop(learning_rate=0.0001)
    bimodal_model.compile(optimizer=rmsprop, loss=ccc_loss, metrics=[ccc])
    return bimodal_model
    

In [82]:
# NOTE: despite its name, ling_model holds the bimodal (text + acoustic) model
# returned by create_bimodal_model.
ling_model = create_bimodal_model()

# Cleaned training and validation transcripts, training rows first, with a
# fresh 0..n-1 index.
train_val = pd.concat([train_cleaned, val_cleaned], ignore_index=True)


In [84]:
# Stop training once val_loss has not improved (decreased) for 5 epochs in a row.
callback = EarlyStopping(monitor='val_loss', patience=5, mode='min')

# NOTE(review): the test partition is passed as validation_data, so the
# early-stopping epoch is chosen using test data -- confirm this is intended
# before reporting test scores.
history = ling_model.fit([train_val, x_combined], y_combined, 
                    epochs=epochs, 
                    batch_size=batch_size,                     
                    validation_data=([test_cleaned, x_test_scaled], y_test_dta), 
                    callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50


In [76]:
# Print the layer table of the bimodal model.
ling_model.summary()

Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_21 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 388)         0           ['input_21[0][0]']               
 orization)                                                                                       
                                                                                                  
 input_22 (InputLayer)          [(None, 388, 50)]    0           []                               
                                                                                                  
 GloVe (Embedding)              (None, 388, 300)     2459100     ['text_vectorization_2[0][