## End-of-speech classification

In [1]:
import os # for reading the .json files
import json # for reading the .json files (dialog dataset)
import numpy as np

In [2]:
# from MultiWOZ-Parser; reading the names of the files for training, testing, validation datasets
# https://github.com/jojonki/MultiWOZ-Parser/blob/master/parser.py

def load_json(data_file):
    if os.path.isfile(data_file):
        with open(data_file, 'r') as read_file:
            data = json.load(read_file)
            return data

def load_list_file(list_file):
    with open(list_file, 'r') as read_file:
        dialog_id_list = read_file.readlines()
        dialog_id_list = [l.strip('\n') for l in dialog_id_list]
        return dialog_id_list
    return

In [3]:
dialog_data_file = '../MULTIWOZ/data.json'
dialog_data = load_json(dialog_data_file)
dialog_id_list = list(set(dialog_data.keys()))

valid_list_file = '../MULTIWOZ/valListFile.json'
test_list_file = '../MULTIWOZ/testListFile.json'

valid_id_list = list(set(load_list_file(valid_list_file)))
test_id_list = load_list_file(test_list_file)
train_id_list = [did for did in dialog_id_list if did not in (valid_id_list + test_id_list)]

train_data = [v for k, v in dialog_data.items() if k in train_id_list]
valid_data = [v for k, v in dialog_data.items() if k in valid_id_list]
test_data = [v for k, v in dialog_data.items() if k in test_id_list]

In [4]:
# merging all datasets together
data = train_data + valid_data + test_data

In [5]:
import re # for regex

def text_tokens(text):
    # transforming to lowercase
    text = text.lower()
    # replacing whitespace characters with spaces
    text = re.sub("\\s", " ", text)
    # removing everything that is not a letter
    text = re.sub("[^a-zA-Z ']", "", text)
    
    # splitting string into array based on spaces
    tokens = text.split(' ')
    
    # removing empty strings from the tokens array
    tokens = list(filter(('').__ne__, tokens))
    
    return tokens

In [6]:
# from MultiWOZ-Parser
# https://github.com/jojonki/MultiWOZ-Parser/blob/master/parser.py

def get_dst_diff(prev_d, crnt_d):
    assert len(prev_d) == len(crnt_d)
    diff = {}
    for ((k1, v1), (k2, v2)) in zip(prev_d.items(), crnt_d.items()):
        assert k1 == k2
        if v1 != v2: # updated
            diff[k2] = v2
    return diff

def get_lines(d):
    assert 'log' in d
    assert 'goal' in d
    domains = []
    ignore_keys_in_goal = ['eod', 'messageLen', 'message'] # eod (probably) means the user archieved the goal. 
    for dom_k, dom_v  in d['goal'].items():
        if dom_v and dom_k not in ignore_keys_in_goal: # check whether contains some goal entities
            domains.append(dom_k)
            
    lines = []
    
    prev_d = None
    for i, t in enumerate(d['log']):
        spk = 'Usr' if i % 2 == 0 else 'Sys' # Turn 0 is always a user's turn in this corpus.
        if spk == 'Sys':
            if prev_d is None:
                prev_d = t['metadata']
            else:
                crnt_d = t['metadata']
                dst_diff = get_dst_diff(prev_d, crnt_d)
                prev_d = crnt_d

        lines.append(text_tokens(t['text']))

    return lines

In [7]:
def to_array(data):
    # extracting dialogs from data
    dialogs = [get_lines(dialog) for dialog in data]

    s = []
    # extracting individual dialogs
    for sentences in dialogs:
        # extracting each conversation turn
        for sentence in sentences: 
            s.append(sentence)
        
    return s

In [8]:
# list of individual conversation turns from MultiWOZ dataset
data_full = to_array(data)

In [9]:
from collections import Counter # counting the occurrences of words in the dataset

# creating a vocabulary of top 3000 most frequent words in the dataset
vocab = []
for sentence in data_full:
    for word in sentence:
        vocab.append(word)

VOCAB_SIZE=3000 # constant for the vocabulary size

d = Counter(vocab) # count number of occurrences
q = d.most_common()[0:VOCAB_SIZE] # create vocabulary of VOCAB_SIZE most common words
vocab_top = [q[i][0] for i in range(VOCAB_SIZE)] # extracting the words from Counter structure

---

In [10]:
# From Deep Learning for Text Processing Workshop at Machine Learning Prague 2018
# https://github.com/rossumai/mlprague18-nlp

# Creating dictionary of each word in the pre-trained GloVe embeddings, saving its location indexes 

EMBEDDING_DIM = 50

GLOVE_DIR = "../glove"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [11]:
# a matrix that is indexed by our vocabulary, containing GloVe embedding for each vocabulary element
embedding_matrix = np.zeros((len(vocab_top) + 1, EMBEDDING_DIM))
for i, word in enumerate(vocab_top):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros; also, [0] is reserved for padding.
        embedding_matrix[i + 1] = embedding_vector

In [12]:
def vocab_indices_vector(tokens):
    vector = [0] * seq_cutoff
    
    for i, t in enumerate(tokens):
        try:
            vector[i] = vocab_top.index(t) + 1 # reserving 0 for padding
        except:
            pass # ignore missing words
        
    return vector

In [48]:
import random

def split_data(data):
    X = []
    Y = []
    for i in range(len(data)):
        # first half of the dataset is made out of interrupted conversation turns 
        if (len(data[i]) > 4) and (i <= len(data)//2):
            # picking random point for splitting the conversation turn
            l = random.randrange(1, len(data[i]) - 3)
            # splitting data
            X.append(data[i][:l])
            # adding 0 to the target list -> 0 -- interrupted turn 
            Y.append(0)
                
        # second half of the dataset is made out of full conversation turns
        else:
            # adding the full uninterrupted conversation turn
            X.append(data[i])
            # adding 1 to the target list -> 1 -- uninterrupted turn 
            Y.append(1)
            
    # shuffling the dataset
    c = list(zip(X, Y, data))
    random.shuffle(c)
    X, Y, data = zip(*c)
    
    
    # vectorizing the data
    X = [vocab_indices_vector(x) for x in X]
    
    return X,Y,data

In [49]:
lengths = sorted([len(x) for x in data_full]) # sorted lengths of the all reviews
percentile = 0.90 # we are looking at reviews that are short, the 10% making the long reviews
seq_cutoff = lengths[-1]
print(
    'Longest: %d, Average: %f, Median: %d, %d%% percentile: %d tokens' % 
    (lengths[-1], np.mean(lengths), lengths[int(len(lengths)*0.5)], percentile*100, seq_cutoff)
)

Longest: 61, Average: 12.765652, Median: 12, 90% percentile: 61 tokens


In [50]:
# ratio of words not included in the GloVe matrix
1. * np.count_nonzero(np.all(embedding_matrix == 0, axis=1)) / len(vocab)

0.0002371172240746815

In [51]:
X, Y, data_clean = split_data(data_full)

In [32]:
# splitting into training and validating parts
# 2/3 for training, 1/3 for validating
s = (len(X)//3)*2

X_train = X[:s]
Y_train = Y[:s]

X_val = X[s+1:]
Y_val = Y[s+1:]

---

In [33]:
# NN from Deep Learning for Text Processing Workshop at Machine Learning Prague 2018
# https://github.com/rossumai/mlprague18-nlp

from tensorflow.keras.layers import Activation, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input, Dropout
from tensorflow.keras.models import Model
import numpy as np

class GloveCNNAwesomeSentimentModel(object):
    def __init__(self, N=256, size=3):
        self.model = self.create(N, size)
        self.model.summary()
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(seq_cutoff,), name='seq_input') 
        seq_embedded = Embedding(input_dim=len(vocab_top) + 1, output_dim=EMBEDDING_DIM,
                                 input_length=seq_cutoff)(seq_indices)
        seq_conv = Conv1D(N, size, activation='relu')(Dropout(0.2)(seq_embedded)) # dropout - 0.2
        max_conv = GlobalMaxPooling1D()(seq_conv)
        hidden_repr = Dense(N, activation='relu')(max_conv)
        sentiment = Dense(1, activation='sigmoid')(Dropout(0.2)(hidden_repr))

        return Model(inputs=[seq_indices], outputs=[sentiment])

    def train(self, X, y, X_val, y_val):
        print('Fitting...')
        return self.model.fit(np.array(X), np.array(y), 
                              validation_data=(np.array(X_val), np.array(y_val)),
                              epochs=7, verbose=1)

    def predict(self, X):
        return self.model.predict(np.array(X))
    
sentiment = GloveCNNAwesomeSentimentModel()
history = sentiment.train(
    X_train, Y_train, X_val, Y_val
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seq_input (InputLayer)       [(None, 61)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 61, 50)            150050    
_________________________________________________________________
dropout_2 (Dropout)          (None, 61, 50)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 59, 256)           38656     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0   

In [47]:
# test of the network
for i in range(200,210):
    if np.random.uniform() > 0.5:
        if (len(data_clean[i]) > 4):
            # picking random point for splitting the conversation turn
            l = random.randrange(1, len(data_clean[i]) - 3)

            test_tokens = text_tokens(' '.join(str(e) for e in data_clean[i][:l]))
            prediction = sentiment.predict([vocab_indices_vector(test_tokens)])[0]

            print(0, prediction, ' '.join(str(e) for e in data_clean[i][:l]))
    else:
        test_tokens = text_tokens(' '.join(str(e) for e in data_clean[i]))
        prediction = sentiment.predict([vocab_indices_vector(test_tokens)])[0]

        print(1, prediction, ' '.join(str(e) for e in data_clean[i]))

1 [0.9977252] no that's ok how about colleges can you recommend a good one in the centre of town
1 [0.7784515] i am actually leaving from cote
1 [0.90955937] the tr arrives at will this be okay
1 [0.9627397] certainly what is your departure location and your arrival location
0 [0.8038908] that is all for now
0 [0.99434185] i found restaurants please tell me what type of food you like to help narrow the search
0 [0.0003536] no but could you give me
1 [0.99978024] have a lovely day goodbye
0 [0.2711899] your contact number


In [35]:
# test of the network
sentiment.model.evaluate(X_val[10000:20000], Y_val[10000:20000], verbose=0, batch_size=100)

[0.2454347674548626, 0.9069]