In [1]:
import os
import json

def build_label_dict():
    """Load the per-chat boolean labels from ``./Frames-dataset/labels.txt``.

    Every non-blank line is expected to have the form ``<key>,<value>``.

    Returns:
        dict: maps the text before the comma to ``True`` when the text after
        the comma, stripped of surrounding whitespace, is exactly ``"True"``
        and to ``False`` for any other value.

    Raises:
        ValueError: if a non-blank line does not contain exactly one comma.
    """
    label_path = r"./Frames-dataset/labels.txt"
    d = {}
    # The context manager closes the file even when a malformed line raises.
    with open(label_path, encoding='utf-8') as label_file:
        for line in label_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            k, v = line.split(",")
            d[k] = v.strip() == "True"
    return d




In [2]:
def gen_chat_data():
    """Read every chat file under ``./Frames-dataset/chats`` into a dictionary.

    Returns:
        dict: keyed by the file name with its last five characters removed
        (i.e. the ``.json`` suffix). Each value is a dict with

        * ``"turns"``: a list with one dict per turn holding ``"ti"`` (the
          turn's timestamp minus the previous turn's timestamp, 0 for the
          first turn), ``"text"`` and ``"author"``;
        * ``"label"``: the boolean returned by ``build_label_dict`` for that
          key.

    Raises:
        KeyError: if a chat has no entry in the label file.
    """
    chat_path = r"./Frames-dataset/chats"
    chats = {}
    d = build_label_dict()
    for filename in os.listdir(chat_path):
        # Close each file as soon as it is parsed instead of leaking handles.
        with open(os.path.join(chat_path, filename), encoding='utf-8') as chat_file:
            chat = json.load(chat_file)

        turns = []
        # Also guard against an empty "turns" list, which would otherwise
        # raise IndexError when reading the first timestamp.
        if 'turns' in chat and chat['turns']:
            tsp = chat['turns'][0]['timestamp']
            for turn in chat['turns']:
                ts = turn['timestamp'] - tsp
                tsp = turn['timestamp']
                turns.append({"ti":ts,"text":turn["text"],"author":turn["author"]})

        # Strip the 5-character extension to obtain the chat id.
        chat_id = filename[:-5]
        chats[chat_id] = {"turns": turns, "label": d[chat_id]}
    return chats


In [3]:
import numpy as np
# import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re



import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
import theano

import functools
import nltk
nltk.download('punkt')
from nltk import tokenize

# Number of word slots per sentence: size of the last axis of the data
# tensors and the ``input_length`` of the Embedding layers.
MAX_SENT_LENGTH = 40
# Number of sentence slots (per chat in create_data, per turn in
# create_data_with_turns); further sentences are dropped.
MAX_SENTS = 20
# Passed to the Tokenizer as ``nb_words``; word indices at or above this
# value are skipped when filling the data tensors.
MAX_NB_WORDS = 20000
# Number of turn slots per chat in the turn-level tensors.
MAX_TURNS = 60
# Width of the word vectors; matches the ``glove.6B.100d.txt`` file that is read.
EMBEDDING_DIM = 100
# Fractions of the shuffled samples held out for validation and for testing.
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2
# Directory that contains ``glove.6B.100d.txt``.
GLOVE_DIR = "./data/glove"

Using Theano backend.


[nltk_data] Downloading package punkt to /home/efrat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def create_data(chats, l=None, flag=False, length=0):
    """Encode every chat as a ``(MAX_SENTS, MAX_SENT_LENGTH)`` grid of word indices.

    The text of a chat is its turns joined by newlines, split into sentences
    with NLTK and into words with Keras' ``text_to_word_sequence``. Word
    indices come from a Tokenizer fitted on those same chat texts.

    Args:
        chats: mapping of chat id to a dict with a ``"turns"`` list whose
            items have a ``"text"`` entry.
        l: number of rows to allocate in the result. Defaults to the number
            of chats. (The original body referenced ``l`` without defining it
            and the caller in this notebook passes it positionally, matching
            ``create_data_with_turns``.)
        flag: when true, only the first ``length`` turns of each chat are used.
        length: number of leading turns to keep when ``flag`` is true.

    Returns:
        numpy.ndarray: int32 array of shape ``(l, MAX_SENTS, MAX_SENT_LENGTH)``;
        unused slots stay 0.
    """
    texts = []
    for idx in chats.keys():
        chat_turns = chats[idx]["turns"]
        if flag:
            chat_turns = chat_turns[:length]
        texts.append("\n".join(x["text"] for x in chat_turns))

    if l is None:
        l = len(texts)

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index

    data = np.zeros((l, MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    # NOTE: ``i`` is the chat index here. The earlier ``if flag and i==length:
    # break`` stopped after ``length`` *chats* and left every later row
    # all-zero; truncation to ``length`` turns already happened above.
    for i, text in enumerate(texts):
        for j, sent in enumerate(tokenize.sent_tokenize(text)[:MAX_SENTS]):
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= MAX_SENT_LENGTH:
                    break
                index = word_index.get(word)
                # Skip words unknown to the tokenizer or outside the vocabulary cap.
                if index is not None and index < MAX_NB_WORDS:
                    data[i, j, k] = index
                    k += 1
    return data



In [5]:
def create_data_with_turns(chats, l,flag=False,length=0):
    """Encode every chat as a turns x sentences x words grid plus per-turn features.

    Args:
        chats: mapping of chat id to a dict with a ``"turns"`` list whose
            items have ``"text"``, ``"ti"`` and ``"author"`` entries.
        l: number of rows (chats) to allocate in the data tensor.
        flag: when true, only the first ``length`` turns of each chat are used.
        length: number of leading turns to keep when ``flag`` is true.

    Returns:
        tuple: ``(data, vecs)`` where ``data`` is an int32 array of shape
        ``(l, MAX_TURNS, MAX_SENTS, MAX_SENT_LENGTH)`` holding word indices
        (0 where unused) and ``vecs`` holds, per chat, one
        ``[len(text), ti, author_flag]`` list per used turn, with
        ``author_flag`` 0 for author "wizard" (case-insensitive) and 1
        otherwise. ``vecs`` is not capped at ``MAX_TURNS``.
    """
    texts = []
    chats_txt=[]
    vecs=[]
    for idx in chats.keys():
        chat_turns = chats[idx]["turns"]
        if flag:
            # Restrict everything (tokenizer input, features and data) to the
            # first ``length`` turns. Previously only the data tensor was
            # truncated, so the tokenizer saw a different corpus than the
            # caller's tokenizer and the features covered later turns too.
            chat_turns = chat_turns[:length]
        turns=[]
        vec=[]
        for turn in chat_turns:
            vec.append([len(turn["text"]),turn["ti"],0 if turn["author"].lower()=="wizard" else 1])
            texts.append(turn["text"])
            turns.append(tokenize.sent_tokenize(turn["text"]))
        vecs.append(vec)
        chats_txt.append(turns)

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index

    data = np.zeros((l, MAX_TURNS,MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    for m, turns in enumerate(chats_txt):
        # Cap at MAX_TURNS so long chats cannot index past the tensor.
        for i, sentences in enumerate(turns[:MAX_TURNS]):
            for j, sent in enumerate(sentences[:MAX_SENTS]):
                k = 0
                for word in text_to_word_sequence(sent):
                    if k >= MAX_SENT_LENGTH:
                        break
                    index = word_index.get(word)
                    # Skip words unknown to the tokenizer or outside the vocabulary cap.
                    if index is not None and index < MAX_NB_WORDS:
                        data[m, i, j, k] = index
                        k += 1
    return data,vecs



In [6]:
def create_texts_labels(chats,flag=False,length=0):
    """Collect one newline-joined text and one label per chat.

    Args:
        chats: mapping of chat id to a dict with a ``"turns"`` list (items
            have a ``"text"`` entry) and a ``"label"`` entry.
        flag: when true, only the first ``length`` turns contribute to the text.
        length: number of leading turns to keep when ``flag`` is true.

    Returns:
        tuple: ``(texts, labels)``, two lists in the iteration order of ``chats``.
    """
    texts=[]
    labels=[]
    for chat in chats.values():
        chat_turns = chat["turns"][:length] if flag else chat["turns"]
        texts.append("\n".join(x["text"] for x in chat_turns))
        # The sentence tokenisation that used to happen here was discarded,
        # so it has been dropped.
        labels.append(chat["label"])
    return texts,labels


In [7]:



def prepare_datasets(chats,flag=False, length =0):
    """Build shuffled train/validation/test splits of the sentence-level tensors.

    Args:
        chats: chat dictionary as produced by ``gen_chat_data``.
        flag: when true, keep only chats with more than ``length`` turns and
            use only their first ``length`` turns.
        length: turn cut-off used when ``flag`` is true.

    Returns:
        tuple: ``(word_index, x_train, y_train, x_val, y_val, x_test, y_test)``.
        ``word_index`` comes from a Tokenizer fitted on the chat texts, the
        ``x_*`` arrays are slices of the ``create_data`` output and the
        ``y_*`` arrays are two-column one-hot labels.
    """
    if flag:
        chats = {k: v for k, v in chats.items() if len(v["turns"])>length}

    texts,labels = create_texts_labels(chats,flag,length)

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    data = create_data(chats,len(texts),flag,length)
    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    # Force two columns so a subset containing a single class still matches
    # the two-unit softmax heads.
    labels = to_categorical(np.asarray(labels), 2)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    indices = np.arange(len(data))
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    nb_test_samples = int(TEST_SPLIT * data.shape[0])

    # Use explicit non-negative boundaries: with negative indices a count of
    # 0 turns ``data[-0:]`` into "everything" and ``data[:-0]`` into "nothing".
    val_start = data.shape[0] - nb_validation_samples - nb_test_samples
    test_start = data.shape[0] - nb_test_samples

    x_train = data[:val_start]
    y_train = labels[:val_start]
    x_val = data[val_start:test_start]
    y_val = labels[val_start:test_start]
    x_test = data[test_start:]
    y_test = labels[test_start:]

    return word_index,x_train, y_train,x_val, y_val,x_test,y_test


In [8]:

def prepare_datasets_turns(chats,flag=False, length =0):
    """Build shuffled train/validation/test splits of the turn-level tensors.

    Args:
        chats: chat dictionary as produced by ``gen_chat_data``.
        flag: when true, keep only chats with more than ``length`` turns and
            use only their first ``length`` turns.
        length: turn cut-off used when ``flag`` is true.

    Returns:
        tuple: ``(word_index, x_train, y_train, x_val, y_val, x_test, y_test)``.
        ``word_index`` comes from a Tokenizer fitted on the chat texts, the
        ``x_*`` arrays are slices of the data tensor returned by
        ``create_data_with_turns`` and the ``y_*`` arrays are two-column
        one-hot labels.
    """
    if flag:
        chats = {k: v for k, v in chats.items() if len(v["turns"])>length}

    texts,labels = create_texts_labels(chats,flag,length)

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    data,_ = create_data_with_turns(chats,len(texts),flag,length)
    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    # Force two columns so a subset containing a single class still matches
    # the two-unit softmax heads.
    labels = to_categorical(np.asarray(labels), 2)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    indices = np.arange(len(data))
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    nb_test_samples = int(TEST_SPLIT * data.shape[0])

    # Use explicit non-negative boundaries: with negative indices a count of
    # 0 turns ``data[-0:]`` into "everything" and ``data[:-0]`` into "nothing".
    val_start = data.shape[0] - nb_validation_samples - nb_test_samples
    test_start = data.shape[0] - nb_test_samples

    x_train = data[:val_start]
    y_train = labels[:val_start]
    x_val = data[val_start:test_start]
    y_val = labels[val_start:test_start]
    x_test = data[test_start:]
    y_test = labels[test_start:]

    return word_index,x_train, y_train,x_val, y_val,x_test,y_test

In [9]:
def prepare_datasets_turns_props(chats,flag=False, length =0):
    """Build shuffled splits of the turn-level tensors plus per-turn features.

    Args:
        chats: chat dictionary as produced by ``gen_chat_data``.
        flag: when true, keep only chats with more than ``length`` turns and
            use only their first ``length`` turns.
        length: turn cut-off used when ``flag`` is true.

    Returns:
        tuple: ``(word_index, x_train, y_train, x_val, y_val, x_test, y_test)``.
        Each ``x_*`` is a ``(data, aux_data)`` pair: ``data`` slices the
        tensor from ``create_data_with_turns`` and ``aux_data`` is an int32
        array of shape ``(n, MAX_TURNS, 3)`` filled from the per-turn
        feature lists. The ``y_*`` arrays are two-column one-hot labels.
    """
    if flag:
        chats = {k: v for k, v in chats.items() if len(v["turns"])>length}

    texts,labels = create_texts_labels(chats,flag,length)

    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    data,vecs = create_data_with_turns(chats,len(texts),flag,length)

    aux_data = np.zeros((len(texts), MAX_TURNS,3), dtype='int32')
    # Never write past the MAX_TURNS slots and, when truncating chats, do not
    # expose features of turns beyond the cut-off.
    max_turns = min(MAX_TURNS, length) if flag else MAX_TURNS
    for i,vec in enumerate(vecs):
        for j,v in enumerate(vec[:max_turns]):
            # NOTE(review): the int32 array truncates fractional "ti" values,
            # if the timestamps are floats -- confirm this is intended.
            aux_data[i,j,:]= np.array(v)

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    # Force two columns so a subset containing a single class still matches
    # the two-unit softmax heads.
    labels = to_categorical(np.asarray(labels), 2)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    indices = np.arange(len(data))
    np.random.shuffle(indices)
    data = data[indices]
    # The auxiliary features must follow the same permutation, otherwise
    # they describe different chats than ``data`` and ``labels``.
    aux_data = aux_data[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    nb_test_samples = int(TEST_SPLIT * data.shape[0])

    # Use explicit non-negative boundaries: with negative indices a count of
    # 0 turns ``data[-0:]`` into "everything" and ``data[:-0]`` into "nothing".
    val_start = data.shape[0] - nb_validation_samples - nb_test_samples
    test_start = data.shape[0] - nb_test_samples

    x_train = (data[:val_start],aux_data[:val_start])
    y_train = labels[:val_start]
    x_val = (data[val_start:test_start],aux_data[val_start:test_start])
    y_val = labels[val_start:test_start]
    x_test = (data[test_start:],aux_data[test_start:])
    y_test = labels[test_start:]

    return word_index,x_train, y_train,x_val, y_val,x_test,y_test

In [10]:
def create_embedding_index():
    """Read ``glove.6B.100d.txt`` from ``GLOVE_DIR`` into a word -> vector dict.

    Each line is split on whitespace; the first field is the word and the
    remaining fields are parsed as float32 coefficients.

    Returns:
        dict: maps each word to a 1-D float32 numpy array.
    """
    embeddings_index = {}
    glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
    # Explicit UTF-8 (like the other file reads in this notebook) so the
    # result does not depend on the platform's default encoding; the context
    # manager closes the file even if a line fails to parse.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return embeddings_index

In [11]:



def create_embedding_matrix(word_index):
    """Build the initial weight matrix for the Embedding layers.

    Args:
        word_index: mapping of word to integer index (as produced by a Keras
            Tokenizer).

    Returns:
        numpy.ndarray: array of shape ``(len(word_index) + 1, EMBEDDING_DIM)``.
        Row ``i`` holds the GloVe vector of the word whose index is ``i``.
        Rows without a GloVe vector (including row 0, which no word maps to)
        keep their uniform random initialisation from ``np.random.random``.
    """
    embeddings_index = create_embedding_index()
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words missing from the GloVe index keep their random row
            # (they are NOT zeroed).
            embedding_matrix[i] = embedding_vector
    return embedding_matrix



In [12]:
def hierarcical_lstm_network(word_index):
    """Build and compile the two-level (words -> sentences) BiLSTM classifier.

    A bidirectional LSTM encodes each sentence of ``MAX_SENT_LENGTH`` word
    indices; that encoder is applied to each of the ``MAX_SENTS`` sentences
    and a second bidirectional LSTM feeds a two-way softmax.

    Args:
        word_index: mapping of word to integer index; determines the size of
            the embedding table.

    Returns:
        The compiled Keras ``Model`` taking input of shape
        ``(MAX_SENTS, MAX_SENT_LENGTH)``.
    """
    pretrained = create_embedding_matrix(word_index)
    vocab_rows = len(word_index) + 1
    word_embedding = Embedding(vocab_rows,
                               EMBEDDING_DIM,
                               weights=[pretrained],
                               input_length=MAX_SENT_LENGTH,
                               trainable=True)

    # Sentence level: word indices -> one vector per sentence.
    words_in = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_words = word_embedding(words_in)
    sentence_vector = Bidirectional(LSTM(100))(embedded_words)
    sentEncoder = Model(words_in, sentence_vector)

    # Chat level: run the sentence encoder over every sentence slot.
    chat_in = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    encoded_sentences = TimeDistributed(sentEncoder)(chat_in)
    encoded_shape = encoded_sentences._keras_shape
    print("review_encoder %s" % str(encoded_shape))
    chat_vector = Bidirectional(LSTM(100))(encoded_sentences)

    class_probs = Dense(2, activation='softmax')(chat_vector)
    model = Model(chat_in, class_probs)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model


In [13]:
def hierarcical_lstm_network_with_turns(word_index):
    """Build and compile the three-level (words -> sentences -> turns) BiLSTM classifier.

    Args:
        word_index: mapping of word to integer index; determines the size of
            the embedding table.

    Returns:
        The compiled Keras ``Model`` taking input of shape
        ``(MAX_TURNS, MAX_SENTS, MAX_SENT_LENGTH)`` and producing a two-way
        softmax.
    """
    embedding_matrix = create_embedding_matrix(word_index)
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SENT_LENGTH,
                                trainable=True)

    # Sentence level: word indices -> one vector per sentence.
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)
    l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
    sentEncoder = Model(sentence_input, l_lstm)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that only added a stray "None" line).
    sentEncoder.summary()

    # Turn level: sentence vectors -> one vector per turn.
    turns_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    turns_encoder = TimeDistributed(sentEncoder)(turns_input)
    l_lstm_sent = Bidirectional(LSTM(100))(turns_encoder)
    turnsEncoder = Model(turns_input, l_lstm_sent)
    turnsEncoder.summary()

    # Chat level: turn vectors -> class probabilities.
    chats_input = Input(shape=(MAX_TURNS,MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    chats_encoder_layer = TimeDistributed(turnsEncoder)
    chats_encoder = chats_encoder_layer(chats_input)
    l_lstm_turns = Bidirectional(LSTM(100))(chats_encoder)

    preds = Dense(2, activation='softmax')(l_lstm_turns)
    model = Model(chats_input, preds)
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    return model


In [14]:
def hierarcical_lstm_network_with_turns_props(word_index):
    """Build and compile the three-level BiLSTM classifier with per-turn features.

    The turn vectors are concatenated with an auxiliary input of shape
    ``(MAX_TURNS, 3)`` before the chat-level bidirectional LSTM.

    Args:
        word_index: mapping of word to integer index; determines the size of
            the embedding table.

    Returns:
        The compiled Keras ``Model`` with two inputs, in this order: the word
        index tensor of shape ``(MAX_TURNS, MAX_SENTS, MAX_SENT_LENGTH)`` and
        the auxiliary tensor named ``aux_input``. The output is a two-way
        softmax.
    """
    embedding_matrix = create_embedding_matrix(word_index)
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SENT_LENGTH,
                                trainable=True)

    # Sentence level: word indices -> one vector per sentence.
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)
    l_lstm = Bidirectional(LSTM(25))(embedded_sequences)
    sentEncoder = Model(sentence_input, l_lstm)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that only added a stray "None" line).
    sentEncoder.summary()

    # Turn level: sentence vectors -> one vector per turn.
    turns_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    turns_encoder = TimeDistributed(sentEncoder)(turns_input)

    auxiliary_input = Input(shape=(MAX_TURNS,3,), name='aux_input')
    l_lstm_sent = Bidirectional(LSTM(25))(turns_encoder)
    turnsEncoder = Model(turns_input, l_lstm_sent)
    turnsEncoder.summary()

    # Chat level: turn vectors joined with the per-turn features.
    chats_input = Input(shape=(MAX_TURNS,MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    chats_encoder_layer = TimeDistributed(turnsEncoder)
    chats_encoder = chats_encoder_layer(chats_input)

    l_lstm_turns = Bidirectional(LSTM(25))(concatenate([chats_encoder, auxiliary_input]))

    preds = Dense(2, activation='softmax')(l_lstm_turns)
    model = Model([chats_input,auxiliary_input], preds)
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    return model

In [15]:
def hierarcical_attention_network(word_index):
    """Build and compile the two-level GRU classifier with attention pooling.

    A bidirectional GRU plus attention encodes each sentence; the same
    pattern is applied over the ``MAX_SENTS`` sentence vectors before a
    two-way softmax.

    Args:
        word_index: mapping of word to integer index; determines the size of
            the embedding table.

    Returns:
        The compiled Keras ``Model`` taking input of shape
        ``(MAX_SENTS, MAX_SENT_LENGTH)``.
    """
    embedding_matrix = create_embedding_matrix(word_index)
    embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)


    class AttLayer(Layer):
        """Attention pooling: collapses axis 1 of a 3-D input into a weighted sum.

        The weights are ``exp(tanh(x . W))`` normalised over axis 1, where
        ``W`` is a learned vector with one entry per input feature. The
        ``mask`` argument is accepted but not used. ``call`` relies on
        Theano's ``dimshuffle``, so it works with the Theano backend only.
        """

        def __init__(self, **kwargs):
            self.init = initializers.get('normal')
            super(AttLayer, self).__init__(**kwargs)

        def build(self, input_shape):
            assert len(input_shape)==3
            # Register W through add_weight so it is a real backend variable
            # tracked as a trainable weight. Calling the initializer directly
            # only yields a tensor of random values, which cannot be updated
            # during training.
            self.W = self.add_weight(name='W',
                                     shape=(input_shape[-1],),
                                     initializer=self.init,
                                     trainable=True)
            super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

        def call(self, x, mask=None):
            eij = K.tanh(K.dot(x, self.W))

            ai = K.exp(eij)
            weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

            weighted_input = x*weights.dimshuffle(0,1,'x')
            return weighted_input.sum(axis=1)

        def compute_output_shape(self, input_shape):
            # Keras 2 shape-inference hook; without it the layer would be
            # assumed to return the (3-D) input shape.
            return (input_shape[0], input_shape[-1])

        # Keep the Keras 1 hook name as an alias.
        get_output_shape_for = compute_output_shape

    # Sentence level: word indices -> attention-pooled sentence vector.
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)

    l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)

    l_dense = TimeDistributed(Dense(200))(l_lstm)
    l_att = AttLayer()(l_dense)
    sentEncoder = Model(sentence_input, l_att)

    # Chat level: sentence vectors -> attention-pooled chat vector.
    review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
    review_encoder = TimeDistributed(sentEncoder)(review_input)

    # NOTE(review): the GRU width here is MAX_SENTS (the sentence count),
    # unlike the 100 units used at the sentence level -- confirm this is
    # intended.
    l_lstm_sent = Bidirectional(GRU(MAX_SENTS, return_sequences=True))(review_encoder)

    l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)

    l_att_sent = AttLayer()(l_dense_sent)

    preds = Dense(2, activation='softmax')(l_att_sent)
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    return model

In [16]:

def evaluate_model(model,x_train, y_train,x_val, y_val,x_test,y_test):
    """Train ``model`` and return its evaluation on the test split.

    Prints ``model.metrics_names``, fits for 10 epochs with batch size 50
    using ``(x_val, y_val)`` as validation data, then evaluates on
    ``(x_test, y_test)`` with batch size 5.

    Returns:
        Whatever ``model.evaluate`` returns.
    """
    print(model.metrics_names)

    fit_options = {
        "validation_data": (x_val, y_val),
        "nb_epoch": 10,
        "batch_size": 50,
    }
    model.fit(x_train, y_train, **fit_options)

    eval_options = {"batch_size": 5, "verbose": 1, "sample_weight": None}
    scores = model.evaluate(x_test, y_test, **eval_options)
    return scores



In [17]:

def evaluate_model_multi(model,x_train, y_train,x_val, y_val,x_test,y_test):
    """Train a two-input ``model`` and return its evaluation on the test split.

    Each ``x_*`` argument is indexed at positions 0 and 1 and the two items
    are passed to the model as a list. Prints ``model.metrics_names``, fits
    for 10 epochs with batch size 50, then evaluates with batch size 5.

    Returns:
        Whatever ``model.evaluate`` returns.
    """
    print(model.metrics_names)

    train_inputs = [x_train[0], x_train[1]]
    val_inputs = [x_val[0], x_val[1]]
    fit_options = {
        "validation_data": (val_inputs, y_val),
        "nb_epoch": 10,
        "batch_size": 50,
    }
    model.fit(train_inputs, y_train, **fit_options)

    test_inputs = [x_test[0], x_test[1]]
    eval_options = {"batch_size": 5, "verbose": 1, "sample_weight": None}
    scores = model.evaluate(test_inputs, y_test, **eval_options)
    return scores



In [18]:
  
def run_net(net,file,word_index,x_train, y_train,x_val, y_val,x_test,y_test):
    """Build a network, train and evaluate it, and log the outcome.

    ``net`` is called with ``word_index`` to obtain the model. After
    ``evaluate_model`` returns, two lines are written to ``file``: the string
    form of ``net`` and the string form of the evaluation result.
    """
    scores = evaluate_model(net(word_index), x_train, y_train, x_val, y_val, x_test, y_test)
    for entry in (net, scores):
        file.write(str(entry)+"\n")

    
  

    
def run_networks(network_funcs, configurations,file):
    """Run every network on the full chats and on each truncated configuration.

    Results are written to the path ``file`` (opened for writing). The first
    pass uses ``prepare_datasets`` on all chats; then, for each value in
    ``configurations``, the datasets are rebuilt with that turn cut-off, a
    ``config <n>`` line is written and every network is run again.
    """
    with open(file,"w") as results_file:
        chats = gen_chat_data()

        # Baseline: whole chats.
        full_dataset = prepare_datasets(chats)
        for net in network_funcs:
            run_net(net, results_file, *full_dataset)

        # One pass per turn cut-off.
        for conf in configurations:
            truncated_dataset = prepare_datasets(chats,True,conf)
            results_file.write("config %d\n"%conf)
            for net in network_funcs:
                run_net(net, results_file, *truncated_dataset)
            
               
          



                

In [19]:
def run_net_multi(net,file,word_index,x_train, y_train,x_val, y_val,x_test,y_test):
    """Build a two-input network, train and evaluate it, and log the outcome.

    ``net`` is called with ``word_index`` to obtain the model. After
    ``evaluate_model_multi`` returns, two lines are written to ``file``: the
    string form of ``net`` and the string form of the evaluation result.
    """
    scores = evaluate_model_multi(net(word_index), x_train, y_train, x_val, y_val, x_test, y_test)
    for entry in (net, scores):
        file.write(str(entry)+"\n")
    

In [20]:
def run_networks_turns(network_funcs, configurations,file):
    """Run every turn-level network on the full chats and on each truncated configuration.

    Results are written to the path ``file`` (opened for writing). The first
    pass uses ``prepare_datasets_turns`` on all chats; then, for each value
    in ``configurations``, the datasets are rebuilt with that turn cut-off, a
    ``config <n>`` line is written and every network is run again.
    """
    with open(file,"w") as results_file:
        chats = gen_chat_data()

        # Baseline: whole chats.
        full_dataset = prepare_datasets_turns(chats)
        for net in network_funcs:
            run_net(net, results_file, *full_dataset)

        # One pass per turn cut-off.
        for conf in configurations:
            truncated_dataset = prepare_datasets_turns(chats,True,conf)
            results_file.write("config %d\n"%conf)
            for net in network_funcs:
                run_net(net, results_file, *truncated_dataset)
            
               
          


In [21]:
def run_networks_turns_props(network_funcs, configurations,file):
    """Run every two-input network on the full chats and on each truncated configuration.

    Results are written to the path ``file`` (opened for writing). The first
    pass uses ``prepare_datasets_turns_props`` on all chats; then, for each
    value in ``configurations``, the datasets are rebuilt with that turn
    cut-off, a ``config <n>`` line is written and every network is run again.

    Args:
        network_funcs: callables taking a word index and returning a compiled
            model with two inputs.
        configurations: iterable of integer turn cut-offs.
        file: path of the results file.
    """
    with open(file,"w") as results_file:
        chats = gen_chat_data()
        word_index,x_train, y_train,x_val, y_val,x_test,y_test = prepare_datasets_turns_props(chats)
        for net in network_funcs:
            run_net_multi(net,results_file,word_index,x_train, y_train,x_val, y_val,x_test,y_test)

        for conf in configurations:
            # Must be the *_props variant: run_net_multi expects every x_* to
            # be a (data, aux_data) pair, which prepare_datasets_turns does
            # not produce.
            word_index,x_train, y_train,x_val, y_val,x_test,y_test = prepare_datasets_turns_props(chats,True,conf)
            results_file.write("config %d\n"%conf)
            for net in network_funcs:
                run_net_multi(net,results_file,word_index,x_train, y_train,x_val, y_val,x_test,y_test)
            
               
          


In [22]:

# Train and evaluate the turns+props network on the first 4 turns of every
# chat that has more than 4 turns, logging the result to rtp_4.txt.
chats = gen_chat_data()
# The *_props variant is required here: run_net_multi indexes each x_* as a
# (data, aux_data) pair.
word_index,x_train, y_train,x_val, y_val,x_test,y_test = prepare_datasets_turns_props(chats,True,4)

# The context manager flushes and closes the results file even if training
# is interrupted.
with open("rtp_4.txt","w") as results_file:
    run_net_multi(hierarcical_lstm_network_with_turns_props,results_file,word_index,x_train, y_train,x_val, y_val,x_test,y_test)





Total 6953 unique tokens.
Shape of data tensor: (1368, 60, 20, 40)
Shape of label tensor: (1368, 2)
Total 400000 word vectors.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 40)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 40, 100)           695400    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50)                25200     
Total params: 720,600
Trainable params: 720,600
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 20, 40)            0         
_________________________________________________________________
time

  """


Train on 959 samples, validate on 136 samples
Epoch 1/10


KeyboardInterrupt: 