In [1]:
import os
import json

def build_label_dict(label_path=r"./Frames-dataset/labels.txt"):
    """Load the per-chat boolean labels from a comma-separated text file.

    Every non-blank line is split on "," into a key and a flag. The flag maps
    to ``True`` only when, after stripping surrounding whitespace, it equals
    the literal string ``"True"``; any other value maps to ``False``.

    Parameters
    ----------
    label_path : str, optional
        Path of the labels file, read as UTF-8. Defaults to the dataset
        location used by this notebook.

    Returns
    -------
    dict
        Mapping from the text before the comma to a ``bool``.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly one comma.
    """
    labels = {}
    # The context manager closes the file even if a malformed line raises.
    with open(label_path, encoding='utf-8') as label_file:
        for line in label_file:
            # Blank lines (e.g. a trailing newline at end of file) carry no label.
            if not line.strip():
                continue
            key, value = line.split(",")
            labels[key] = value.strip() == "True"
    return labels




In [2]:
# Returns a dictionary keyed by chat id. Each chat holds its label and its list of turns;
# every turn records the text, the author (speaker id) and the time elapsed since the previous turn.

def gen_chat_data(chat_path=r"./Frames-dataset/chats", labels=None):
    """Load every chat file in a directory and attach its label.

    Parameters
    ----------
    chat_path : str, optional
        Directory whose files are each parsed as one UTF-8 JSON chat.
    labels : dict, optional
        Mapping from chat id to label. When omitted it is obtained from
        ``build_label_dict()``.

    Returns
    -------
    dict
        ``{chat_id: {"turns": [...], "label": ...}}``. The chat id is the file
        name minus its last five characters (presumably the ".json"
        extension -- confirm against the dataset). Each turn is
        ``{"ti": ..., "text": ..., "author": ...}`` where ``ti`` is the
        difference between this turn's timestamp and the previous turn's
        (0 for the first turn). Chats without a ``"turns"`` key, or with an
        empty turn list, get ``"turns": []``.

    Raises
    ------
    KeyError
        If a chat id has no entry in ``labels``, or a turn lacks one of the
        ``timestamp`` / ``text`` / ``author`` keys.
    """
    if labels is None:
        labels = build_label_dict()

    chats = {}
    # os.listdir order is arbitrary; sorting makes the resulting dict order
    # (and everything derived from it) the same on every run.
    for filename in sorted(os.listdir(chat_path)):
        with open(os.path.join(chat_path, filename), encoding='utf-8') as chat_file:
            chat = json.load(chat_file)

        turns = []
        # Guard against both a missing key and an empty list: indexing the
        # first turn of an empty list would raise IndexError.
        if 'turns' in chat and chat['turns']:
            previous_ts = chat['turns'][0]['timestamp']
            for turn in chat['turns']:
                turns.append({"ti": turn['timestamp'] - previous_ts,
                              "text": turn["text"],
                              "author": turn["author"]})
                previous_ts = turn['timestamp']

        chat_id = filename[:-5]
        chats[chat_id] = {"turns": turns, "label": labels[chat_id]}
    return chats


In [3]:
import numpy as np
# import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re



import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations

import functools

# Notebook-wide configuration read by the data-preparation and model cells.

# Number of word ids kept per sentence (last axis of the `data` tensor and
# the input length of the embedding layer).
MAX_SENT_LENGTH = 100
# Number of sentences kept per chat (middle axis of the `data` tensor).
MAX_SENTS = 50
# Passed to the Tokenizer as `nb_words`; word ids at or above this value are
# not written into the `data` tensor.
MAX_NB_WORDS = 20000
# Width of each word vector; matches the 100-dimensional GloVe file loaded
# later in the notebook.
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

Using Theano backend.


In [4]:
# Sentence splitting is done with NLTK; the 'punkt' package is downloaded
# first (presumably required by sent_tokenize -- it is a no-op when already
# up to date).
import nltk
from nltk import tokenize

nltk.download('punkt')

chats = gen_chat_data()

# Three parallel lists with one entry per chat:
#   texts  -- the chat's turn texts joined by newlines
#   turns  -- that text split into sentences
#   labels -- the chat's label
texts, turns, labels = [], [], []

for chat in chats.values():
    text = "\n".join(turn["text"] for turn in chat["turns"])
    texts.append(text)
    turns.append(tokenize.sent_tokenize(text))
    labels.append(chat["label"])

# Fit the vocabulary on the full chat texts.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# Integer tensor indexed as [chat, sentence, word position]; positions that are
# never written stay 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(turns):
    # Sentences beyond the first MAX_SENTS are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0  # next free word slot in this sentence
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break  # sentence row is full; the remaining words are dropped
            # .get() instead of [] so that a token missing from the fitted
            # vocabulary is skipped rather than aborting the cell with KeyError.
            word_id = tokenizer.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1


[nltk_data] Downloading package punkt to /home/efrat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

# NOTE: `labels` and `data` are rebound in place below, so this cell is not
# idempotent -- re-run the data-preparation cell before running it again.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation.
# NOTE: no random seed is set, so the split differs between runs.
indices = np.arange(len(data))
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on an explicit boundary index. Slicing with a negative count breaks
# when the count is 0: `data[:-0]` is empty and `data[-0:]` is the whole array.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

# Column sums of the one-hot labels give the per-class sample counts.
print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

# Load the pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# Each line of the file is a word followed by its whitespace-separated
# coefficients.
GLOVE_DIR = "./data/glove"
embeddings_index = {}
# Explicit UTF-8 keeps the read independent of the platform's default
# encoding; the context manager closes the file even if parsing fails.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Total %s word vectors.' % len(embeddings_index))

Total 6954 unique tokens.
Shape of data tensor: (1369, 50, 100)
Shape of label tensor: (1369, 2)
Number of positive and negative reviews in traing and validation set
[ 422.  674.]
[ 110.  163.]
Total 400000 word vectors.


In [6]:
# Initial weights for the embedding layer: one row per word id plus one extra
# row (presumably for index 0, which word_index does not use -- confirm).
# Every row starts as uniform random values in [0, 1).
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row keeps the random
        # initialisation (it is NOT zeroed).
        continue
    embedding_matrix[i] = embedding_vector

# Word-id -> vector lookup over one sentence; the weights stay trainable.
embedding_layer = Embedding(vocab_rows,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)


In [None]:

# Sentence encoder for the hierarchical LSTM model: maps one sentence of
# MAX_SENT_LENGTH word ids to a single vector.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
# Look up a word vector for every word id via the shared embedding layer.
embedded_sequences = embedding_layer(sentence_input)

# Bidirectional LSTM with 100 units per direction; without return_sequences
# it emits one vector per sentence.
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
# Wrapped as a Model so the next cell can apply it to every sentence of a chat.
sentEncoder = Model(sentence_input, l_lstm)


In [None]:
              
                    





# Chat-level input: MAX_SENTS sentences of MAX_SENT_LENGTH word ids each.
review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')

# Apply the sentence encoder to every sentence, then summarise the resulting
# sequence of sentence vectors with a second bidirectional LSTM.
review_encoder = TimeDistributed(sentEncoder)(review_input)
print("review_encoder %s"%str(review_encoder._keras_shape))
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)

# Two-way softmax matching the two-column one-hot labels.
preds = Dense(2, activation='softmax')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

print("model fitting - Hierachical LSTM")
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),nb_epoch=1, batch_size=50)




review_encoder (None, 50, 200)
model fitting - Hierachical LSTM
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 50, 100)       0                                            
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 50, 200)       856300      input_2[0][0]                    
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 200)           240800      timedistributed_1[0][0]          
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 2)             402         bidirectional_2[0][0]            
Total params: 1,097,502
Tra

In [None]:
# Building the Hierarchical Attention Network
# Rebuild `embedding_matrix` and `embedding_layer` from scratch, replacing
# whatever objects were previously bound to these names. Every row starts as
# uniform random values in [0, 1); the extra row beyond len(word_index) is
# presumably for index 0, which word_index does not use -- confirm.
vocab_rows = len(word_index) + 1
embedding_matrix = np.random.random((vocab_rows, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row keeps the random
        # initialisation (it is NOT zeroed).
        continue
    embedding_matrix[i] = embedding_vector

# Word-id -> vector lookup over one sentence; the weights stay trainable.
embedding_layer = Embedding(vocab_rows,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)


class AttLayer(Layer):
    """Attention pooling that collapses axis 1 of a rank-3 input.

    For an input ``x`` of shape ``(d0, d1, d2)`` the layer owns a single
    trainable vector ``W`` of length ``d2`` and computes::

        e   = tanh(dot(x, W))
        a   = exp(e) / sum(exp(e), axis=1)      # normalised over axis 1
        out = sum(x * a[..., None], axis=1)     # shape (d0, d2)

    i.e. a weighted sum of the axis-1 slices of ``x`` whose weights are a
    softmax over axis 1.

    The ``mask`` argument of ``call`` is accepted but not used, so every
    position along axis 1 takes part in the normalisation.
    """

    def __init__(self, **kwargs):
        # 'normal' initialiser for the attention vector created in build().
        self.init = initializations.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention vector; requires a rank-3 input shape."""
        assert len(input_shape)==3
        # One weight per feature on the last axis.
        self.W = self.init((input_shape[-1],))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Score every axis-1 position by projecting its features onto W.
        # NOTE(review): relies on K.dot accepting a vector right operand,
        # as the original code did.
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        # Backend calls replace the Theano-only `.dimshuffle` / `.sum`
        # tensor methods; expand_dims appends the broadcast axis that
        # dimshuffle(..., 'x') used to add.
        weights = ai / K.expand_dims(K.sum(ai, axis=1))

        weighted_input = x * K.expand_dims(weights)
        return K.sum(weighted_input, axis=1)

    def get_output_shape_for(self, input_shape):
        """Output keeps the first and last input dimensions; axis 1 is summed out."""
        return (input_shape[0], input_shape[-1])


In [None]:

# Sentence encoder for the attention network: maps one sentence of
# MAX_SENT_LENGTH word ids to a single attention-pooled vector.
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

# Bidirectional GRU that keeps its per-word outputs (return_sequences=True) so
# the attention layer can weight them. (The variable keeps its historical
# `l_lstm` name although the recurrent layer is a GRU.)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)

# Per-word dense projection, followed by attention pooling over the words.
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)

sentEncoder = Model(sentence_input, l_att)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
sentEncoder.summary()

In [None]:

# Chat-level input: MAX_SENTS sentences of MAX_SENT_LENGTH word ids each.
review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')

# Encode every sentence with the attention sentence encoder.
review_encoder = TimeDistributed(sentEncoder)(review_input)
# Not used in the visible cells; kept in case another cell relies on the name.
from keras.layers import Reshape

# Bidirectional GRU over the sentence vectors, keeping one output per sentence
# for the sentence-level attention applied in the next cell.
# NOTE(review): the GRU width reuses MAX_SENTS (the sentence-count cap) as the
# number of units, which looks coincidental -- confirm, or introduce a
# dedicated constant.
l_lstm_sent = Bidirectional(GRU(MAX_SENTS, return_sequences=True))(review_encoder)

# Intermediate model built only to inspect the layer shapes so far.
m = Model(review_input,l_lstm_sent)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
m.summary()

In [None]:
# Sentence-level attention and classifier head of the attention network.
print("l_lstm_sent %s"%str(l_lstm_sent._keras_shape))
print("l_lstm_sent %d"%K.ndim(l_lstm_sent))

# Per-sentence dense projection, followed by attention pooling over sentences.
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)

# Two-way softmax matching the two-column one-hot labels.
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
print("model fitting - Hierachical attention network")

model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()