In [1]:
import os
import json

def build_label_dict():
    d = {}
    label_path = r"./Frames-dataset/labels.txt"
    label_file = open(label_path, encoding='utf-8')
    for line in label_file:
        k, v = line.split(",")
        d[k] = True if v.strip()=="True" else False
    return d


In [2]:

def gen_chat_data():
    chat_path = r"./Frames-dataset/chats"
    chats = {}
    d = build_label_dict()
    for filename in os.listdir(chat_path):
        chat_file = open(os.path.join(chat_path, filename), encoding='utf-8')
        chat = json.load(chat_file)
        
        turns = []
        if 'turns' in chat:
            tsp = chat['turns'][0]['timestamp']
       
            for turn in chat['turns']:
               
                ts = turn['timestamp'] - tsp
                tsp = turn['timestamp']
                turns.append({"ti":ts,"text":turn["text"],"author":turn["author"]})

        chats[filename[:-5]] = {}
        chats[filename[:-5]]["turns"] = turns
        chats[filename[:-5]]["label"] = d[filename[:-5]]
    return chats


In [3]:
import numpy as np
# import pandas as pd
import _pickle as cPickle
from collections import defaultdict
import re

from numpy.linalg import norm

import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
import theano

import functools
import nltk
nltk.download('punkt')
from nltk import tokenize

MAX_SENT_LENGTH = 40
MAX_SENTS = 20
MAX_NB_WORDS = 20000
MAX_TURNS = 60
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.2
RNN_DIM = 50
GLOVE_DIR = "./data/glove"

Using Theano backend.


[nltk_data] Downloading package punkt to /home/efrat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def create_texts_labels(chats,flag=False,length=0):
    texts=[]
    labels=[]
    for idx in chats.keys():
        if flag:
        
            text = "\n".join([x["text"] for x in chats[idx]["turns"][:length]])
        else:
            text = "\n".join([x["text"] for x in chats[idx]["turns"]])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        labels.append(chats[idx]["label"])   
    return texts,labels

In [5]:
def create_data_with_turns(chats, flag=False,length=0):
    texts = []
    labels=[]
    chats_txt=[]
    vecs=[]
    for idx in chats.keys():
        if flag:
            text = "\n".join([x["text"] for x in chats[idx]["turns"][:length]])
            
        else:
            text = "\n".join([x["text"] for x in chats[idx]["turns"]])
        turns=[]
        vec=[]
        for turn in chats[idx]["turns"]:
            vec.append([len(turn["text"]),turn["ti"],0 if turn["author"].lower()=="wizard" else 1])
            texts.append(turn["text"])
            sentences = tokenize.sent_tokenize(turn["text"])
            turns.append(sentences)
        vecs.append(vec)
        chats_txt.append(turns)
        labels.append(chats[idx]["label"])
        

    return vecs


In [6]:
def prepare_datasets(chats,flag=False, length =0):
    if flag:
        chats = {k: v for k, v in chats.items() if len(v["turns"])>length}
    
    texts,labels = create_texts_labels(chats,flag,length)
    vecs = create_data_with_turns(chats, flag,length)

    aux_data = np.zeros((len(texts), MAX_TURNS,3), dtype='float32')
    for i,vec in enumerate(vecs):
        if flag:
            for j,v in enumerate(vec[:length]):
                aux_data[i,j,:]= np.array(v)
        else:
            for j,v in enumerate(vec):
                aux_data[i,j,:]= np.array(v)
    
    
    
    norm_hlp = norm(aux_data,axis=2,ord=2)
    aux_data = aux_data.astype(np.float)/norm_hlp[:,:,None]

    labels = to_categorical(np.asarray(labels))

    print('Shape of label tensor:', labels.shape)
    print('Shape of aux tensor:', aux_data.shape)
#     data = data[:13]
#     labels = labels[:13]
    indices = np.arange(len(aux_data))
    np.random.shuffle(indices)
    aux_data = aux_data[indices]

    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * aux_data.shape[0])
    nb_test_samples = int(TEST_SPLIT * aux_data.shape[0])

    x_train = aux_data[:-(nb_validation_samples+nb_test_samples)]
    y_train = labels[:-(nb_validation_samples+nb_test_samples)]
    x_val = aux_data[-(nb_validation_samples+nb_test_samples):-nb_test_samples]
    y_val = labels[-(nb_validation_samples+nb_test_samples):-nb_test_samples]
    x_test = aux_data[-nb_test_samples:]
    y_test = labels[-nb_test_samples:]
    
    return x_train, y_train,x_val, y_val,x_test,y_test

In [16]:
def evaluate_model(model,x_train, y_train,x_val, y_val,x_test,y_test):
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
    nb_epoch=10, batch_size=10)

    return model.evaluate(x_test,y_test,batch_size=10, verbose=1, sample_weight=None)



In [17]:
def rnn_network():

    review_input = Input(shape=(MAX_TURNS,3), dtype='float32')
    l_lstm = Bidirectional(LSTM(10))(review_input)


    preds = Dense(2, activation='softmax')(l_lstm)
    model = Model(review_input, preds)

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    return model

In [18]:
  def run_net(net,file,x_train, y_train,x_val, y_val,x_test,y_test):
    model = net()
    res = evaluate_model(model,x_train, y_train,x_val, y_val,x_test,y_test)
    file.write(str(net)+"\n")
    file.write(str(res)+"\n")


In [None]:
chats = gen_chat_data()
with open("rnnresults","w") as results_file:
    x_train, y_train,x_val, y_val,x_test,y_test = prepare_datasets(chats)
    run_net(rnn_network,results_file,x_train, y_train,x_val, y_val,x_test,y_test)
        

Shape of label tensor: (1368, 2)
Shape of aux tensor: (1368, 60, 3)
['loss', 'acc']


  after removing the cwd from sys.path.


Train on 959 samples, validate on 136 samples
Epoch 1/10
Epoch 2/10

  % delta_t_median)
  % delta_t_median)
  % delta_t_median)
  % delta_t_median)




  % delta_t_median)


Epoch 3/10
Epoch 4/10
120/959 [==>...........................] - ETA: 1235s - loss: nan - acc: 0.4167ETA: 230s - loss: nan - acc: 0.40 - ETA: 267s - loss: nan - acc: 0.43 - ETA: 460s - loss: nan - acc: 

  % delta_t_median)


Epoch 5/10
 50/959 [>.............................] - ETA: 1574s - loss: nan - acc: 0.3800