In [1]:
import os
import re
import multiprocessing
import random
import math

import numpy as np


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential, load_model
from my_data_generator import DataGenerator

Using TensorFlow backend.


In [2]:
import copy

In [3]:
textfile = './Will.capp'
train_all_data = True
cpus = multiprocessing.cpu_count()
batch_size = 32
output_size = 100
hidden_size = 50
epochs = 15
shuffle = False


In [3]:
model = load_model('../../models/lstm-baseline/Adam_model.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [13]:
def get_seq_prob(word, context, test_generator):
    global model
    sub_seq = copy.deepcopy(context)
    sub_seq.append(word)

    x, y = test_generator.prepare_seq(sub_seq)

    p_pred = model.predict(x)
    
    log_p_seq = 0

    for i, prob in enumerate(p_pred):
        prob_word = prob[y[i]]
        log_p_seq += np.log(prob_word)

    return log_p_seq
    #return np.exp(log_p_seq)

In [16]:
def beam_search_decoder(seq, test_generator): #TO DO
    print('seq ')
    print(seq)
    result = 0
    vocab = list(seq)
    k = 3
    beams = [[list(), vocab, np.log(1.0)]]

    for i in range(len(seq)):
        candidates = []
        for (context, vocab, score) in beams:
            for v in range(len(vocab)):
                score = get_seq_prob(vocab[v], context, test_generator)
                new_vocab = vocab[:v]+vocab[(v+1):]
                new_context = copy.deepcopy(context)
                new_context.append(vocab[v])
                candidates.append([new_context, new_vocab, score])
        ordered = sorted(candidates, key=lambda prob:prob[2])
        print("ordered:")
        for beam in ordered:
            print(beam)
        if k < len(ordered):
            beams = ordered[-k:]
        else:
            beams = ordered
    #print(seq)    
    #print('beams ')
    #print(beams)
    for context, vocab, score in beams:
        
        if context == seq :
            result = 1

    return result

In [22]:
a = [1,2,3,4,5,6,7]





[5, 6, 7]

In [81]:
new_vocab


['v']

In [8]:
def greedy_decoder(seq, test_generator):
    result = 0
    vocab = list(seq)
    context = []

    while vocab != [] :
        #(next_word, max_prob) = max([(v, np.exp(get_seq_prob(v, context, test_generator))) for v in vocab], key=lambda prob:prob[1])
        (next_word, max_prob) = max([(v, get_seq_prob(v, context, test_generator)) for v in vocab], key=lambda prob:prob[1])
        context.append(next_word)
        vocab.remove(next_word)
        print(max_prob)
    
    #print(context)
    if context == seq :
        result = 1

    return result

In [9]:

def get_seq_bylength(seqs) :
    seqs_bylength = dict()
    for seq in seqs :
        seqlen = len(seq)
        if seqlen > 1:
            if seqlen in seqs_bylength:
                seqs_bylength[seqlen].append(seq)
            else :
                seqs_bylength[seqlen] = [seq]
    return seqs_bylength

def get_seq_bylength16(seqs) :
    seqs_bylength = dict()
    for seq in seqs :
        seqlen = len(seq)
        if seqlen > 1 and seqlen < 17:
            if seqlen in seqs_bylength:
                seqs_bylength[seqlen].append(seq)
            else :
                seqs_bylength[seqlen] = [seq]
    return seqs_bylength


# decoder is either 'greedy' or 'beam_search'
def get_performance_bylength(test_generator, decoder) :
    if decoder == 'greedy':
        seqs_bylength = get_seq_bylength(test_generator.seqs)
        results_bylength = dict()
        for length,seqs in seqs_bylength.items():
            results_bylength[length] = [0, len(seqs)]
            print(str(length))
            for seq in seqs:
                results_bylength[length][0] += greedy_decoder(seq, test_generator)
    else:
        seqs_bylength = get_seq_bylength16(test_generator.seqs)
        results_bylength = dict()
        for length,seqs in seqs_bylength.items():
            results_bylength[length] = [0, len(seqs)]
            print(str(length))
            for seq in seqs:
                results_bylength[length][0] += beam_search_decoder(seq, test_generator)

    return results_bylength


In [10]:
def is_test_sent(p):
    return True if random.random() < p else False

In [24]:

train = []
test = []
with open(textfile,'r') as f :
    lines = f.readlines()
for sent in lines :
    if '*CHI:' in sent :
        sent = re.sub('\*[A-Z]+: ', '', sent)
        if train_all_data:
            if is_test_sent(0.4):
                test.append(sent)
            else:
                train.append(sent)
        else:
            test.append(sent)
    else :
        sent = re.sub('\*[A-Z]+: ', '', sent)
        train.append(sent)

with open(textfile.split('.capp')[0]+'.train.txt','w') as f :
    for line in train:
        f.write(line)
with open(textfile.split('.capp')[0]+'.test.txt','w') as f :
    for line in test:
        f.write(line)  
        
        
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train+test)

vocab = tokenizer.word_index
vocab_size = len(vocab)+1

train_seqs = tokenizer.texts_to_sequences(train)
test_seqs = tokenizer.texts_to_sequences(test)
maxlen = max([len(seq) for seq in train_seqs])

steps_per_epoch = math.ceil(len(train_seqs)/ batch_size)

print('vocab_size = '+str(vocab_size))
print('train_maxlen = '+str(maxlen))
print('INITIALIZE DATA GENERATORS...\n')

train_generator = DataGenerator(seqs = train_seqs,
                                       vocab = vocab,
                                       vocab_size = vocab_size,
                                       maxlen = maxlen,
                                       batch_size = batch_size,
                                       shuffle = shuffle)
test_generator = DataGenerator(seqs = test_seqs,
                                   vocab = vocab,
                                   vocab_size = vocab_size,
                                   maxlen = maxlen,
                                   batch_size = batch_size,
                                   shuffle = shuffle)

print('TRAINING MODEL...\n')

model = Sequential()
model.add(Embedding(input_dim = vocab_size,  # vocabulary size
                    output_dim = output_size,  # size of embeddings
                    input_length = maxlen-1))  # length of the padded sequences
model.add(LSTM(hidden_size))
model.add(Dense(vocab_size, activation='softmax'))
model.compile('rmsprop', 'categorical_crossentropy')

# Train network
model.fit_generator(train_generator,
                    steps_per_epoch = steps_per_epoch,
                    epochs = epochs,
                    verbose=2,
                    max_queue_size=10,
                    shuffle=False)

model.save(str('./Will_model.h5'))


with open('./train/Will.train.txt','w') as f :
    for line in train:
        f.write(line)
with open('./test/Will.test.txt','w') as f :
    for line in test:
        f.write(line)   

vocab_size = 338
train_maxlen = 17
INITIALIZE DATA GENERATORS...

TRAINING MODEL...

Epoch 1/15
 - 1s - loss: 5.7572
Epoch 2/15
 - 0s - loss: 5.2766
Epoch 3/15
 - 0s - loss: 5.0832
Epoch 4/15
 - 0s - loss: 5.0409
Epoch 5/15
 - 0s - loss: 4.9802
Epoch 6/15
 - 0s - loss: 4.9832
Epoch 7/15
 - 0s - loss: 4.9662
Epoch 8/15
 - 0s - loss: 4.9396
Epoch 9/15
 - 0s - loss: 4.9385
Epoch 10/15
 - 0s - loss: 4.9084
Epoch 11/15
 - 0s - loss: 4.8986
Epoch 12/15
 - 0s - loss: 4.8492
Epoch 13/15
 - 0s - loss: 4.8284
Epoch 14/15
 - 0s - loss: 4.8071
Epoch 15/15
 - 0s - loss: 4.7582


In [97]:


with open('./train/Will.train.txt','w') as f :
    for line in train:
        f.write(line)
with open('./test/Will.test.txt','w') as f :
    for line in test:
        f.write(line)        

In [101]:
results = get_performance_bylength(test_generator, "greedy")


2
0
-6.204007148742676
0
-6.037370681762695
0
-3.250887393951416
0
-3.250887393951416
0
-3.250887393951416
0
-3.250887393951416
0
-6.251455307006836
0
-4.979897499084473
0
-4.623321533203125
0
-5.956155776977539
0
-6.279054164886475
0
-5.022375583648682
0
-6.027204990386963
0
-6.778756141662598
0
-5.229837417602539
0
-5.229837417602539
0
-5.102345943450928
0
-11.922183990478516
0
-5.3092041015625
0
-6.89111328125
0
-5.0710062980651855
0
-6.737586498260498
0
-6.876916408538818
0
-7.639740467071533
0
-4.879254341125488
0
-5.209024429321289
0
-10.621611595153809
0
-5.595567226409912
0
-4.879254341125488
0
-12.1350736618042
3
0
-5.929113388061523
-18.550618171691895
0
-3.755916118621826
-8.68897008895874
0
-3.5767016410827637
-8.786733627319336
0
-3.3291611671447754
-8.005652904510498
0
-4.890578746795654
-10.711774349212646
0
-3.5907137393951416
-6.787052392959595
0
-5.72865104675293
-17.033093452453613
0
-4.917946815490723
-10.958446502685547
0
-4.487176418304443
-11.420335292816162
0
-6

In [102]:
results

{2: [30, 30], 3: [6, 10], 4: [1, 4], 5: [0, 1]}

In [84]:
results

{2: [30, 30], 3: [6, 10], 4: [1, 4], 5: [0, 1]}

In [17]:
results2 = get_performance_bylength(test_generator, "beam")

2
seq 
[15, 50]
ordered:
[[15], [50], 0]
[[50], [15], 0]
ordered:
[[50, 15], [], -5.027212619781494]
[[15, 50], [], -4.752691268920898]
seq 
[52, 94]
ordered:
[[52], [94], 0]
[[94], [52], 0]
ordered:
[[52, 94], [], -6.691259860992432]
[[94, 52], [], -5.813196659088135]
seq 
[30, 6]
ordered:
[[30], [6], 0]
[[6], [30], 0]
ordered:
[[6, 30], [], -4.975263595581055]
[[30, 6], [], -3.1479618549346924]
seq 
[30, 6]
ordered:
[[30], [6], 0]
[[6], [30], 0]
ordered:
[[6, 30], [], -4.975263595581055]
[[30, 6], [], -3.1479618549346924]
seq 
[30, 6]
ordered:
[[30], [6], 0]
[[6], [30], 0]
ordered:
[[6, 30], [], -4.975263595581055]
[[30, 6], [], -3.1479618549346924]
seq 
[30, 6]
ordered:
[[30], [6], 0]
[[6], [30], 0]
ordered:
[[6, 30], [], -4.975263595581055]
[[30, 6], [], -3.1479618549346924]
seq 
[181, 159]
ordered:
[[181], [159], 0]
[[159], [181], 0]
ordered:
[[181, 159], [], -11.121313095092773]
[[159, 181], [], -6.602914810180664]
seq 
[5, 331]
ordered:
[[5], [331], 0]
[[331], [5], 0]
ordered:
[

ordered:
[[3, 334, 125, 9, 58], [], -33.67894744873047]
[[3, 334, 9, 125, 58], [], -33.55052423477173]
[[3, 334, 9, 58, 125], [], -33.38467741012573]
seq 
[2, 98, 117, 45, 335]
ordered:
[[2], [98, 117, 45, 335], 0]
[[98], [2, 117, 45, 335], 0]
[[117], [2, 98, 45, 335], 0]
[[45], [2, 98, 117, 335], 0]
[[335], [2, 98, 117, 45], 0]
ordered:
[[117, 335], [2, 98, 45], -12.128674507141113]
[[98, 335], [2, 117, 45], -11.081016540527344]
[[2, 335], [98, 117, 45], -8.725911140441895]
[[98, 117], [2, 45, 335], -6.170744895935059]
[[117, 98], [2, 45, 335], -5.889685153961182]
[[2, 117], [98, 45, 335], -5.746879577636719]
[[2, 98], [117, 45, 335], -5.504593372344971]
[[2, 45], [98, 117, 335], -5.136874675750732]
[[117, 45], [2, 98, 335], -5.04789924621582]
[[98, 45], [2, 117, 335], -4.999536991119385]
[[98, 2], [117, 45, 335], -3.956097364425659]
[[117, 2], [98, 45, 335], -3.707831382751465]
ordered:
[[117, 335, 98], [2, 45], -17.9427227973938]
[[98, 335, 117], [2, 45], -17.193788528442383]
[[117,

In [18]:
np.exp(-6.743821620941162)

0.0011781361526396613

In [19]:
np.exp(-4.6573686599731445)

0.00949140468456269

In [105]:
results2

{2: [30, 30], 3: [8, 10], 4: [0, 4], 5: [0, 1]}

In [60]:
with open(textfile.split('.capp')[0]+'.beam_prod_result.csv','w') as f :
    f.write("iter,utterance_length,nb_utterances,produced,production_score"+'\n')
    for length in results:
        f.write('1,'+str(length)+','+
                str(results[length][1])+','+
                str(results[length][0])+','+
                str(results[length][0]/results[length][1])+'\n')