In [2]:
import os
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../src')
import csv
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import numpy as np


Using TensorFlow backend.


In [84]:

# Tab-separated word list; the loading cell skips its first row and reads a
# word (column 0) and an integer id (column 1) from every following row.
word_list = "../../../data/transcripts/eng/aoa_words.csv"
# Root directory that is walked recursively to find saved Keras model files.
model_dir = "../../../models"
# Directory that receives one '<childname>.aoa_result.csv' file per model.
result_dir = "."

In [91]:

class AoAWord:
    """One entry of the AoA word list, scored against a language model.

    ``contexts`` and ``surprisals`` stay empty until
    ``get_contexts_surprisals`` has found at least one sequence that
    contains the word.
    """

    def __init__(self, word, wordbank_id, maxlen, vocab):
        """Store the word and resolve its token id.

        Args:
            word: Surface form that is looked up in ``vocab``.
            wordbank_id: Identifier carried along unchanged for reporting.
            maxlen: Length every context is padded to; the last position
                holds the target word itself.
            vocab: Mapping from word to integer token id.
        """
        self.word = word
        self.wordbank_id = wordbank_id
        self.maxlen = maxlen
        # -1 marks a word that is absent from the vocabulary.
        self.id = vocab.get(word, -1)
        self.contexts = []
        self.surprisals = []

    def get_contexts_surprisals(self, sequences, model):
        """Collect the word's contexts and compute one surprisal per context.

        For every sequence containing the word, the prefix up to and
        including its FIRST occurrence is kept (later occurrences in the same
        sequence are not scored). Prefixes are left-padded to ``maxlen``; the
        model receives all but the last column and the surprisal is
        ``-log`` of the probability it assigns to the word.

        Args:
            sequences: Iterable of token-id sequences.
            model: Object whose ``predict(X)`` returns, per row, something
                indexable by token id (presumably a probability
                distribution over the vocabulary; confirm against the
                model definition).
        """
        # Start from a clean slate so repeated calls never accumulate
        # duplicate surprisal values.
        self.contexts = []
        self.surprisals = []
        if self.id == -1:
            return

        prefixes = []
        for seq in sequences:
            tokens = list(seq)
            if self.id in tokens:
                prefixes.append(tokens[:tokens.index(self.id) + 1])

        # Nothing to score: do not hand an empty batch to the model.
        if not prefixes:
            return

        padded = np.array(
            pad_sequences(prefixes, maxlen=self.maxlen, padding='pre'))
        self.contexts = padded
        X, y = padded[:, :-1], padded[:, -1]
        p_pred = model.predict(X)
        self.surprisals = [-np.log(prob[target])
                           for prob, target in zip(p_pred, y)]

    def get_avg_surprisal(self, sequences, model):
        """Return the mean surprisal of the word, or ``"NA"`` if it was never scored.

        Contexts are computed on demand when none are stored yet.

        Args:
            sequences: Iterable of token-id sequences, forwarded to
                ``get_contexts_surprisals``.
            model: Model forwarded to ``get_contexts_surprisals``.

        Returns:
            The arithmetic mean of ``self.surprisals``, or the string
            ``"NA"`` when there are none.
        """
        # len() works for both the initial list and the NumPy array stored
        # later; truth-testing a multi-row array would raise ValueError.
        if len(self.contexts) == 0:
            self.get_contexts_surprisals(sequences, model)
        if len(self.surprisals) == 0:
            return "NA"
        return sum(self.surprisals, 0.0) / len(self.surprisals)
        

In [92]:
def get_model_train_test(model_root=None):
    """Load every saved model together with its train and test transcripts.

    The directory tree is walked recursively. For each file named
    ``<childname>_model.h5`` the companion files
    ``train/<childname>.train.txt`` and ``test/<childname>.test.txt`` are
    read from the same directory.

    Args:
        model_root: Directory to search. Defaults to the module-level
            ``model_dir`` when omitted.

    Returns:
        A list of ``(childname, model, train_lines, test_lines)`` tuples,
        ordered by directory and file name.

    Raises:
        OSError: If a companion train or test file cannot be opened.
    """
    if model_root is None:
        model_root = model_dir
    suffix = '_model.h5'
    data = []
    for subdir, dirs, files in os.walk(model_root):
        # Sorting in place makes the traversal order deterministic.
        dirs.sort()
        for file in sorted(files):
            # Only files that follow the naming scheme yield a usable child
            # name; other names that merely contain '.h5' are skipped.
            if not file.endswith(suffix):
                continue
            childname = file[:-len(suffix)]
            trainfile = os.path.join(subdir, 'train', childname + '.train.txt')
            testfile = os.path.join(subdir, 'test', childname + '.test.txt')
            # Read the text files first so a missing transcript fails before
            # the comparatively expensive model load.
            with open(trainfile, 'r') as f:
                train = f.readlines()
            with open(testfile, 'r') as f:
                test = f.readlines()
            this_model = load_model(os.path.join(subdir, file))
            data.append((childname, this_model, train, test))
    return data

In [93]:

# Load the word list as [word, id] pairs: column 0 is the word, column 1 an
# integer id. The first row is treated as a header and skipped.
words = []
# newline='' lets the csv module handle line endings itself.
with open(word_list, newline='') as word_file:
    reader = csv.reader(word_file, delimiter='\t')
    next(reader, None)  # skip the header; tolerate a completely empty file
    for row in reader:
        if not row:
            # Blank lines (e.g. a trailing newline) carry no entry.
            continue
        words.append([row[0], int(row[1])])

In [95]:
# For every child model: rebuild the vocabulary from its transcripts, score
# each word of the word list on the training sequences, and export the
# average surprisal per word to '<childname>.aoa_result.csv'.
data = get_model_train_test()
aoa_corpus = dict()
for childname, model, train, test in data:
    print('PREPARE DATA FOR: ' + childname + '\n')
    # Vocabulary is built from train and test lines together; only the
    # training lines are converted to sequences and scored.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train + test)
    vocab = tokenizer.word_index
    seqs = tokenizer.texts_to_sequences(train)
    if not seqs:
        # max() below would raise on an empty training set.
        print('No training sequences for ' + childname + ', skipping\n')
        continue
    maxlen = max(len(seq) for seq in seqs)
    aoa_words = {wordbank_id: AoAWord(word, wordbank_id, maxlen, vocab)
                 for word, wordbank_id in words}
    aoa_corpus[childname] = aoa_words

    # write all results to the result_dir
    result_path = os.path.join(result_dir, childname + '.aoa_result.csv')
    with open(result_path, 'w', newline='') as f:
        # NOTE(review): the header has spaces after the commas while the data
        # rows do not; it is kept byte-for-byte so existing readers of these
        # files keep working.
        f.write("num_item_id, uni_lemma, avg_surprisal" + '\n')
        # csv.writer quotes a lemma that itself contains a comma.
        writer = csv.writer(f, lineterminator='\n')
        for item_id, aoa_word in aoa_words.items():
            writer.writerow([item_id,
                             aoa_word.word,
                             str(aoa_word.get_avg_surprisal(seqs, model))])
    
    

PREPARE DATA FOR: Will

