In [2]:
import os
import torch
import torch.nn.functional as F
import sys
sys.path.append('..')

import utils
import os

In [17]:
def make_word_dict(aoa_word_list, vocabulary):
    """Split a word list into in-vocabulary and out-of-vocabulary words.

    Args:
        aoa_word_list: iterable of words to look up.
        vocabulary: mapping from word to token id; it is queried with
            ``word in vocabulary`` and ``vocabulary[word]``.

    Returns:
        A ``(token_to_word, missing_words)`` tuple. ``token_to_word`` maps the
        token id of every word found in ``vocabulary`` back to that word.
        ``missing_words`` lists, in input order and with duplicates kept, the
        words that have no entry in ``vocabulary``.
    """
    token_to_word = {}
    missing_words = []
    for candidate in aoa_word_list:
        if candidate not in vocabulary:
            missing_words.append(candidate)
            continue
        token_to_word[vocabulary[candidate]] = candidate
    return token_to_word, missing_words

In [4]:
def get_surprisals(model, dataset, word_dict, device):
    """Accumulate surprisal totals for the tracked words over ``dataset``.

    Each utterance is run through ``model`` as a batch of one. The model
    output is turned into surprisals with a negative log-softmax over its
    last (third) axis. For every position whose token id is a key of
    ``word_dict``, the surprisal stored at ``[0, position, token_id]`` is
    added to that word's running total.

    Args:
        model: callable taking a ``(1, seq_len)`` token tensor and returning
            a 3-D tensor indexed as ``[batch, position, token_id]``.
        dataset: iterable of utterances, each a sequence of token ids.
        word_dict: mapping from token id to word (as built by
            ``make_word_dict``).
        device: torch device the utterance tensors are moved to.

    Returns:
        Dict mapping every word in ``word_dict`` to ``[surprisal_sum, count]``.
        Words that never occur keep ``[0.0, 0]``.
    """
    model.eval()
    word_surprisals = {word: [0.0, 0] for word in word_dict.values()}
    # Inference only: no_grad avoids building an autograd graph per utterance.
    with torch.no_grad():
        # Iterate the dataset that was passed in, not a notebook global.
        for utt in dataset:
            if len(utt) == 0:
                # Nothing to score, and an empty tensor cannot be fed to the model.
                continue
            utt_tensor = torch.tensor(utt)[None, :].to(device)
            outputs = model(utt_tensor)
            surprisals = -F.log_softmax(outputs, dim=2)
            # Index the batch axis explicitly; an argument-less squeeze would
            # also drop the sequence axis of a one-token utterance.
            token_ids = utt_tensor[0].tolist()
            # NOTE(review): the surprisal is read from the output at the same
            # position as the word itself. Confirm the model's outputs are
            # aligned that way rather than predicting the next position.
            # One pass over the utterance replaces one tensor comparison per
            # tracked word; per-word accumulation order is unchanged.
            for position, token_id in enumerate(token_ids):
                if token_id not in word_dict:
                    continue
                surprisal = (surprisals[0, position, token_id] + sys.float_info.epsilon).item()
                entry = word_surprisals[word_dict[token_id]]
                entry[0] += surprisal
                entry[1] += 1

    return word_surprisals

In [6]:
# Input locations for this run; every path is relative to the notebook's working directory.
# Pickled dataset, loaded further down with utils.open_pkl into `all_data`.
all_child_directed_data_path ="../../../Data/model_datasets/eng/all_child_directed_data_vocab_size_5000.pkl"
# Pickled word -> token-id mapping, loaded further down with utils.open_pkl into `vocabulary`.
encoding_dictionary_path="../../../Data/model_datasets/eng/encoding_dictionary_vocab_size_5000.pkl"
# CSV of target words, read further down with utils.open_word_list_csv.
aoa_word_list="../../../Data/model_datasets/eng/aoa_words.csv"
# Directory holding the trained model checkpoint.
experiment_dir="../../../Results/experiments/2021-08-20_lstm_eng_5e_256b_em100_hd100_v5000_run0"
# Checkpoint file name inside experiment_dir.
# NOTE(review): the name `model` is later rebound to the loaded module itself.
model="model.pt"

In [7]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Load the vocabulary; make_word_dict looks words up in it as `vocabulary[word]` to get token ids.
# NOTE(review): open_pkl presumably unpickles the file — only point it at trusted data.
vocabulary = utils.open_pkl(encoding_dictionary_path)

In [9]:
# NOTE(review): this rebinds `model` from the checkpoint file name to the
# loaded module, so the cell only works once per assignment of the file name.
# map_location loads the checkpoint's tensors directly onto `device`, so a
# checkpoint saved from a GPU can also be opened when `device` is the CPU.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model = torch.load(os.path.join(experiment_dir, model), map_location=device)

In [10]:
# Display the loaded module's layer summary.
model

LSTM(
  (word_embeddings): Embedding(5001, 100, padding_idx=0)
  (lstm): LSTM(100, 100, num_layers=2, batch_first=True)
  (linear): Linear(in_features=100, out_features=5001, bias=True)
)

In [11]:
# Move the model's parameters to the selected device before running it.
model = model.to(device)

In [12]:
# Load the utterances; each item is later passed to torch.tensor as a sequence of token ids.
all_data = utils.open_pkl(all_child_directed_data_path)

In [14]:
# Read the target words whose surprisal will be measured.
word_list = utils.open_word_list_csv(aoa_word_list)

In [15]:
# Inspect the loaded words (the list contains repeated entries, e.g. 'chicken' and 'clean').
word_list

['airplane',
 'all',
 'animal',
 'another',
 'apple',
 'arm',
 'asleep',
 'aunt',
 'away',
 'baby',
 'babysitter',
 'back',
 'bad',
 'ball',
 'balloon',
 'banana',
 'bath',
 'bathroom',
 'bathtub',
 'beach',
 'beads',
 'bear',
 'bed',
 'bedroom',
 'bee',
 'bib',
 'bicycle',
 'big',
 'bird',
 'bite',
 'blanket',
 'block',
 'blow',
 'blue',
 'book',
 'boots',
 'bottle',
 'bowl',
 'box',
 'boy',
 'bread',
 'break',
 'breakfast',
 'bring',
 'broken',
 'broom',
 'brother',
 'brush',
 'bubbles',
 'bug',
 'bump',
 'bunny',
 'bus',
 'butter',
 'butterfly',
 'button',
 'bye',
 'cake',
 'candy',
 'car',
 'careful',
 'carrots',
 'cat',
 'cereal',
 'chair',
 'cheek',
 'cheerios',
 'cheese',
 'chicken',
 'chicken',
 'child',
 'church',
 'clean',
 'clean',
 'clock',
 'close',
 'coat',
 'cockadoodledoo',
 'coffee',
 'cold',
 'comb',
 'cookie',
 'couch',
 'cow',
 'cracker',
 'crib',
 'cry',
 'cup',
 'cute',
 'daddy',
 'dance',
 'dark',
 'day',
 'deer',
 'diaper',
 'dinner',
 'dirty',
 'dish',
 'dog',


In [18]:
# Map token ids to target words, and collect the target words that have no vocabulary entry.
word_dict, not_in_vocab_list = make_word_dict(word_list, vocabulary)

In [19]:
# Inspect the token-id -> word mapping.
word_dict

{2506: 'airplane',
 2585: 'all',
 1693: 'animal',
 1211: 'another',
 1838: 'apple',
 4479: 'arm',
 4375: 'asleep',
 2690: 'aunt',
 4376: 'away',
 4691: 'baby',
 3169: 'babysitter',
 4096: 'back',
 125: 'bad',
 3972: 'ball',
 867: 'balloon',
 739: 'banana',
 3131: 'bath',
 2881: 'bathroom',
 2650: 'bathtub',
 2806: 'beach',
 2941: 'beads',
 1477: 'bear',
 4078: 'bed',
 4809: 'bedroom',
 3316: 'bee',
 1837: 'bib',
 369: 'bicycle',
 1527: 'big',
 3483: 'bird',
 3665: 'bite',
 4396: 'blanket',
 3277: 'block',
 2131: 'blow',
 4742: 'blue',
 3197: 'book',
 2952: 'boots',
 4139: 'bottle',
 560: 'bowl',
 4449: 'box',
 4558: 'boy',
 1675: 'bread',
 2771: 'break',
 3449: 'breakfast',
 1124: 'bring',
 3566: 'broken',
 1767: 'broom',
 4866: 'brother',
 148: 'brush',
 4520: 'bubbles',
 3724: 'bug',
 672: 'bump',
 3750: 'bunny',
 3948: 'bus',
 3339: 'butter',
 3864: 'butterfly',
 1386: 'button',
 35: 'bye',
 3655: 'cake',
 1366: 'candy',
 4364: 'car',
 4373: 'careful',
 4081: 'carrots',
 4394: 'cat'

In [20]:
# Inspect the target words that were not found in the vocabulary.
not_in_vocab_list

['shh/shush/hush']

In [21]:
# Manual walk-through of get_surprisals, one step per cell.
# Put the model in evaluation mode and start with an empty accumulator.
model.eval()
word_surprisals = {}

In [23]:
# Give every tracked word a fresh [surprisal_sum, occurrence_count] entry.
for tracked_word in word_dict.values():
    word_surprisals[tracked_word] = [0.0, 0]

In [35]:
# Pick one utterance (index 1 of all_data) to step through by hand.
utt=all_data[1]

In [36]:
# Build a batch of one: add a leading axis to the utterance, then move it to
# the device the model lives on.
utt_tensor = torch.tensor(utt)[None, :].to(device)

In [37]:
# Inspect the batched utterance; the leading zeros match the model's padding_idx=0.
utt_tensor

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0, 2834,  426, 3898, 2221, 2182,
          742, 3762,  488, 2239, 2182, 4136]], device='cuda:0')

In [38]:
# Forward pass for the single utterance.
# NOTE(review): autograd is active here (the surprisals displayed below carry a
# grad_fn); wrap the call in torch.no_grad() if gradients are not needed.
outputs = model(utt_tensor)

In [39]:
# Surprisal = negative log-probability: log-softmax over the last axis (dim=2)
# of the model output, negated. Values are in natural-log units.
surprisals = -F.log_softmax(outputs, dim=2)

In [40]:
# Inspect the surprisal tensor for the utterance.
surprisals

tensor([[[1.1921e-07, 2.9538e+01, 3.1397e+01,  ..., 2.8292e+01,
          3.1907e+01, 2.8387e+01],
         [-0.0000e+00, 3.3593e+01, 3.4193e+01,  ..., 3.1138e+01,
          3.7115e+01, 3.0842e+01],
         [-0.0000e+00, 3.0461e+01, 2.9995e+01,  ..., 2.7714e+01,
          3.4108e+01, 2.6782e+01],
         ...,
         [2.6297e+01, 2.7367e+01, 2.7165e+01,  ..., 2.3221e+01,
          3.2384e+01, 2.2748e+01],
         [2.4904e+01, 2.9954e+01, 3.1297e+01,  ..., 2.6379e+01,
          3.1717e+01, 2.4175e+01],
         [2.7588e+01, 3.3268e+01, 2.7802e+01,  ..., 2.8018e+01,
          3.9908e+01, 2.9187e+01]]], device='cuda:0', grad_fn=<NegBackward>)

In [46]:
# Accumulation step for the one utterance loaded above: for each tracked token
# id, find the positions where it occurs and add the surprisal stored at that
# position to the word's running [sum, count] entry.
utt = torch.squeeze(utt_tensor)
for word_index in word_dict:
    index_matches = torch.nonzero(utt == word_index, as_tuple=False)
    if len(index_matches) == 0:
        continue
    word = word_dict[word_index]
    running = word_surprisals[word]
    for i in index_matches:
        match = i.item()
        surprisal = (surprisals[0][match][word_index] + sys.float_info.epsilon).item()
        running[0] += surprisal
        running[1] += 1

In [47]:
# Inspect the running [sum, count] totals after processing the single utterance.
word_surprisals

{'airplane': [0.0, 0],
 'all': [0.0, 0],
 'animal': [0.0, 0],
 'another': [0.0, 0],
 'apple': [0.0, 0],
 'arm': [0.0, 0],
 'asleep': [0.0, 0],
 'aunt': [0.0, 0],
 'away': [0.0, 0],
 'baby': [0.0, 0],
 'babysitter': [0.0, 0],
 'back': [0.0, 0],
 'bad': [0.0, 0],
 'ball': [0.0, 0],
 'balloon': [0.0, 0],
 'banana': [0.0, 0],
 'bath': [0.0, 0],
 'bathroom': [0.0, 0],
 'bathtub': [0.0, 0],
 'beach': [0.0, 0],
 'beads': [0.0, 0],
 'bear': [0.0, 0],
 'bed': [0.0, 0],
 'bedroom': [0.0, 0],
 'bee': [0.0, 0],
 'bib': [0.0, 0],
 'bicycle': [0.0, 0],
 'big': [0.0, 0],
 'bird': [0.0, 0],
 'bite': [0.0, 0],
 'blanket': [0.0, 0],
 'block': [0.0, 0],
 'blow': [0.0, 0],
 'blue': [0.0, 0],
 'book': [0.0, 0],
 'boots': [0.0, 0],
 'bottle': [0.0, 0],
 'bowl': [0.0, 0],
 'box': [0.0, 0],
 'boy': [0.0, 0],
 'bread': [0.0, 0],
 'break': [0.0, 0],
 'breakfast': [0.0, 0],
 'bring': [0.0, 0],
 'broken': [0.0, 0],
 'broom': [0.0, 0],
 'brother': [0.0, 0],
 'brush': [0.0, 0],
 'bubbles': [0.0, 0],
 'bug': [0.0, 0

In [None]:
    # Run the full pass over all_data through get_surprisals instead of a
    # pasted copy of its body, so the loop logic lives in one place.
    # get_surprisals puts the model in eval mode and returns a fresh
    # {word: [surprisal_sum, count]} dict.
    word_surprisals = get_surprisals(model, all_data, word_dict, device)

In [54]:
# Target words missing from the vocabulary get an empty [sum, count] entry, so
# they still appear in the saved CSV (written as 'NA' because their count is 0).
word_surprisals.update({word: [0.0, 0] for word in not_in_vocab_list})

In [55]:
import csv
def save_surprisals_as_csv(surprisals, experiment_dir):
    """Write per-word average surprisals to ``average_surprisals.csv``.

    Args:
        surprisals: mapping from word to a ``[surprisal_sum, n_instances]``
            pair.
        experiment_dir: directory the file is written into, with or without
            a trailing path separator.

    The file has a ``word, surprisal_value, n_instances`` header. Each word's
    row holds its mean surprisal (sum / n) with 16 decimals and its count.
    Words with zero instances are written as ``NA, NA``.
    """
    # Join the path properly: plain string concatenation only worked when
    # experiment_dir already ended with a separator.
    output_path = os.path.join(experiment_dir, "average_surprisals.csv")
    # newline='' lets the csv module control line endings, as its docs require.
    with open(output_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["word", "surprisal_value", "n_instances"])
        for word, (_sum, n) in surprisals.items():
            if n == 0:
                # No occurrences: an average is undefined.
                writer.writerow([word, 'NA', 'NA'])
            else:
                writer.writerow([word, f"{_sum / n:.16f}", str(n)])

In [56]:
# Write average_surprisals.csv into the current working directory.
save_surprisals_as_csv(word_surprisals, "./")

In [None]:
    # Script-style version of the steps above, driven by a parameters object.
    # NOTE(review): get_parameters() is not defined in this notebook, and the
    # CSV is written through utils.save_surprisals_as_csv rather than the
    # function defined in the notebook — confirm both exist where this runs.
    params = get_parameters()
    # TODO: add batching and further optimisation.
    device = torch.device('cuda') if params.gpu_run == True else torch.device('cpu')
    vocabulary = utils.open_pkl(params.encoding_dictionary_path)
    model = torch.load(os.path.join(params.experiment_dir, params.model))
    model = model.to(device)
    all_data = utils.open_pkl(params.all_child_directed_data_path)
    word_list = utils.open_word_list_csv(params.aoa_word_list)
    word_dict, not_in_vocab_list = make_word_dict(word_list, vocabulary)
    average_surprisals = get_surprisals(model, all_data, word_dict, device)
    # Keep target words that are missing from the vocabulary in the output as
    # empty [sum, count] entries, the same shape get_surprisals gives to
    # tracked words that never occur.
    for word in not_in_vocab_list:
        average_surprisals[word] = [0.0, 0]
    utils.save_surprisals_as_csv(average_surprisals, params.experiment_dir)
    print(average_surprisals)