In [11]:
from nli.data import Vocabulary

from datasets import load_from_disk
from tqdm import tqdm
import torch

In [3]:
dataset_snli = load_from_disk('data/snli')

In [4]:
samples = dataset_snli['train']['premise'] + dataset_snli['train']['hypothesis']

In [18]:
def get_words(sentences, threshold=0):
    words = {}
    for s in tqdm(sentences, desc = 'Creating dictionary'):
        for word in s:
            words[word] = words.get(word, 0) + 1

    if threshold > 0:
        newwords = {}
        for word in words:
            if words[word] >= threshold:
                newwords[word] = words[word]
        words = newwords
    
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2

    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort

    sorted_words = [w for (w, _) in sorted_words]
    
    return sorted_words

dataset_corpus = get_words(samples)
len(dataset_corpus)

Creating dictionary: 100%|██████████| 1098734/1098734 [00:02<00:00, 422055.69it/s]


36549

In [20]:
def get_wordvec(dataset_corpus, path_to_vec):

    print('Reading word vectors from {}'.format(path_to_vec))
    with open(path_to_vec, "r", encoding="utf8") as f:
        lines = f.readlines()

    wordvec = {}

    wordvec['<unk>'] = torch.normal(mean=0, std=1, size=(300,))
    # wordvec['<pad>'] = torch.normal(mean=0, std=1, size=(300,))
    wordvec['<pad>'] = torch.zeros(300)

    for line in tqdm(lines, desc = 'Reading word vectors'):
        word, vec = line.split(' ', 1)
        if word in dataset_corpus:
            wordvec[word] = torch.tensor(list(map(float, vec.split())))

    assert list( wordvec.keys() )[:2] == ['<unk>', '<pad>']

    return wordvec

wordvec = get_wordvec(dataset_corpus, 'data/glove.840B.300d.txt')
len(wordvec)

Reading word vectors from data/glove.840B.300d.txt


Reading word vectors:   1%|          | 11898/2196017 [00:08<26:20, 1382.32it/s]


KeyboardInterrupt: 

['<s>',
 '</s>',
 '<p>',
 'a',
 '.',
 'the',
 'in',
 'is',
 'man',
 'on',
 'and',
 'are',
 'of',
 'with',
 'woman',
 'two',
 'people',
 ',',
 'to',
 'at',
 'wearing',
 'an',
 'his',
 'young',
 'men',
 'playing',
 'girl',
 'boy',
 'white',
 'shirt',
 'while',
 'black',
 'dog',
 'sitting',
 'blue',
 'standing',
 'her',
 'red',
 'group',
 'for',
 'down',
 'walking',
 'outside',
 'street',
 'person',
 'front',
 'women',
 'holding',
 'child',
 'by',
 'one',
 'three',
 'there',
 'water',
 'their',
 'up',
 'children',
 'looking',
 'as',
 'some',
 'green',
 'from',
 'little',
 'other',
 'large',
 'through',
 'has',
 'running',
 'yellow',
 'riding',
 'ball',
 'out',
 'hat',
 'next',
 'into',
 'brown',
 'building',
 'near',
 'beach',
 'over',
 'dressed',
 'small',
 'girls',
 'another',
 'dogs',
 'around',
 'crowd',
 "'s",
 'bike',
 'stands',
 'field',
 'jacket',
 'sits',
 'it',
 'jumping',
 'table',
 'working',
 'that',
 'behind',
 'park',
 'orange',
 'sidewalk',
 'background',
 'boys',
 'pink',