# K-> inf

In [5]:
from pathlib import Path
# Absolute, machine-specific directory under which the data directory used
# below (`base_dir`) lives. Echoed so the resolved path shows in the output.
pt = Path("/private/home/ccaucheteux/hasson-syntaxe-vs-semantics/")
pt

PosixPath('/private/home/ccaucheteux/hasson-syntaxe-vs-semantics')

In [9]:
ls 

/bin/bash: /public/apps/anaconda3/2020.11/lib/libtinfo.so.6: no version information available (required by /bin/bash)
0_equival_sentences.npy          4_equival_sentences_params.npy
0_equival_sentences_params.npy   50_equival_sentences.npy
19_equival_sentences.npy         50_equival_sentences_params.npy
19_equival_sentences_params.npy  51_equival_sentences.npy
20_equival_sentences.npy         51_equival_sentences_params.npy
20_equival_sentences_params.npy  52_equival_sentences.npy
21_equival_sentences.npy         52_equival_sentences_params.npy
21_equival_sentences_params.npy  53_equival_sentences.npy
22_equival_sentences.npy         53_equival_sentences_params.npy
22_equival_sentences_params.npy  54_equival_sentences.npy
23_equival_sentences.npy         54_equival_sentences_params.npy
23_equival_sentences_params.npy  55_equival_sentences.npy
24_equival_sentences.npy         55_equival_sentences_params.npy
24_equival_sentences_params.npy  56_equival_sentences.npy
25_equival_sentences.n

In [10]:
# Directory holding the `<k>_equival_sentences.npy` files loaded in the cells below.
base_dir = Path("/private/home/ccaucheteux/hasson-syntaxe-vs-semantics/data/syntactic_equivalences/only_10m_wikisent")

In [16]:
import numpy as np
# Load file 0 as a sanity check. The file stores a pickled dict inside a 0-d
# object array, hence `.item()` to unwrap it.
# NOTE(review): allow_pickle=True can execute arbitrary code on load; only use on trusted files.
test = np.load(base_dir / "0_equival_sentences.npy", allow_pickle=True).item()

In [34]:
# Inspect which fields a loaded record provides.
test.keys()

dict_keys(['base_tokens', 'shuffled_tokens', 'distances', 'is_grammatical'])

In [147]:
import logging

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from tqdm.notebook import tqdm


def extract_hiddens(words, model, tokenizer, agg="sum", cuda=False):
    """Run ``model`` on one sentence and pool its hidden states word by word.

    Parameters
    ----------
    words : sequence of str
        The sentence, already split into words.
    model
        Called as ``model(**inputs, output_hidden_states=True)``; the result
        must expose ``.hidden_states``.
    tokenizer
        Forwarded to ``map_word_to_inputs`` to build the model inputs and the
        word -> token-position mapping.
    agg : str
        ``"mean"`` averages the token vectors belonging to a word; any other
        value sums them.
    cuda : bool
        If true, inputs are moved to the GPU before the forward pass and the
        stacked hidden states are moved back to the CPU afterwards.

    Returns
    -------
    torch.Tensor
        Per-word hidden states stacked along dim 1, i.e. one slice per word.
        Presumably shaped (n_layers, n_words, hidden_dim); the notebook output
        for gpt2 shows ``[13, 22, 768]`` for a 22-word sentence.
    """
    inputs, mapping = map_word_to_inputs(words, tokenizer)
    # Only a single sentence (batch of one) is supported.
    assert len(inputs["input_ids"]) == 1

    pool_tokens = torch.mean if agg == "mean" else torch.sum

    with torch.no_grad():
        if cuda:
            inputs = {name: tensor.cuda() for name, tensor in inputs.items()}

        layer_outputs = model(**inputs, output_hidden_states=True).hidden_states
        # Stack the per-layer outputs and drop the singleton batch axis.
        stacked = torch.stack(layer_outputs).squeeze(1)
        if cuda:
            stacked = stacked.cpu()

        pooled_words = []
        for word_index in range(len(words)):
            positions = mapping[word_index]
            # First/last entries of the mapping act as [start, end) slice bounds.
            token_slice = stacked[:, positions[0] : positions[-1]]
            pooled_words.append(pool_tokens(token_slice, dim=1))
        word_level_hidden_states = torch.stack(pooled_words, dim=1)

    return word_level_hidden_states


def map_word_to_inputs(words, tokenizer):
    """Tokenise a sentence word by word and record which tokens belong to which word.

    Each word is tokenised on its own and the pieces are concatenated along
    the sequence axis into a single batch-of-one input.

    Special tokens are disabled for the per-word calls. With the tokenizer
    default (``add_special_tokens=True``) a BERT-style tokenizer would wrap
    *every* word in ``[CLS] ... [SEP]``, and those markers would then be
    pooled into the word's representation.
    NOTE(review): as a consequence no sentence-level special tokens are added
    either; confirm that is acceptable for the models being compared.
    NOTE(review): words are tokenised without surrounding context (e.g. no
    leading space), which may differ from tokenising the full sentence.

    Parameters
    ----------
    words : sequence of str
        The sentence, already split into words.
    tokenizer
        Callable as ``tokenizer(text, return_tensors="pt", add_special_tokens=...)``
        returning a mapping of tensors that includes ``"input_ids"``.

    Returns
    -------
    inputs : dict of torch.LongTensor
        One tensor per tokenizer output key, each with a leading batch axis of 1.
    mapping : dict of int -> torch.Tensor
        ``mapping[i]`` is ``arange(start, end + 1)`` for word ``i``: the first
        entry is the first token position and the LAST entry is the exclusive
        end, so callers slice with ``mapping[i][0]:mapping[i][-1]``.
    """
    # Tokenising the empty string yields the set of keys (with zero-length
    # tensors) that the per-word outputs are appended to.
    empty = tokenizer("", return_tensors="pt", add_special_tokens=False)
    pieces = {key: [value.long()] for key, value in empty.items()}

    mapping = {}
    idx = 0
    for i, word in enumerate(words):
        word_inpt = tokenizer(word, return_tensors="pt", add_special_tokens=False)
        for key in pieces:
            pieces[key].append(word_inpt[key].long())
        ntok = word_inpt["input_ids"].size(1)
        mapping[i] = torch.arange(idx, idx + ntok + 1)
        idx += ntok

    # Concatenate once at the end instead of re-allocating on every word.
    inputs = {key: torch.cat(chunks, dim=1) for key, chunks in pieces.items()}
    return inputs, mapping


def get_block_embeddings(blocks, model_name="gpt2", agg="sum", cuda=False):
    """Embed several blocks of sentences with a pretrained transformer.

    ``blocks`` is a list of blocks; every block is a collection of sentences
    that all have the same number of words, and every sentence is a sequence
    of words.

    The tokenizer and model named ``model_name`` are loaded on every call.
    ``agg`` ("sum" or "mean") selects how the token vectors of a word are
    pooled and ``cuda`` moves the model to the GPU.

    Returns a list with one tensor per block, obtained by stacking the
    per-sentence outputs of ``extract_hiddens`` along dim 1 (the notebook
    output shows shapes such as ``[13, 5, 22, 768]``).
    """
    assert agg in ["sum", "mean"]

    print(f"Loading model {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if cuda:
        model.to("cuda")
        print("Model CUDA")

    def embed_block(sentences):
        # One hidden-state tensor per sentence of the block.
        sentence_embeddings = []
        for words in tqdm(sentences):
            print(words[:2])
            hiddens = extract_hiddens(words, model, tokenizer, agg=agg, cuda=cuda)
            assert hiddens.shape[1] == len(words)
            sentence_embeddings.append(hiddens.detach())
        # Stacking only works because all sentences of a block share a length.
        return torch.stack(sentence_embeddings, dim=1)

    return [embed_block(sentences) for sentences in blocks]


In [152]:
# Embed the first 5 shuffled variants of the first 2 selected sentences with bert-large-cased.
# NOTE(review): `shuffled` is only built by the sentence-selection cell further
# down, so this cell fails on a fresh Restart & Run All.
bar = get_block_embeddings([s[:5] for s in shuffled[:2]], cuda=True, agg="sum", model_name = "bert-large-cased")

Loading model bert-large-cased
Model CUDA


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

['Hornsby' 'is']
['version' 'is']
['figure' 'is']
['volunteer' 'is']
['Freedom' 'becomes']



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

['hand' 'is']
['Lincoln' 'is']
['container' 'is']
['Silicon' 'is']
['man' 'is']



In [136]:
# Shape of each block's embedding tensor.
# NOTE(review): the recorded output (13 x ... x 768) carries an earlier execution
# count than the bert-large-cased call above and presumably stems from a gpt2
# run; re-run to refresh.
[b.shape for b in bar]

[torch.Size([13, 5, 22, 768]), torch.Size([13, 5, 21, 768])]

In [98]:
# Load gpt2 into notebook-level `tokenizer` / `model` for the manual checks below
# (get_block_embeddings loads its own copies internally).
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [108]:
# Check the word -> token mapping on a single sentence.
# NOTE(review): `words` is not defined above this cell; it is only assigned by
# the loop cell further down (`for words in tqdm(shuffled[0])`).
inputs, mapping = map_word_to_inputs(words, tokenizer)

In [111]:
# Each entry runs from the word's first token position up to and including the
# first position of the next word; extract_hiddens slices with [first:last].
mapping

{0: tensor([0, 1]),
 1: tensor([1, 2]),
 2: tensor([2, 3]),
 3: tensor([3, 4]),
 4: tensor([4, 5, 6]),
 5: tensor([6, 7]),
 6: tensor([ 7,  8,  9, 10]),
 7: tensor([10, 11]),
 8: tensor([11, 12]),
 9: tensor([12, 13]),
 10: tensor([13, 14, 15]),
 11: tensor([15, 16, 17, 18, 19]),
 12: tensor([19, 20]),
 13: tensor([20, 21, 22, 23]),
 14: tensor([23, 24]),
 15: tensor([24, 25]),
 16: tensor([25, 26]),
 17: tensor([26, 27, 28]),
 18: tensor([28, 29, 30, 31]),
 19: tensor([31, 32, 33]),
 20: tensor([33, 34, 35, 36]),
 21: tensor([36, 37])}

In [110]:
# (batch of 1, total number of tokens across all words)
inputs["input_ids"].shape

torch.Size([1, 37])

In [97]:
# Shape of the first entry of `shuffled`; its second axis matches len(words) below.
print(shuffled[0].shape)

(200, 22)


In [107]:
# Number of words in the sentence currently bound to `words`.
len(words)

22

In [118]:
# Manual dry run of extract_hiddens over every row of shuffled[0], printing
# shapes along the way. Relies on the notebook-level `model` / `tokenizer`
# (gpt2) and on `shuffled` from the sentence-selection cell.
outputs = []
for words in tqdm(shuffled[0]):
    print(words[:5])  # first five words of the current row
    print(words.shape)
    hiddens = extract_hiddens(words, model, tokenizer, agg="sum", cuda=False)
    print(hiddens.shape)
    outputs.append(hiddens)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

['Hornsby' 'is' 'a' 'slight' 'filing']
(22,)
torch.Size([13, 22, 768])
['version' 'is' 'the' 'good' 'thought']
(22,)
torch.Size([13, 22, 768])
['figure' 'is' 'the' 'sharp' 'admiration']
(22,)



KeyboardInterrupt: 

In [None]:
# Extract GPT-2 embedding to see whether it changes 
# Extract bert-large embedding

In [36]:
# Fields of a loaded record; the selection cell below reads all four of them.
test.keys()

dict_keys(['base_tokens', 'shuffled_tokens', 'distances', 'is_grammatical'])

In [153]:
%%time
import numpy as np


def select_equivalence_sentences(base_dir, n_target=500, min_len=5, max_len=50, verbose=True):
    """Collect base/shuffled token arrays from ``<k>_equival_sentences.npy`` files.

    Files are visited in increasing order of their numeric prefix ``k`` until
    at least ``n_target`` sentences have been gathered or no files are left.
    Only files that actually exist are enumerated, so the search always
    terminates, even when the directory is missing or holds too few
    qualifying sentences.

    An entry of a file is kept when all of the following hold:
    * every value of its ``is_grammatical`` array is truthy,
    * every value of its ``distances`` array is truthy (non-zero),
    * ``min_len < len(base_tokens) < max_len`` (both bounds exclusive).

    Parameters
    ----------
    base_dir : pathlib.Path
        Directory containing the ``.npy`` files.
    n_target : int
        Stop reading further files once this many sentences were collected
        (the last file read is always consumed entirely).
    min_len, max_len : int
        Exclusive bounds on the length of ``base_tokens``.
    verbose : bool
        Print the running number of collected sentences after each file.

    Returns
    -------
    base : list
        Selected ``base_tokens`` entries.
    shuffled : list
        Matching ``shuffled_tokens`` entries, aligned with ``base``.
    next_index : int
        Numeric prefix of the last visited file plus one (0 if none was visited).
    """
    suffix = "_equival_sentences.npy"
    paths = base_dir.glob(f"*{suffix}") if base_dir.is_dir() else []
    stems = (path.name[: -len(suffix)] for path in paths)
    file_indices = sorted({int(stem) for stem in stems if stem.isdecimal()})

    base, shuffled = [], []
    next_index = 0
    for file_index in file_indices:
        if len(shuffled) >= n_target:
            break
        next_index = file_index + 1
        path = base_dir / f"{file_index}{suffix}"
        if not path.is_file():
            continue

        # NOTE(review): allow_pickle=True can execute arbitrary code on load;
        # only use on trusted files.
        record = np.load(path, allow_pickle=True).item()
        gram = np.where([g.all() for g in record["is_grammatical"]])[0]
        dist = np.where([d.all() for d in record["distances"]])[0]
        lens = np.where([min_len < len(t) < max_len for t in record["base_tokens"]])[0]
        # Sorted indices that pass all three filters.
        keep = np.intersect1d(np.intersect1d(gram, lens), dist)

        base.extend(record["base_tokens"][k] for k in keep)
        shuffled.extend(record["shuffled_tokens"][k] for k in keep)
        if verbose:
            print(len(shuffled))

    if len(shuffled) < n_target:
        print(f"Only {len(shuffled)} of {n_target} requested sentences found in {base_dir}")
    return base, shuffled, next_index


# Select 500 sentences
base_dir = Path("/private/home/ccaucheteux/hasson-syntaxe-vs-semantics/data/syntactic_equivalences/only_10m_wikisent")
base, shuffled, i = select_equivalence_sentences(base_dir, n_target=500)

65
124
183
239
297
359
420
475
527
CPU times: user 140 ms, sys: 137 ms, total: 277 ms
Wall time: 276 ms


In [155]:
# Value of the file counter after the selection loop above finished.
i

9

In [93]:
# Largest second-axis size (words per sentence, cf. shuffled[0].shape) among the selected entries.
np.max([s.shape[1] for s in shuffled])

48

In [87]:
# Indices of the selected entries whose second axis is longer than 30.
np.where(np.array([s.shape[1]>30 for s in shuffled]))

(array([  2,  22,  39,  46,  47,  85,  92,  93,  96,  99, 110, 112, 113,
        117, 123, 127, 131, 135, 142, 155, 174, 177, 189, 205, 206, 211,
        212, 220, 221, 226, 227, 233, 255, 271, 274, 295, 296, 299, 306,
        307, 312, 316, 318, 319, 325, 332, 334, 355, 362, 369, 373, 377,
        388, 390, 398, 399, 400, 401, 402, 425, 431, 440, 451, 504, 508,
        519, 522]),)

In [88]:
# Eyeball the first row of entry 2, one of the long (>30) entries listed above.
shuffled[2][0]

array(['They', 'have', 'now', 'settled', 'on', 'modern', 'decorator(s',
       'by', 'the', 'modern', 'form', '(', 'modern', 'sky', ')', 'by',
       'the', 'united', 'commander', 'and', 'by', 'capturing', 'the',
       'viable', 'diagnosis', 'in', 'cultural', 'words', 'and', 'sky',
       '.'], dtype='<U18')

In [156]:
# Total number of selected entries (the loop stops once this reaches 500 or more).
len(shuffled)

527

In [49]:
# Shape of the first selected entry of `shuffled`.
shuffled[0].shape

(200, 22)

In [96]:
# NOTE(review): `get_transformer_embeddings` is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. It is
# presumably an earlier name of `get_block_embeddings`; confirm and update or delete.
get_transformer_embeddings(shuffled[0])

Loading model gpt2


Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 