In [1]:
import os
import sys
import numpy as np
import json
from os.path import join, dirname

from transformers import GPT2Tokenizer

from ridge_utils.interpdata import lanczosinterp2D
from ridge_utils.SemanticModel import SemanticModel
from ridge_utils.dsutils import make_semantic_model, make_word_ds, make_phoneme_ds
from ridge_utils.stimulus_utils import load_textgrids, load_simulated_trfiles

from configs import *



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_story_wordseqs(stories):
    """Build the word-level sequences for ``stories``.

    Loads the TextGrids for the requested stories from ``engram_dir``, reads
    the response dictionary stored at ``ds003020/derivative/respdict.json``
    under that same directory, derives simulated TR files from it, and hands
    both to ``make_word_ds``.

    Args:
        stories: Story identifiers, forwarded unchanged to ``load_textgrids``.

    Returns:
        The result of ``make_word_ds(grids, trfiles)``. Elsewhere in this
        notebook it is used as a mapping (``.keys()``) whose values expose a
        ``.data`` word list -- presumably keyed by story; verify against
        ``ridge_utils.dsutils``.
    """
    # NOTE: `engram_dir` is a global, presumably supplied by
    # `from configs import *` -- TODO confirm.
    textgrids = load_textgrids(stories, engram_dir)

    respdict_path = join(engram_dir, "ds003020/derivative/respdict.json")
    with open(respdict_path, "r") as respdict_file:
        response_dict = json.load(respdict_file)

    simulated_trfiles = load_simulated_trfiles(response_dict)
    return make_word_ds(textgrids, simulated_trfiles)

In [6]:
# Word sequences for every story in `allstories`.
# NOTE(review): `allstories` is not defined in this notebook; presumably it
# comes from `from configs import *` -- confirm.
x = get_story_wordseqs(allstories)

In [13]:
# Tokenizer for the 'gpt2-xl' checkpoint; used below to tokenize story text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')

In [21]:
# Number of GPT-2 tokens in each story: one entry per story, in the
# iteration order of `x.keys()`.
story_token_lens = []
words_per_chunk = 1024
for k in x.keys():
    words = x[k].data
    this_story_token_lens = 0
    # Tokenize the word list in chunks of 1024 *words*. A chunk can still
    # produce more than 1024 tokens, so the tokenizer may warn that the
    # sequence exceeds the model's maximum length; that is harmless here
    # because the ids are only counted, never fed to a model.
    for i in range(0, len(words), words_per_chunk):
        full_text = " ".join(words[i:i + words_per_chunk])
        # Deliberately no `return_tensors='pt'`: that yields a 2-D tensor
        # with a leading batch dimension of 1, so `len()` of it is always 1
        # rather than the token count. Without it, `input_ids` is a flat
        # list of ids.
        encoded_input = tokenizer(full_text)
        this_story_token_lens += len(encoded_input['input_ids'])
    # Append once per story (not once per chunk) so the list matches its name.
    story_token_lens.append(this_story_token_lens)


Token indices sequence length is longer than the specified maximum sequence length for this model (2421 > 1024). Running this sequence through the model will result in indexing errors


In [14]:
# Sanity check: tokenize a short example sentence, requesting PyTorch tensors.
# Note: this rebinds `encoded_input`, which the token-counting loop also uses.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

In [16]:
# Token ids of the example sentence; with return_tensors='pt' this is a 2-D
# tensor with a leading batch dimension of 1.
encoded_input['input_ids']


tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]])

In [17]:
# Full encoding: `input_ids` plus the matching `attention_mask`.
encoded_input

{'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
# Map the ids of the single batch row back to token strings; in the output,
# a leading 'Ġ' marks tokens that were preceded by a space in the input text.
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])

['Re', 'place', 'Ġme', 'Ġby', 'Ġany', 'Ġtext', 'Ġyou', "'d", 'Ġlike', '.']