In [7]:
from transformers import PreTrainedTokenizer, AutoTokenizer
from transformer_lens import HookedTransformer, HookedTransformerConfig

In [8]:
# Load the pretrained GPT-2 tokenizer once; the vocabulary scan and the
# dataset tokenization in later cells both go through this TOKENIZER object.
TOKENIZER: PreTrainedTokenizer = AutoTokenizer.from_pretrained("gpt2")

In [11]:
# Report the tokenizer's length (its number of entries).
print(f"{len(TOKENIZER)} ")

# Keep every vocabulary entry whose text contains a double-quote character.
tokens_with_quotes: list[str] = [
    vocab_token for vocab_token in TOKENIZER.get_vocab() if '"' in vocab_token
]
print(f"{len(tokens_with_quotes) = }")

50257 
len(tokens_with_quotes) = 131


In [12]:
# Dump every vocabulary token that contains a double quote.
print(tokens_with_quotes)

['{"', 'Ġ"-', 'Ġ"[', '"]=>', 'Ġ"_', '="#', '">', '"""', '="', '?\'"', 'Ġ"#', '\',"', '":"","', '",', '/"', '.""', 'Ġ."', 'Ġ,"', '"/>', '":["', '").', '!"', 'Ġ"""', '"!', 'Ġ"(', 'âĢĶ"', '="/', '"âĢ¦', ';"', 'Ġ"$:/', 'âĢ¦."', '":', '!!"', '"><', '".', '"-', '.",', ',"', 'Ġ["', '%"', 'Ġ("', '=""', '"âĢĶ', '."', '"],"', '\\",', '"},', ']."', '\\">', '?!"', '\'"', '"}],"', ')."', 'Ġ"\'', '"],', '!",', '":-', '?".', 'Ġ..."', '=\\"', 'Ġ"...', ':"', '!".', '("', '},{"', '"))', '!\'"', '"),', ',\'"', 'Ġ".', '")', '":"/', '..."', '."[', '.\'"', 'Ġ"{', '"...', '"},"', 'Ġ"\\', '"></', ']"', '.","', '");', '.,"', '".[', '},"', 'Ġ\\"', '?"', 'Ġ"<', '?",', '\\":', '>"', '\\"', 'Ġ"+', '":[', '"\'', '"},{"', '":{"', '!?"', '),"', '","', '["', 'Ġ{"', '"[', '"]', 'âĢ¦"', 'Ġ"', '":"', 'Ġ"%', '"?', '";', '.")', '":""},{"', 'Ġ""', '}"', 'ĠâĢ¦"', '""', 'Ġ"$', '":[{"', 'Ġ",', '"', '-"', '"(', ')"', 'Ġ"/', '],"', '"}', '\'."', ')",', 'Ġ"âĢ¦', 'Ġ"@']


In [3]:
# Load a subset of the TinyStories dataset: read the whole file, then cut it
# into individual stories at the end-of-text marker. The pieces are not
# stripped, so each story keeps whatever whitespace surrounded the marker.
with open("../data/tiny_stories/tinystories_10k.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()
TEXT_DATA: list[str] = raw_text.split("<|endoftext|>")

In [27]:
# Tokenize every story into its string tokens (no conversion to ids).
text_data_tokenized = [TOKENIZER(story).tokens() for story in TEXT_DATA]

from itertools import chain

# Flatten the per-story token lists into one long token stream.
text_data_tokenized_joined = list(chain.from_iterable(text_data_tokenized))

In [29]:
# Number of stories, then total number of tokens across all stories.
print(
    f"{len(text_data_tokenized) = }",
    f"{len(text_data_tokenized_joined) = }",
    sep="\n",
)

len(text_data_tokenized) = 1728
len(text_data_tokenized_joined) = 349321


In [34]:
# Count how often each quote-containing token appears in the dataset.

from collections import Counter

quote_tokens_in_data = Counter(
    token for token in text_data_tokenized_joined if '"' in token
)

# One line per distinct token: "<count><TAB>`<token>`", in first-seen order.
for token, count in quote_tokens_in_data.items():
    print(f"{count}\t`{token}`")

2983	`Ġ"`
1262	`!"`
1340	`."`
702	`?"`
807	`"`
364	`,"`
41	`".`
3	`?!"`
3	`?".`
2	`'."`
2	`?",`
1	`..."`
3	`",`
5	`!".`
1	`Ġ"'`
1	`',"`


In [44]:
import torch

def process_text(text, tokenizer=None):
    """Tokenize ``text`` and flag which tokens lie inside a double-quoted span.

    Args:
        text: Raw text to tokenize.
        tokenizer: Optional object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. When omitted, the GPT-2
            tokenizer is loaded via ``AutoTokenizer.from_pretrained("gpt2")``,
            exactly as this function did before the parameter existed.
            Passing an already-loaded tokenizer avoids reloading it per call.

    Returns:
        A tuple ``(tokens, token_ids_tensor, quote_feature_tensor)``:
        the list of string tokens, a 1-D integer tensor of their ids, and a
        1-D integer tensor of the same length holding 1 for tokens inside an
        open quotation and 0 otherwise. Tokens that themselves contain a
        double-quote character are always marked 0.
    """
    if tokenizer is None:
        # Default: load the GPT-2 tokenizer
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Convert tokens to IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Compute binary feature for each token (1 if inside a quote, 0 otherwise)
    quote_feature = []
    inside_quote: bool = False
    for token in tokens:
        n_quotes = token.count('"')
        # Only an odd number of quote characters changes the open/closed
        # state; a token such as `""` or `","` opens and closes (or closes and
        # reopens) within itself and must leave the state as it was.
        if n_quotes % 2 == 1:
            inside_quote = not inside_quote
        is_quote = n_quotes > 0
        # quote-bearing tokens are delimiters, not quoted content
        quote_feature.append(1 if (inside_quote and not is_quote) else 0)

    # Convert token IDs and quote feature to tensors. The dtype is pinned so
    # that an empty token list still yields integer tensors instead of the
    # float default torch picks for an empty list.
    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    quote_feature_tensor = torch.tensor(quote_feature, dtype=torch.long)

    return tokens, token_ids_tensor, quote_feature_tensor


def display_results(text):
    """Print the tokens of ``text``, highlighting those flagged as quoted.

    The text is run through ``process_text``; each token whose quote feature
    equals 1 is wrapped in an ANSI blue-background escape sequence, and every
    other token is printed as-is. Tokens are separated by single spaces and
    the listing is followed by a blank line.
    """
    tokens, _, quote_feature_tensor = process_text(text)

    print("Input text:")
    for token, flag in zip(tokens, quote_feature_tensor.tolist()):
        # 44 selects a blue background; 0 resets the terminal styling
        shown = f"\033[44m{token}\033[0m" if flag == 1 else token
        print(shown, end=" ")
    print("\n")
    
# Demo: render one story from TEXT_DATA with its in-quote tokens highlighted
# (index 8 looks like an arbitrary sample — TODO confirm).
display_results(TEXT_DATA[8])

Input text:
ĊĊ Ċ S ara Ġand ĠBen Ġwanted Ġto Ġdecor ate Ġa Ġbowl Ġfor Ġtheir Ġmom . ĠThey Ġfound Ġa Ġbig Ġbowl Ġin Ġthe Ġkitchen Ġand Ġsome Ġpaint Ġand Ġbrushes . ĠThey Ġtook Ġthe Ġbowl Ġand Ġthe Ġpaint Ġto Ġthe Ġbackyard Ġand Ġput Ġthem Ġon Ġa Ġtable . Ċ " [44mLet[0m [44m's[0m [44mĠmake[0m [44mĠthe[0m [44mĠbowl[0m [44mĠpretty[0m [44mĠwith[0m [44mĠcolors[0m ," ĠSara Ġsaid . Ċ " [44mOK[0m [44m,[0m [44mĠI[0m [44mĠwill[0m [44mĠpaint[0m [44mĠa[0m [44mĠflower[0m ," ĠBen Ġsaid . Ċ They Ġstarted Ġto Ġpaint Ġthe Ġbowl Ġwith Ġdifferent Ġcolors . ĠSara Ġpainted Ġa Ġred Ġheart Ġand ĠBen Ġpainted Ġa Ġyellow Ġflower . ĠThey Ġwere Ġhaving Ġfun . Ċ But Ġthen , Ġit Ġstarted Ġto Ġrain . ĠThe Ġrain Ġwas Ġwet Ġand Ġcold . ĠIt Ġmade Ġthe Ġpaint Ġrun Ġand Ġdrip . ĠThe Ġbowl Ġlooked Ġmessy Ġand Ġugly . Ċ " [44mOh[0m [44mĠno[0m [44m,[0m [44mĠthe[0m [44mĠrain[0m [44mĠruined[0m [44mĠour[0m [44mĠbowl[0m !" ĠSara Ġcried . Ċ " [44mMom[0m [44mĠwill[0m [44mĠnot[0m