In [2]:
from typing import List
from collections import defaultdict
import os
import pickle

import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import scipy.linalg
import torch
import torch.nn.functional as F
import sklearn.cluster

import datasets
from datasets import load_dataset

In [11]:
def contains_unicode(text: str) -> bool:
    """Return True when `text` holds at least one non-ASCII character (code point > 127)."""
    # str.isascii() is True for the empty string, matching any() over no characters.
    return not text.isascii()

# Characters with special meaning in LaTeX text mode and their safe replacements.
# Applied with str.translate so every character is escaped in one pass; chained
# str.replace calls would re-escape the braces introduced by \textbackslash{} etc.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def tokens_to_latex(tokens: List[str], highlight_index=-1) -> str:
    """Render a token sequence as LaTeX, one ``\\tok[color]{...}`` box per token.

    Args:
        tokens: The token strings to render, in order.
        highlight_index: Index of the token drawn with a ``lightred`` background;
            all others are ``white``. Taken modulo ``len(tokens)``, so negative
            indices count from the end (the default ``-1`` highlights the last token).

    Returns:
        The LaTeX source as a single string. Each token is followed by
        ``\\allowbreak`` and a newline token additionally emits ``\\\\``.
        An empty token list yields an empty string.
    """
    if not tokens:
        return ""

    # Loop-invariant: resolve the (possibly negative) highlight position once.
    highlighted = highlight_index % len(tokens)

    pieces = []
    for i, token in enumerate(tokens):
        # choose the text that will go inside the \tok command after {\strut}
        if token == "\n":
            latex_text = r"{\textbackslash}n"  # visible stand-in for a newline
        elif all(c == " " for c in token):
            # invisible text as wide as the run of spaces (also matches the empty token)
            latex_text = r"\phantom{" + "a" * len(token) + r"}"
        elif token == "\t":
            latex_text = r"\phantom{aaaa}"  # invisible text that represents a tab
        else:
            # Escape every LaTeX special character, including & ~ ^ and backslash,
            # which would otherwise break compilation or be silently reinterpreted.
            latex_text = token.translate(_LATEX_ESCAPES)

        background_color = "lightred" if i == highlighted else "white"
        pieces.append(r'\tok[' + background_color + r']{{\strut}' + latex_text + '}')
        pieces.append(r'\allowbreak ')  # Allow line breaks between tokens
        if token == "\n":
            pieces.append(r"\\")  # force a LaTeX line break after a newline token
    return "".join(pieces)

In [10]:
# Load the saved pair from disk; only its first element (the clusters) is used here.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
# Presumably each cluster is a sequence of (tokens, token_idx) samples — the cells
# below unpack cluster[i] that way; confirm against the script that wrote the file.
clusters, _ = torch.load("data/400clusters-2.pt")

## Cluster 11

In [45]:
# Preview one sample of the chosen cluster: the target token together with
# at most `before` tokens of preceding context.
cluster_i = 11
cluster = clusters[cluster_i]
sample_i = 7

before, after = 70, 0

tokens, token_idx = cluster[sample_i]

# Clamp the look-back so the window never starts before the sample's first token.
tokens_before = token_idx if token_idx < before else before
# Computed for symmetry; the slice below uses `after` directly.
room_after = len(tokens) - token_idx - 1
tokens_after = room_after if room_after < after else after

window_start = token_idx - tokens_before
window_stop = token_idx + after + 1
tokens_slice = tokens[window_start:window_stop]
print("".join(tokens_slice))

Lily and Ben nod. They promise to be careful. They ask mom to read the letter to them. Mom smiles. She reads the letter. It is from grandma. She says she loves them a lot. She sends them kisses and hugs. Lily and Ben are happy. They send kisses and hugs back to grandma. They thank mom for the letter


In [48]:
# choose diverse samples from each cluster
i_choices = {
    0: 59,
    3: 60,
    4: 49,
    5: 57,
    7: 70,
}

after = 0

text = ""

for j, (i, context_length) in enumerate(i_choices.items()):
    before = context_length
    tokens, token_idx = cluster[i]
    tokens_before = min(before, token_idx)
    tokens_after = min(after, len(tokens) - token_idx - 1)
    tokens_slice = tokens[token_idx-tokens_before:token_idx + after + 1]
    text += tokens_to_latex(tokens_slice)
    if j != len(i_choices)-1:
        text += "\n\n{\color{gray}\\rule{0.99\linewidth}{0.5pt}}\n\n"

with open(f"/om2/user/ericjm/the-everything-machine/texts/tinystories/cluster{cluster_i}.tex", 'w') as f:
    f.write(text)

## Cluster 31

In [50]:
# Preview one sample of the chosen cluster: the target token together with
# at most `before` tokens of preceding context.
cluster_i = 31
cluster = clusters[cluster_i]
sample_i = 0

before, after = 70, 0

tokens, token_idx = cluster[sample_i]

# Clamp the look-back so the window never starts before the sample's first token.
tokens_before = token_idx if token_idx < before else before
# Computed for symmetry; the slice below uses `after` directly.
room_after = len(tokens) - token_idx - 1
tokens_after = room_after if room_after < after else after

window_start = token_idx - tokens_before
window_stop = token_idx + after + 1
tokens_slice = tokens[window_start:window_stop]
print("".join(tokens_slice))

Once upon a time


In [51]:
# choose diverse samples from each cluster
i_choices = {
    0: 10,
    1: 10,
    2: 10,
    3: 10,
    4: 10,
    5: 10,
    6: 10,
    7: 10,
}

after = 0

text = ""

for j, (i, context_length) in enumerate(i_choices.items()):
    before = context_length
    tokens, token_idx = cluster[i]
    tokens_before = min(before, token_idx)
    tokens_after = min(after, len(tokens) - token_idx - 1)
    tokens_slice = tokens[token_idx-tokens_before:token_idx + after + 1]
    text += tokens_to_latex(tokens_slice)
    if j != len(i_choices)-1:
        text += "\n\n{\color{gray}\\rule{0.99\linewidth}{0.5pt}}\n\n"

with open(f"/om2/user/ericjm/the-everything-machine/texts/tinystories/cluster{cluster_i}.tex", 'w') as f:
    f.write(text)

## Cluster 75

In [130]:
# Preview one sample of the chosen cluster: the target token together with
# at most `before` tokens of preceding context.
cluster_i = 75
cluster = clusters[cluster_i]
sample_i = 4

before, after = 38, 0

tokens, token_idx = cluster[sample_i]

# Clamp the look-back so the window never starts before the sample's first token.
tokens_before = token_idx if token_idx < before else before
# Computed for symmetry; the slice below uses `after` directly.
room_after = len(tokens) - token_idx - 1
tokens_after = room_after if room_after < after else after

window_start = token_idx - tokens_before
window_stop = token_idx + after + 1
tokens_slice = tokens[window_start:window_stop]
print("".join(tokens_slice))

Next, her mom told Emma to wipe the floor clean. Emma grabbed a cloth and wiped the floor. When she was finished, it was as clean as a new penny.

Finally,


In [131]:
# choose diverse samples from each cluster
i_choices = {
    0: 54,
    1: 72,
    2: 50,
    3: 80,
    4: 38
}

after = 0

text = ""

for j, (i, context_length) in enumerate(i_choices.items()):
    before = context_length
    tokens, token_idx = cluster[i]
    tokens_before = min(before, token_idx)
    tokens_after = min(after, len(tokens) - token_idx - 1)
    tokens_slice = tokens[token_idx-tokens_before:token_idx + after + 1]
    text += tokens_to_latex(tokens_slice)
    if j != len(i_choices)-1:
        text += "\n\n{\color{gray}\\rule{0.99\linewidth}{0.5pt}}\n\n"

with open(f"/om2/user/ericjm/the-everything-machine/texts/tinystories/cluster{cluster_i}.tex", 'w') as f:
    f.write(text)

## Cluster 77

In [126]:
# Preview one sample of the chosen cluster: the target token together with
# at most `before` tokens of preceding context.
cluster_i = 77
cluster = clusters[cluster_i]
sample_i = 11

before, after = 37, 0

tokens, token_idx = cluster[sample_i]

# Clamp the look-back so the window never starts before the sample's first token.
tokens_before = token_idx if token_idx < before else before
# Computed for symmetry; the slice below uses `after` directly.
room_after = len(tokens) - token_idx - 1
tokens_after = room_after if room_after < after else after

window_start = token_idx - tokens_before
window_stop = token_idx + after + 1
tokens_slice = tokens[window_start:window_stop]
print("".join(tokens_slice))

Nearby, her mom was watching and called out, "Lucy, come here! What's that you have there?"

Lucy proudly held up the hoop and announced, "


In [127]:
# choose diverse samples from each cluster
i_choices = {
    0: 41,
    3: 49,
    4: 51,
    5: 67,
    11: 37
}

after = 0

text = ""

for j, (i, context_length) in enumerate(i_choices.items()):
    before = context_length
    tokens, token_idx = cluster[i]
    tokens_before = min(before, token_idx)
    tokens_after = min(after, len(tokens) - token_idx - 1)
    tokens_slice = tokens[token_idx-tokens_before:token_idx + after + 1]
    text += tokens_to_latex(tokens_slice)
    if j != len(i_choices)-1:
        text += "\n\n{\color{gray}\\rule{0.99\linewidth}{0.5pt}}\n\n"

with open(f"/om2/user/ericjm/the-everything-machine/texts/tinystories/cluster{cluster_i}.tex", 'w') as f:
    f.write(text)