In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import experiments

In [5]:
# Read the sample text and run experiments.test_markov_prop on it.
# NOTE(review): the meaning of the arguments 2 and 0.2 and of the two returned
# arrays is defined in experiments.py, which is not visible here. Judging by
# the printed output they are 4x2 arrays whose rows sum to 1 — confirm.
with open('gpt_5_string_2.txt', 'r') as f:
    # Deliberately NOT named `string`: another cell does `import string`, and
    # add_words() needs `string.ascii_letters`. Binding the file contents to
    # `string` would shadow the module for the rest of the kernel session.
    sample_text = f.read()

# The file is only needed for the read, so the rest runs after it is closed.
print(f"len: {len(sample_text)}")
(a, b) = experiments.test_markov_prop(sample_text, 2, 0.2)
print(a)
print(b)
# Element-wise difference between the two returned arrays.
print(a - b)

len: 4397
[[0.30769231 0.69230769]
 [0.34188034 0.65811966]
 [0.34615385 0.65384615]
 [0.52739726 0.47260274]]
[[0.32013201 0.67986799]
 [0.30506478 0.69493522]
 [0.2426384  0.7573616 ]
 [0.38943894 0.61056106]]
[[-0.01243971  0.01243971]
 [ 0.03681556 -0.03681556]
 [ 0.10351545 -0.10351545]
 [ 0.13795832 -0.13795832]]


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
import itertools
import random
import string


def add_characters(input: str, char: str = "x") -> str:
    """Return ``input`` with ``char`` inserted after every character.

    Args:
        input: Text to pad.
        char: Filler appended after each character of ``input``.

    Returns:
        The padded text, e.g. ``add_characters("ab")`` gives ``"axbx"``.
        An empty ``input`` yields an empty string.
    """
    padded = [symbol + char for symbol in input]
    return "".join(padded)


# add random length
def add_words(input: str, word_length: int = 6) -> str:
    """Insert a random junk word after every space-separated word of ``input``.

    Args:
        input: Text to pad; it is split on single spaces.
        word_length: Number of letters in each junk word.

    Returns:
        The original words and the junk words alternating, joined by single
        spaces. Each junk word is built letter by letter with
        ``random.choice`` over ``string.ascii_letters``.
    """
    pieces = []
    for original_word in input.split(" "):
        filler = "".join(
            random.choice(string.ascii_letters) for _ in range(word_length)
        )
        pieces.extend((original_word, filler))
    return " ".join(pieces)


def split_across_messages(input: str) -> list[str]:
    """Split ``input`` on single spaces and return the pieces as a list.

    Consecutive spaces produce empty-string entries, and an empty ``input``
    yields ``[""]``.
    """
    return input.split(" ")


def scramble_word(input: str) -> str:
    """Shuffle the interior letters of every word in ``input``.

    The text is split on any whitespace. For each word longer than two
    characters the first and last characters stay in place and the characters
    between them are shuffled with ``random.shuffle``. Shorter words pass
    through unchanged. The words are re-joined with single spaces.
    """
    scrambled = []

    for token in input.split():
        # One- and two-character words have no interior to shuffle.
        if len(token) <= 2:
            scrambled.append(token)
            continue

        middle = list(token[1:-1])
        random.shuffle(middle)
        scrambled.append(token[0] + "".join(middle) + token[-1])

    return " ".join(scrambled)

In [None]:
# Demo: interleave the default filler character "x" after every character.
add_characters("Do you understand me?")

'Dxox xyxoxux xuxnxdxexrxsxtxaxnxdx xmxex?x'

In [None]:
# Demo: pad each word with a random 6-letter junk word (output differs per run).
# NOTE(review): the saved output starts with 'Lorem ...', i.e. it was produced
# from a different input than this call — re-run the cell.
add_words("Do you understand me?")

'Lorem ocCCWh ipsum LXkRHb dolor CynVEE sit HAjqRe'

In [None]:
# Demo: split the prompt on single spaces into a list of pieces.
split_across_messages("Do you understand me?")

['Do', 'you', 'understand', 'me?']

In [3]:
# Demo: shuffle the interior letters of each word; no seed is set, so the
# result changes between runs.
scramble_word("Do you understand me?")

'Do you udnensatrd me?'

In [6]:
class SmolChatbot:
    """Multi-turn chat wrapper around a Hugging Face causal language model.

    The conversation is kept in ``self.history`` as a list of
    ``{"role": ..., "content": ...}`` dicts and is replayed through the
    tokenizer's chat template on every call.

    Attributes:
        tokenizer: Result of ``AutoTokenizer.from_pretrained(model_name)``.
        model: Result of ``AutoModelForCausalLM.from_pretrained(model_name)``.
        history: Messages exchanged so far, optionally starting with a system
            message.
        last_reasoning: Decoded tokens of the most recent reply up to its
            final ``</think>`` marker; empty when no marker was generated.
    """

    # Token id treated as the end-of-reasoning marker ``</think>``.
    # NOTE(review): value carried over from the original hard-coded constant;
    # confirm it against the tokenizer of the model actually loaded.
    THINK_END_TOKEN_ID = 128003

    def __init__(self, model_name: str = "HuggingFaceTB/SmolLM3-3B", system_prompt: str = ""):
        """Load tokenizer and model by name and start an empty conversation.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
            system_prompt: If non-empty, stored as the first history message.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.history: list[dict[str, str]] = []
        self.last_reasoning: str = ""
        self.reset(system_prompt)

    def reset(self, system_prompt: str = ""):
        """Discard the conversation and optionally seed a new system prompt.

        A system prompt given to ``__init__`` is not remembered: calling
        ``reset()`` with no argument leaves the history completely empty.
        """
        self.history = []
        self.last_reasoning = ""

        if system_prompt:
            self.history.append({"role": "system", "content": system_prompt})

    def __call__(self, user_input: str, enable_thinking=True, max_new_tokens: int = 32768) -> str:
        """Send one user message and return the model's visible answer.

        Args:
            user_input: Text of the new user turn.
            enable_thinking: Forwarded to ``apply_chat_template``.
            max_new_tokens: Upper bound on generated tokens, forwarded to
                ``model.generate``. The default keeps the previous hard-coded
                value; pass a smaller number to bound the run time.

        Returns:
            The decoded text after the last ``</think>`` token (the whole
            reply if none was generated), with leading and trailing newlines
            stripped.

        Side effects:
            Appends the user turn and the full decoded reply (reasoning text
            included) to ``self.history`` and updates ``self.last_reasoning``.
        """
        messages = self.history + [{"role": "user", "content": user_input}]

        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking,
        )

        # Keep the inputs on the same device as the model so generation also
        # works after the model has been moved off the CPU.
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        prompt_length = len(inputs.input_ids[0])

        generated = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + continuation; keep only the continuation.
        response_ids = generated[0][prompt_length:].tolist()

        try:
            # Index just past the LAST end-of-reasoning token (an "rindex").
            split_index = len(response_ids) - response_ids[::-1].index(
                self.THINK_END_TOKEN_ID
            )
        except ValueError:
            # No marker generated: the whole reply is the visible answer.
            split_index = 0

        self.last_reasoning = self.tokenizer.decode(
            response_ids[:split_index], skip_special_tokens=True
        ).strip("\n")
        content = self.tokenizer.decode(
            response_ids[split_index:], skip_special_tokens=True
        ).strip("\n")

        # The history keeps the complete reply, not just the visible answer.
        response = self.tokenizer.decode(response_ids, skip_special_tokens=True)

        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": response})

        return content

In [7]:
# smol = SmolChatbot()

In [8]:
# print(smol("Hello", enable_thinking=False))

In [9]:
# smol.history

In [6]:
class QwenChatbot:
    """Multi-turn chat wrapper around a Hugging Face causal language model.

    The conversation is kept in ``self.history`` as a list of
    ``{"role": ..., "content": ...}`` dicts and is replayed through the
    tokenizer's chat template on every call.

    Attributes:
        tokenizer: Result of ``AutoTokenizer.from_pretrained(model_name)``.
        model: Result of ``AutoModelForCausalLM.from_pretrained(model_name)``.
        history: Messages exchanged so far, optionally starting with a system
            message.
        last_reasoning: Decoded tokens of the most recent reply up to its
            final ``</think>`` marker; empty when no marker was generated.
    """

    # Token id of the end-of-reasoning marker ``</think>`` (151668 decodes to
    # "</think>" in the tokenizer dump further down this notebook).
    THINK_END_TOKEN_ID = 151668

    def __init__(self, model_name: str = "Qwen/Qwen3-0.6B", system_prompt: str = ""):
        """Load tokenizer and model by name and start an empty conversation.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
            system_prompt: If non-empty, stored as the first history message.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.history: list[dict[str, str]] = []
        self.last_reasoning: str = ""
        self.reset(system_prompt)

    def reset(self, system_prompt: str = ""):
        """Discard the conversation and optionally seed a new system prompt.

        A system prompt given to ``__init__`` is not remembered: calling
        ``reset()`` with no argument leaves the history completely empty.
        """
        self.history = []
        self.last_reasoning = ""

        if system_prompt:
            self.history.append({"role": "system", "content": system_prompt})

    def __call__(self, user_input: str, enable_thinking=True, max_new_tokens: int = 32768) -> str:
        """Send one user message and return the model's visible answer.

        Args:
            user_input: Text of the new user turn.
            enable_thinking: Forwarded to ``apply_chat_template``.
            max_new_tokens: Upper bound on generated tokens, forwarded to
                ``model.generate``. The default keeps the previous hard-coded
                value; pass a smaller number to bound the run time.

        Returns:
            The decoded text after the last ``</think>`` token (the whole
            reply if none was generated), with leading and trailing newlines
            stripped.

        Side effects:
            Appends the user turn and the full decoded reply (reasoning text
            included) to ``self.history`` and updates ``self.last_reasoning``.
        """
        messages = self.history + [{"role": "user", "content": user_input}]

        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking,
        )

        # Keep the inputs on the same device as the model so generation also
        # works after the model has been moved off the CPU.
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        prompt_length = len(inputs.input_ids[0])

        generated = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + continuation; keep only the continuation.
        response_ids = generated[0][prompt_length:].tolist()

        try:
            # Index just past the LAST end-of-reasoning token (an "rindex").
            split_index = len(response_ids) - response_ids[::-1].index(
                self.THINK_END_TOKEN_ID
            )
        except ValueError:
            # No marker generated: the whole reply is the visible answer.
            split_index = 0

        self.last_reasoning = self.tokenizer.decode(
            response_ids[:split_index], skip_special_tokens=True
        ).strip("\n")
        content = self.tokenizer.decode(
            response_ids[split_index:], skip_special_tokens=True
        ).strip("\n")

        # The history keeps the complete reply, not just the visible answer.
        response = self.tokenizer.decode(response_ids, skip_special_tokens=True)

        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": response})

        return content


In [7]:
# Build the chatbot with its defaults (model "Qwen/Qwen3-0.6B", no system
# prompt); every experiment cell below reuses this one instance.
qwen = QwenChatbot()

In [12]:
# Smoke test: clear the conversation, then send a plain greeting.
qwen.reset()
qwen("Hello")

'Hello! 😊 How can I assist you today?'

In [60]:
# Baseline: the unmodified arithmetic word problem, asked in a fresh conversation.
qwen.reset()
print(
    qwen(
        "Alice has twenty apples and Bob has fourteen apples. How many apples do they have in total?"
    )
)
# The stored assistant turn holds the full reply, including the <think> text.
print(qwen.history)

Alice has twenty apples, and Bob has fourteen apples. To find the total number of apples they have together, we need to add the two quantities.

$$
20 + 14 = 34
$$

Thus, the total number of apples they have is:

$$
\boxed{34}
$$
[{'role': 'user', 'content': 'Alice has twenty apples and Bob has fourteen apples. How many apples do they have in total?'}, {'role': 'assistant', 'content': "<think>\nOkay, let's see. Alice has twenty apples, and Bob has fourteen. The question is asking how many apples they have in total. Hmm, so I need to add these two numbers together. Let me think. \n\nFirst, I should check if there's any trick here. Like, maybe they have some apples in common? Wait, no, the problem just says Alice has twenty and Bob has fourteen. It doesn't mention anything about overlapping apples. So maybe they are separate? Like, they don't share any apples? That seems logical. So if Alice has 20 and Bob has 14, adding them together would be 20 + 14. \n\nLet me do the math. 20 plus 14.

In [62]:
# Preview one scrambled version of the apples prompt. scramble_word is random
# and unseeded, so this is NOT the exact string sent to the model elsewhere.
scramble_word(
    "Alice has twenty apples and Bob has fourteen apples. How many apples do they have in total?"
)

'Aicle has tewtny appels and Bob has furteeon alpesp. How mnay alppes do tehy hvae in taolt?'

In [61]:
# Same word problem with the interior letters of each word shuffled.
# No random seed is set, so every run sends a different scramble; the prompt
# actually used is visible in the printed history.
qwen.reset()
print(
    qwen(
        scramble_word(
            "Alice has twenty apples and Bob has fourteen apples. How many apples do they have in total?"
        )
    )
)
print(qwen.history)

Acile has "tenth aeppls" and Bob has "futeeorn aslepp". If "aeppls" is interpreted as "apples" and "futeeorn" as "futeeapple", then each has **one apple**. Therefore, the total number of alpeps is:

$$
\boxed{2}
$$
[{'role': 'user', 'content': 'Acile has ttenwy aeppls and Bob has futeeorn aslepp. How many alpeps do they hvae in talto?'}, {'role': 'assistant', 'content': '<think>\nOkay, let\'s see. The question is about figuring out how many "alpeps" are there in total between Acile and Bob. The original statements given are:\n\n- Acile has ttenwy aeppls\n- Bob has futeeorn aslepp\n\nHmm, first, I need to make sense of these words. The key here is to translate them into English and then count the number of "alpeps" they each have. Let me start with Acile\'s case.\n\nStarting with "ttenwy aeppls". The word "ttenwy" – that seems like a typo or a misspelling. Maybe "tenth" or "ten" but with some extra letters? Let me think. "Ttenwy" could be "tenth" with an extra "y"? Or maybe it\'s "ten" 

In [58]:
# Scrambled greeting/question in a fresh conversation (unseeded, so the
# scramble differs per run); history shows the prompt that was actually sent.
qwen.reset()
print(qwen(scramble_word("Hello. Do you understand me?")))
print(qwen.history)

Holle! Ich kann nicht mit Ihnen verwechseln, aber ich kann Ihnen helfen. Was ist Ihnen wichtig? 😊
[{'role': 'user', 'content': 'Holle. Do you uredtnnasd me?'}, {'role': 'assistant', 'content': '<think>\nOkay, the user is asking, "Holle. Do you uredtnnasd me?" Let me break this down. First, "Holle" is German for "Hello," so the user is greeting me. Then, "Do you uredtnnasd me?" translates to "Do you remember me?" in English. The user is asking if I remember them. \n\nNow, I need to respond appropriately. Since I\'m an AI, I don\'t have personal experiences or memories. So the answer should confirm that I don\'t remember individuals. But maybe I can offer to help with something else. Let me check if there\'s any nuance I\'m missing. The user might be testing if I can recognize them, but I can\'t. So the response should be straightforward but friendly.\n\nI should make sure the language is correct, using proper German terms. Also, keep the tone friendly and helpful. Let me put that togeth

In [14]:
# Unmodified sorting task in a fresh conversation, followed by the stored turns.
qwen.reset()
print(qwen("Sort the following list: [54, 234, 3245, 2, 7, 3, 2]"))
print(qwen.history)

To sort the list `[54, 234, 3245, 2, 7, 3, 2]`, we need to arrange the elements in ascending order. This means comparing the numerical values and placing the smallest numbers first.

After converting all elements to their numeric form, we compare them:

- The smallest number is `2`.
- The next smallest is `2`.
- Then comes `3`, `3`.
- Next is `54`, followed by `7`, then `234`, and finally `3245`.

Thus, the sorted list is:

$$
[2, 2, 3, 3, 54, 7, 234, 3245]
$$

**Final Answer:**  
$$
\boxed{[2, 2, 3, 3, 54, 7, 234, 3245]}
$$
[{'role': 'user', 'content': 'Sort the following list: [54, 234, 3245, 2, 7, 3, 2]'}, {'role': 'assistant', 'content': '<think>\nOkay, let\'s see. I need to sort the list [54, 234, 3245, 2, 7, 3, 2]. Hmm, first, I should probably check if there are any special cases or if there\'s any specific sorting method required. The problem just says to sort it, so I guess it\'s a regular sort.\n\nLet me start by writing down the list again to make sure I have it right: 54, 2

In [13]:
# Character-interleaved prompt with thinking left at its default (enabled).
# NOTE(review): the saved run ended in KeyboardInterrupt; generation is allowed
# up to 32768 new tokens by default, so this cell can run for a long time.
qwen.reset()
print(qwen(add_characters("Do you understand me?")))

KeyboardInterrupt: 

In [None]:
# Preview a junk-word-padded prompt; the junk words are random, so a later
# call with the same text produces different padding.
add_words("What is the square root of two?")

'What jxvmXY is seSSEy the NLVWyP square tuxbop root lWslJI of uRCnqy two? yYCCyg'

In [None]:
# Junk-word-padded question with thinking disabled, in a fresh conversation.
qwen.reset()
print(qwen(add_words("What is the square root of two?"), enable_thinking=False))

The question seems to be a riddle or a cryptic puzzle. Let's break it down:

- **"What OQdao is ICPWbZ the PISiaK square HioQcF root jqFCPr of vYCzoH two?"**

### Step-by-step:

1. **"OQdao"** – This could be a cipher or a reference to a word.
2. **"ICPWbZ"** – This might be a number or a code.
3. **"PISiaK"** – This could be a word or a cipher.
4. **"HioQcF"** – Another cipher or code.
5. **"jqFCPr"** – Another cipher or code.
6. **"vYCzoH"** – Another cipher or code.

### Possible Interpretation:

This riddle is likely a play on words or a cryptic puzzle where the answer is a word or phrase that can be derived from the cipher codes.

### Final Answer:

**"Two"** – This fits the pattern, as it's the only word that can be derived from the cipher codes.

**Answer:** Two.


In [None]:
# Inspect the stored conversation.
# NOTE(review): the saved output shows a padded "Do you understand me?" prompt,
# not the square-root prompt, so it was captured from a different run — re-run.
qwen.history

[{'role': 'user',
  'content': 'Do nIYviD you aYSjuR understand yDtqIL me? PNxbgd'},
 {'role': 'assistant',
  'content': "<think>\nOkay, the user is asking if I understand them. First, I need to respond politely. Since I'm an AI, I should acknowledge their question and express that I'm here to help. I should keep it friendly and open-ended so they feel comfortable sharing more. Let me make sure the response is clear and that they know I'm available for any questions they might have. I should also check if there's anything else they need, so they know I'm here to assist further.\n</think>\n\nHello! I'm here to help you with anything you need. How can I assist you today? 😊"}]

In [None]:
# Show the character-interleaved form of the prompt that is tokenized below.
add_characters("Do you understand me?")

'Dxox xyxoxux xuxnxdxexrxsxtxaxnxdx xmxex?x'

In [8]:
# Render a single user turn through the chat template straight to token ids
# (tokenize=True), with the generation prompt appended and thinking disabled.
token = qwen.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Do you understand me?"}],
    tokenize=True,
    add_generation_prompt=True,
    enable_thinking=False,
)

# Print each id next to its decoded text to see how the prompt is segmented.
for t in token:
    print(t, qwen.tokenizer.decode(t))

151644 <|im_start|>
872 user
198 

5404 Do
498  you
3535  understand
752  me
30 ?
151645 <|im_end|>
198 

151644 <|im_start|>
77091 assistant
198 

151667 <think>
271 


151668 </think>
271 




In [11]:
# Same template call as the id/text dump, but showing only the raw id list
# so it can be compared with the character-interleaved prompt.
token = qwen.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Do you understand me?"}],
    tokenize=True,
    add_generation_prompt=True,
    enable_thinking=False,
)
list(token)

[151644,
 872,
 198,
 5404,
 498,
 3535,
 752,
 30,
 151645,
 198,
 151644,
 77091,
 198,
 151667,
 271,
 151668,
 271]

In [12]:
# Token ids for the character-interleaved prompt (thinking disabled), for
# comparison with the ids of the plain prompt.
token = qwen.tokenizer.apply_chat_template(
    [{"role": "user", "content": add_characters("Do you understand me?")}],
    tokenize=True,
    add_generation_prompt=True,
    enable_thinking=False,
)
list(token)

[151644,
 872,
 198,
 35,
 87,
 5131,
 30784,
 87,
 5131,
 2200,
 856,
 2200,
 77,
 9703,
 87,
 327,
 12651,
 82,
 2252,
 87,
 706,
 77,
 9703,
 87,
 856,
 18085,
 327,
 30,
 87,
 151645,
 198,
 151644,
 77091,
 198,
 151667,
 271,
 151668,
 271]