In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb


In [2]:
model = AutoModelForCausalLM.from_pretrained("apple/OpenELM-270M", torch_dtype=torch.float32, trust_remote_code=True)
pretrained_config = AutoConfig.from_pretrained("apple/OpenELM-270M", trust_remote_code=True)




In [14]:
pretrained_config

OpenELMConfig {
  "_name_or_path": "apple/OpenELM-270M",
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "apple/OpenELM-270M--configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "apple/OpenELM-270M--modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.73,
    0.97,
    1.2,
    1.43,
    1.67,
    1.9,
    2.13,
    2.37,
    2.6,
    2.83,
    3.07,
    3.3,
    3.53,
    3.77,
    4.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 2048,
  "model_dim": 1280,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 4,
  "num_kv_heads": [
    3,
    3,
    3,
    3,
    3,
    4,
    4,
    4,
    4,
    4,
    4,
    4,
    5,
    5,
    5,
    5
  ],
  "num_query_heads": [
    12,
    12,
 

In [3]:
modules = dict(model.named_modules())
dir(modules["transformer.layers"][0])


['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_is_hf_initialized',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_no

In [7]:
list(dict(model.named_modules()).keys())

['',
 'transformer',
 'transformer.token_embeddings',
 'transformer.layers',
 'transformer.layers.0',
 'transformer.layers.0.attn',
 'transformer.layers.0.attn.qkv_proj',
 'transformer.layers.0.attn.pos_embedding',
 'transformer.layers.0.attn.q_norm',
 'transformer.layers.0.attn.k_norm',
 'transformer.layers.0.attn.out_proj',
 'transformer.layers.0.ffn',
 'transformer.layers.0.ffn.proj_1',
 'transformer.layers.0.ffn.proj_2',
 'transformer.layers.0.ffn.act',
 'transformer.layers.0.ffn_norm',
 'transformer.layers.0.attn_norm',
 'transformer.layers.1',
 'transformer.layers.1.attn',
 'transformer.layers.1.attn.qkv_proj',
 'transformer.layers.1.attn.pos_embedding',
 'transformer.layers.1.attn.q_norm',
 'transformer.layers.1.attn.k_norm',
 'transformer.layers.1.attn.out_proj',
 'transformer.layers.1.ffn',
 'transformer.layers.1.ffn.proj_1',
 'transformer.layers.1.ffn.proj_2',
 'transformer.layers.1.ffn.act',
 'transformer.layers.1.ffn_norm',
 'transformer.layers.1.attn_norm',
 'transformer.l

In [15]:
type(model)

transformers_modules.apple.OpenELM-270M.945fb18d02b1c4c81d7989e80b7564c6d91e8300.modeling_openelm.OpenELMForCausalLM

In [18]:
modules = dict(model.named_modules())
tok_emb = modules["transformer.token_embeddings"]
tok_emb.num_embeddings, tok_emb.embedding_dim

(32000, 1280)

In [20]:
model.get_input_embeddings().embedding_dim

1280

In [2]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct", torch_dtype=torch.float32, trust_remote_code=True)
pretrained_config = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B-Instruct", trust_remote_code=True)



In [3]:
pretrained_config

Qwen2Config {
  "_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

In [29]:
model.get_input_embeddings().weight.shape

torch.Size([151936, 1024])

In [30]:
modules["model.embed_tokens"].weight.shape

torch.Size([151936, 1024])

In [6]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_sample',
 '_beam_search',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_for

In [3]:
model.modules

<bound method Module.modules of Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen

In [3]:
modules = dict(model.named_modules())
i = 0
q_proj = modules[f"model.layers.{i}.self_attn.q_proj"]
k_proj = modules[f"model.layers.{i}.self_attn.k_proj"]
v_proj = modules[f"model.layers.{i}.self_attn.v_proj"]
o_proj = modules[f"model.layers.{i}.self_attn.o_proj"]
layers = modules[f"model.layers"]
type(layers)

torch.nn.modules.container.ModuleList

In [16]:
v = torch.full((2, 3, 5, layers[0].self_attn.head_dim), 1.2, dtype=torch.float32)
cos, sin = layers[0].self_attn.rotary_emb(v, seq_len=7)
cos.shape, sin.shape, layers[0].self_attn.head_dim

(torch.Size([7, 64]), torch.Size([7, 64]), 64)

In [26]:
B = 2
L = 6
D = 4
a = torch.arange(0, L, dtype=torch.int64)
b = torch.arange(0, D, dtype=torch.int64)
# position_ids = a.reshape(1, L) + b.reshape(D, 1)
position_ids = a.unsqueeze(0).tile((B, 1))
print(position_ids)
cos[position_ids].shape

tensor([[0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 4, 5]])


torch.Size([2, 6, 64])

In [25]:
L = 6
position_ids = torch.arange(
    0, L, dtype=torch.long
)
position_ids = position_ids.unsqueeze(0).view(-1, L)
position_ids.shape

torch.Size([1, 6])

In [19]:
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

In [28]:
H = 3
q = torch.full((B, H, L, layers[0].self_attn.head_dim), 1.2, dtype=torch.float32)
q_embed = (q * cos[position_ids].unsqueeze(1)) + (rotate_half(q) * sin[position_ids].unsqueeze(1))
q_embed.shape

torch.Size([2, 3, 6, 64])

In [8]:
modules = {name: module for name, module in model.named_modules()}
modules.keys()

dict_keys(['', 'model', 'model.embed_tokens', 'model.layers', 'model.layers.0', 'model.layers.0.self_attn', 'model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.self_attn.rotary_emb', 'model.layers.0.mlp', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.mlp.down_proj', 'model.layers.0.mlp.act_fn', 'model.layers.0.input_layernorm', 'model.layers.0.post_attention_layernorm', 'model.layers.1', 'model.layers.1.self_attn', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.self_attn.rotary_emb', 'model.layers.1.mlp', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.mlp.down_proj', 'model.layers.1.mlp.act_fn', 'model.layers.1.input_layernorm', 'model.layers.1.post_attention_layernorm', 'model.layers.2', 'model.layers.2.self_attn'

In [31]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", trust_remote_code=True)

inputs = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)
inputs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': tensor([[  750,  1173, 38217,  1445,   982,   256,  3190,   256,  8213,   678,
         49433,  1948,   220,    16,   323,   308,   198,   256,  4210]])}

In [34]:
tokenizer.vocab_size

151643

In [10]:
hidden = modules["model"](inputs["input_ids"]).last_hidden_state
print(hidden.shape)
hidden

torch.Size([1, 19, 1024])


tensor([[[  3.6836,  -1.1255,   1.5150,  ...,   2.6303,  -0.5123,   1.7374],
         [  7.4853,  -0.9902,  -4.0687,  ...,   1.7265,   0.2599,  -1.1351],
         [ 24.2062,  -7.2295,   4.1781,  ...,   9.3780,  -1.2049,  -6.5320],
         ...,
         [ -5.0436,   1.9755,  -3.9743,  ...,   1.4521,   1.5901,   6.1949],
         [ -0.9378,   4.4864,  -1.1898,  ...,   4.1787,  -2.2388,  -4.0903],
         [-10.6983,  -3.1012,  -4.7337,  ...,   3.5523,  -4.2465,   0.8423]]],
       grad_fn=<MulBackward0>)

In [11]:
output = modules["lm_head"](hidden)
print(output.shape)
output

torch.Size([1, 19, 151936])


tensor([[[-2.4811, -6.1711, -6.3514,  ..., -8.3448, -8.3446, -8.3446],
         [-2.9108,  1.6906, -3.2120,  ..., -6.1205, -6.1200, -6.1203],
         [ 3.0356,  3.6571,  2.9795,  ..., -4.3870, -4.3862, -4.3864],
         ...,
         [ 9.9981, 10.2735, 11.2037,  ..., -3.3791, -3.3778, -3.3788],
         [-0.9505, -2.4659, -2.6616,  ..., -4.5804, -4.5799, -4.5798],
         [ 8.5299,  7.0447, 13.8696,  ..., -1.6405, -1.6392, -1.6405]]],
       grad_fn=<UnsafeViewBackward0>)

In [None]:
def generate_thoughts(token_ids: torch.Tensor, thought_tokens: int) -> torch.Tensor:
    """
    In parallel across `n` tokens, generate `thought_tokens` tokens resulting in
    `n x (3 + thought_tokens)` total tokens including the <|startofthought|> and
    <|endofthought|> tokens.
    """
    for t in range(thought_tokens):
        # Naive attention computation:
        # evaluate the model on the unraveled sequence of tokens
        # position embeddings: [p_0 ... p_l p_1 ... p_{l + 1} p_2 ... p_{l + 2}]
        # query, key, value: (b, (t + 1) x l, e)
        # special attention mask
        # only need last l predictions

        # Optimized attention computation:
        # position embeddings: [p_0 ... p_l p_1 ... p_{l + 1} p_2 ... p_{l + 2}]
        # new attention weights: original seq (b, l, e), t^th thought tokens (b, l, e)
        # new diagonal attention weigths:
        #   thought tokens (b, t + 1, l, e), t^th thought tokens (b, 1, l, e) => (b, t + 1, l)
        # form full attention weight tensor (b, t + 2, l, l)
        # softmax of weights
        # linear combination of values (b, t + 2, l, e)
        # output (b, l, e)

        # custom attention weight calculation
        # traditional multi-head attention calculation where K != V using custom attn weights
        # attention(sparse_attn_weights, values)
        # can still use flash attention trick to tile matrices and calculate in SRAM





In [1]:
import dask.dataframe as dd

In [19]:
df = dd.read_json("hf://datasets/allenai/c4/en/c4-train.00000-of-01024.json.gz")
len(df)

: 

In [2]:
df = dd.read_parquet("hf://datasets/open-web-math/open-web-math/data/train-00000-of-00114-5a023365406cb9c4.parquet")
len(df)

55397

In [15]:
row = df.loc[0]["text"].compute()
type(row)

pandas.core.series.Series

In [17]:
print(row[0])

Bayes and his Theorem

My earlier post on Bayesian probability seems to have generated quite a lot of readers, so this lunchtime I thought I’d add a little bit of background. The previous discussion started from the result

$P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$

where

$K=P(A|C).$

Although this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace. What Bayes’ did was derive the special case of this formula for “inverting” the binomial distribution. This distribution gives the probability of x successes in n independent “trials” each having the same probability of success, p; each “trial” has only two possible outcomes (“success” or “failure”). Trials like this are usually called Bernoulli trials, after Daniel Bernoulli. If we ask the question “what is the probability of exactly x successes from the possible n?”, the answer is given by the binomial distribution:

$P_n(x|n,p)= C(n,x) p^x (1-p)^{n-x}$

whe

In [22]:
from datasets import load_dataset

train_dataset = load_dataset(
    "open-web-math/open-web-math",
    data_files={"train": ["data/train-00000-of-00114-5a023365406cb9c4.parquet"],},
    num_proc=8,
)

In [88]:
import random


def random_split_on_whitespace(text: str, min_remaining_whitespace: int = 256) -> str:
    """
    Splits a string on a random whitespace character and returns the second part.

    Args:
        text: The string to split.
        min_remaining_whitespace: The minimum number of whitespace characters to leave in string.

    Returns:
        The second part of the string after a random whitespace split, 
        or the original string if no whitespace is found.
        Leading whitespace is stripped.
    """
    whitespace_indexes = [i for i, char in enumerate(text) if char.isspace()][:-min_remaining_whitespace]

    if not whitespace_indexes:
        return text  # Not enough whitespace found, return original text

    random_index = random.choice(whitespace_indexes)

    return text[random_index + 1:].lstrip()


def process_batch(examples: dict[str, list]) -> dict[str, list]:
    result = {
        "input_ids": [
            tokenizer(random_split_on_whitespace(text, 256), max_length=256, truncation=True)["input_ids"]
            for text in examples["text"]
        ],
    }
    result["text"] = tokenizer.batch_decode(result["input_ids"])
    return result


In [54]:
np_dataset.save_to_disk("data/open-web-math")

Saving the dataset (0/2 shards):   0%|          | 0/55397 [00:00<?, ? examples/s]

In [89]:
small_dataset = train_dataset["train"].select(range(100)).map(
    process_batch, batched=True, remove_columns=["url", "date", "metadata"],
)
small_dataset.set_format("np", columns=["input_ids"], output_all_columns=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [67]:
import datasets

np_dataset = datasets.load_from_disk("data/open-web-math")
type(np_dataset["train"][0]["input_ids"])

list

In [90]:
small_dataset[0]

{'input_ids': array([12555,   374,   279, 18927,  7982,   315,   281,    30, 11204,
         1096,   374,   264, 11416,  3110,   315, 27949, 32711,    13,
         1260,  2684,   279,  4396,  4226,    11,  9583,    11,   714,
          553,  1602,  5686,   337,  2774, 32711,    13,   758,   847,
         9459,   432,   374,  5008,  5000,   311,  9357,   279,   829,
         9154,   288,   527, 57817,  3118,   389,  1128,   566,  3520,
         1521,    11,  7892, 49055, 26536,  1521, 11689, 24645,   419,
        18527,   979,   566, 14257,   279,  4586,  1102,  2937,    11,
          892,   374,   902, 10492,  3170,   279, 57817,   374,  2677,
         6941,   304,  9154,   288,   527, 33562,   382,  1986,   374,
          537,   279,  1172,  3110,   304,  8038,  1380,   279,  4969,
         1697,   748,   829,   374, 12392,   311,   264,  1102,   476,
        18335,    13,   758,  2097,    11,   432,   374,  4558,   264,
         2329,   315, 21331,   429,   894, 57817,   429,   702, 

In [68]:
s = """Bayes and his Theorem

My earlier post on Bayesian probability seems to have generated quite a lot of readers, so this lunchtime I thought I’d add a little bit of background. The previous discussion started from the result

$P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$

where

$K=P(A|C).$

Although this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace. What Bayes’ did was derive the special case of this formula for “inverting” the binomial distribution. This distribution gives the probability of x successes in n independent “trials” each having the same probability of success, p; each “trial” has only two possible outcomes (“success” or “failure”). Trials like this are usually called Bernoulli trials, after Daniel Bernoulli. If we ask the question “what is the probability of exactly x successes from the possible n?”, the answer is given by the binomial distribution:

$P_n(x|n,p)= C(n,x) p^x (1-p)^{n-x}$

where

$C(n,x)= n!/x!(n-x)!$

is the number of distinct combinations of x objects that can be drawn from a pool of n.

You can probably see immediately how this arises. The probability of x consecutive successes is p multiplied by itself x times, or px. The probability of (n-x) successive failures is similarly (1-p)n-x. The last two terms basically therefore tell us the probability that we have exactly x successes (since there must be n-x failures). The combinatorial factor in front takes account of the fact that the ordering of successes and failures doesn’t matter.

The binomial distribution applies, for example, to repeated tosses of a coin, in which case p is taken to be 0.5 for a fair coin. A biased coin might have a different value of p, but as long as the tosses are independent the formula still applies. The binomial distribution also applies to problems involving drawing balls from urns: it works exactly if the balls are replaced in the urn after each draw, but it also applies approximately without replacement, as long as the number of draws is much smaller than the number of balls in the urn. I leave it as an exercise to calculate the expectation value of the binomial distribution, but the result is not surprising: E(X)=np. If you toss a fair coin ten times the expectation value for the number of heads is 10 times 0.5, which is five. No surprise there. After another bit of maths, the variance of the distribution can also be found. It is np(1-p).

So this gives us the probability of x given a fixed value of p. Bayes was interested in the inverse of this result, the probability of p given x. In other words, Bayes was interested in the answer to the question “If I perform n independent trials and get x successes, what is the probability distribution of p?”. This is a classic example of inverse reasoning. He got the correct answer, eventually, but by very convoluted reasoning. In my opinion it is quite difficult to justify the name Bayes’ theorem based on what he actually did, although Laplace did specifically acknowledge this contribution when he derived the general result later, which is no doubt why the theorem is always named in Bayes’ honour.
...

9. […] I posted a little piece about Bayesian probability. That one and the others that followed it (here and here) proved to be surprisingly popular so I’ve been planning to add a few more posts […]

10. It already has a popular name: Stigler’s law of eponymy.
"""
tokenizer(random_split_on_whitespace(s), return_tensors="np", max_length=256, truncation=True)

{'input_ids': array([[ 1055,   419, 14806,   369,  1036,   258, 49417,   854,   279,
         9544, 20855,  7982,    13,  1096,  7982,  6696,   279, 18927,
          315,   856, 47088,   304,   308,  9489,  1036,   376, 10309,
          854,  1817,  3432,   279,  1852, 18927,   315,  2393,    11,
          281,    26,  1817,  1036, 47347,   854,   702,  1172,  1378,
         3204, 19554, 26087,  5630,   854,   476,  1036, 28939, 64212,
        69444,  1075,   419,   525,  5990,  2598, 14168,   283, 39976,
        19080,    11,  1283, 15118, 14168,   283, 39976,    13,  1416,
          582,  2548,   279,  3405,  1036, 12555,   374,   279, 18927,
          315,  6896,   856, 47088,   504,   279,  3204,   308,    30,
         9336,   279,  4226,   374,  2661,   553,   279,  9544, 20855,
         7982,  1447,     3,    47,  1089,  2075,    91,    77,  7237,
        11730,   356,  1445, 12803,     8,   281,    61,    87,   320,
           16,  2268, 29776,    90,    77,  6558, 31716,   271,

In [58]:
em_dash_ids = tokenizer("---", return_tensors="pt", return_attention_mask=False)["input_ids"][0]
em_dash_ids

tensor([4421])

In [37]:
special_tokens_dict = {
    "additional_special_tokens": [
        "<|startofthought|>",
        "<|endofthought|>",
    ],
}

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print("We have added", num_added_toks, "tokens")

# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))

We have added 2 tokens


<bound method Qwen2ForCausalLM.get_input_embeddings of Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151648, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
   

In [60]:
embeddings = model.get_input_embeddings()
init_token_embedding = embeddings.weight[em_dash_ids].detach()
init_token_embedding.mean(dim=0)

tensor([ 0.0171, -0.0122,  0.0055,  ...,  0.0212,  0.0002, -0.0220])

In [1]:
model.layers

NameError: name 'model' is not defined

In [10]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen1.5-0.5B")
config.tie_word_embeddings

True

In [14]:
print(config.to_json_string())

{
  "_name_or_path": "Qwen/Qwen1.5-0.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 2816,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [51]:
with torch.no_grad():
    embeddings.weight[-2] = init_token_embedding
    embeddings.weight[-1] = init_token_embedding
    model.set_input_embeddings(embeddings)
    model.tie_weights()

In [64]:
len(np_dataset["train"][0]["input_ids"])

256

In [1]:
from typing import Callable

import dataclasses
import pathlib
import random

import lightning as L
import lightning.pytorch
import torch
import torch.utils.data

from datasets import load_dataset
from huggingface_hub import HfFileSystem
from transformers import AutoConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
@dataclasses.dataclass
class QuietStarConfig:
    batch_size: int = 1
    dtype: torch.dtype = torch.float32
    learning_rate: float = 1e-3
    max_length: int = 256
    model_name: str = "Qwen/Qwen1.5-0.5B"
    seed: int = 123


class QuietStarModel(L.LightningModule):
    def __init__(self, config: QuietStarConfig):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(config.model_name, torch_dtype=config.dtype, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

        self.tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<|startofthought|>",
                "<|endofthought|>",
            ],
        })

        # resize_token_embeddings expects to receive the full size of the new vocabulary
        self.model.resize_token_embeddings(len(self.tokenizer))

        # initialize the new tokens to use the average of the embeddings of the tokenized string "---"
        em_dash_ids = self.tokenizer("---", return_tensors="pt", return_attention_mask=False)["input_ids"][0]
        em_dash_ids

        embeddings = self.model.get_input_embeddings()
        init_token_embedding = embeddings.weight[em_dash_ids].detach()
        init_token_embedding = init_token_embedding.mean(dim=0)

        model_config = AutoConfig.from_pretrained("Qwen/Qwen1.5-0.5B")
        with torch.no_grad():
            embeddings.weight[-2] = init_token_embedding
            embeddings.weight[-1] = init_token_embedding
            self.model.set_input_embeddings(embeddings)
            if model_config.tie_word_embeddings:
                self.model.tie_weights()

        self.learning_rate = config.learning_rate

    def forward(self, input_ids):
        output = self.model(input_ids[:, :-1], labels=input_ids[:, 1:])
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        loss, logits = self(input_ids)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [3]:
def random_split_on_whitespace(text: str, min_remaining_whitespace: int = 256) -> str:
    """
    Splits a string on a random whitespace character and returns the second part.

    Args:
        text: The string to split.
        min_remaining_whitespace: The minimum number of whitespace characters to leave in string.

    Returns:
        The second part of the string after a random whitespace split, 
        or the original string if no whitespace is found.
        Leading whitespace is stripped.
    """
    whitespace_indexes = [i for i, char in enumerate(text) if char.isspace()][:-min_remaining_whitespace]

    if not whitespace_indexes:
        return text  # Not enough whitespace found, return original text

    random_index = random.choice(whitespace_indexes)

    return text[random_index + 1:].lstrip()


def process_batch(tokenizer: AutoTokenizer, max_length: int = 256) -> Callable[[dict[str, list]], dict[str, list]]:
    def _process(examples: dict[str, list]) -> dict[str, list]:
        result = {
            "input_ids": [
                tokenizer(random_split_on_whitespace(text, max_length + 1), max_length=max_length + 1, truncation=True)["input_ids"]
                for text in examples["text"]
            ],
        }
        result["text"] = tokenizer.batch_decode(result["input_ids"])
        return result
    
    return _process

config = QuietStarConfig(
    batch_size=1,
    learning_rate=1e-4,
    max_length=8,
    seed=123,
)

lightning.pytorch.seed_everything(config.seed, workers=True)

model = QuietStarModel(config)

Seed set to 123
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
fs = HfFileSystem()

# List all ".parquet" files in the repo
paths = fs.glob("datasets/open-web-math/open-web-math/**/*.parquet")
relative_paths = [
    str(pathlib.Path(path).relative_to(*pathlib.Path(path).parts[:3]))
    for path in paths
]

large_dataset = load_dataset(
    "open-web-math/open-web-math",
    data_files={"train": relative_paths[:1]},
    num_proc=8,
)

small_dataset = large_dataset["train"].select(range(256)).map(
    process_batch(model.tokenizer, config.max_length),
    batched=True,
    remove_columns=["url", "date", "metadata"],
)
small_dataset = small_dataset.remove_columns("text")
small_dataset.set_format("pt", columns=["input_ids"], output_all_columns=True)
small_dataset.save_to_disk("data/open-web-math")

del large_dataset

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

In [5]:
ds = small_dataset.train_test_split(test_size=0.125, shuffle=False)
train_dataloader = torch.utils.data.DataLoader(ds["train"], batch_size=config.batch_size)
test_dataloader = torch.utils.data.DataLoader(ds["test"], batch_size=config.batch_size)

trainer = lightning.pytorch.Trainer(deterministic=True, accelerator="cpu")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
trainer.fit(model, train_dataloader, test_dataloader)

/home/user/miniconda3/envs/directml/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
/home/user/miniconda3/envs/directml/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:72: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.

  | Name  | Type             | Params
-------------------------------------------
0 | model | Qwen2ForCausalLM | 463 M 
-------------------------------------------
463 M     Trainable params
0         Non-trainable params
463 M     Total params
1,854.771 Total estimated model params size (MB)
/home/user/miniconda3/envs/directml/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in th

Training: |          | 0/? [00:00<?, ?it/s]

: 

In [12]:
input_ids, = next(iter(train_dataloader))

In [13]:
input_ids

'input_ids'