In [1]:
import torch
import pandas as pd
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
from transformers import Qwen2Config, Qwen2TokenizerFast, Qwen2ForCausalLM

# Load data

In [2]:
# Load the dataset in streaming mode
stories_stream = load_dataset('roneneldan/TinyStories', streaming=True, trust_remote_code=True)
stories_stream

IterableDatasetDict({
    train: IterableDataset({
        features: ['text'],
        num_shards: 4
    })
    validation: IterableDataset({
        features: ['text'],
        num_shards: 1
    })
})

In [3]:
n_rows = 1100

# Get the first 100 rows
rows = list(stories_stream['train'].take(n_rows))

# Count the total number of characters
total_chars = sum(len(row["text"]) for row in rows)
total_chars

1014715

In [4]:
stories = Dataset.from_list(rows)

print(stories)

Dataset({
    features: ['text'],
    num_rows: 1100
})


In [5]:
batch_size = 100

def batch_iterator():
    for i in range(0, len(stories), batch_size):
        yield stories[i : i + batch_size]["text"]

# Train Tokenizer

In [6]:
base_tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2.5-0.5B", errors="ignore")

In [7]:
base_tokenizer.decode([51461], errors="replace")

' �'

In [8]:
vocab_size = 1024
tokenizer = base_tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=vocab_size
)






# Test untrained model

## Initialize model

In [9]:
hidden_size = 64

config = Qwen2Config(
    num_hidden_layers=3,
    hidden_size=hidden_size,
    intermediate_size=hidden_size * 4,  # MLP hidden dim, following GPT-2 approach x4
    num_attention_heads=8,
    num_key_value_heads=2, # if equal to the num_attention heads, the MHA if 1 then MQA, else GQA
    vocab_size=vocab_size,
    max_position_embeddings=512,  # Maximum sequence length
    attention_probs_dropout_prob=0.1,
)

config

Qwen2Config {
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "silu",
  "hidden_size": 64,
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "max_position_embeddings": 512,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 8,
  "num_hidden_layers": 3,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 1024
}

In [10]:
model = Qwen2ForCausalLM(config)

In [11]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
total_size_bytes = total_params * 4

# Convert to megabytes
total_size_mb = total_size_bytes / (1024 * 1024)

print(f"Total size of the model: {total_size_mb:.2f} MB")

Total number of parameters: 309,984
Total size of the model: 1.18 MB


## Generate text from the model

In [12]:
def decode_tokens_to_dataframe(tokenizer, inputs):
    """Decodes token IDs to tokens and returns them as a transposed DataFrame."""
    decoded_tokens = [tokenizer.decode(token_id) for token_id in inputs[0]]
    token_ids = inputs[0].tolist()

    # Create and return a transposed DataFrame
    df = pd.DataFrame({
        "Token": decoded_tokens,
        "Token ID": token_ids
    })

    return df.T

In [13]:
text = "One day a little girl, wakanda"
inputs = tokenizer(text, return_tensors="pt")

In [14]:
decode_tokens_to_dataframe(tokenizer, inputs["input_ids"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Token,One,day,a,little,girl,",",wa,k,and,a
Token ID,446,371,272,406,451,25,283,88,711,78


In [15]:
outputs = model.generate(
    inputs['input_ids'],
    max_length=16
)

In [16]:
decode_tokens_to_dataframe(tokenizer, outputs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Token,One,day,a,little,girl,",",wa,k,and,a,G,�,li,b,�,out
Token ID,446,371,272,406,451,25,283,88,711,78,52,161,383,79,127,466


In [17]:
token_to_id = tokenizer.get_vocab()
id_to_token = {v: k for k, v in token_to_id.items()}

> From [Qwen/tokenization_note.md](https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md): The regular tokens are BPE tokens learned from byte sequences of texts encoded using the UTF-8 encoding. While this allows tokenization of all texts and no unknown token exists, it may fall back to using single bytes when tokenizing uncommon texts. You may encounter UTF-8 decoding errors and as the errors are default to replace, thus the replacement character (�) in incomplete generation.

In [18]:
print(id_to_token[189])
print(tokenizer.decode(189))
print(tokenizer.convert_ids_to_tokens(189))

ó
�
ó


In [103]:
print(id_to_token[271])
print(tokenizer.decode(271))
print(tokenizer.convert_ids_to_tokens(271))

Ġt
 t
Ġt


# Train language model

## Create Lightning Data Module

In this case, we are creating a simple Lightning module when we have already loaded the Dataset. We would normally want to use this in a more complex manner, like with an iterableDataset from a series of Parquet files

In [21]:
class DataModule(pl.LightningDataModule):

    def __init__(
            self,
            dataset_name: str,
            n_train_rows: int,
            n_val_rows: int,
            batch_size: int,
            max_seq_length: int,
            num_workers: int,
            tokenizer: Qwen2TokenizerFast,
            random_seed: int = 42,
    ):
        """
        :param dataset_name: Name of the dataset.
        :param n_train_rows: Number of training rows.
        :param n_val_rows: Number of validation rows.
        :param batch_size: Batch size.
        :param max_seq_length: Max sequence length.
        :param num_workers: Number of workers.
        :param random_seed: Random seed.
        """
        super().__init__()
        self.dataset_name = dataset_name
        self.n_train_rows = n_train_rows
        self.n_val_rows = n_val_rows
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.num_workers = num_workers
        self.tokenizer = tokenizer
        self.random_seed = random_seed

    def setup(self, stage: str):
        # Load dataset in streaming mode
        ds = load_dataset(
            self.dataset_name,
            streaming=True,
            trust_remote_code=True
        )

        # Create dataset
        self.train_ds = self._create_dataset(
            ds=ds,
            split="train",
            n_rows=self.n_train_rows,
        )
        self.val_ds = self._create_dataset(
            ds=ds,
            split="validation",
            n_rows=self.n_val_rows,
        )

        # Tokenizer
        # TODO: In reality, we would the tokenizer here
        self.tokenizer = tokenizer


    def train_dataloader(self):
        return DataLoader(
            dataset=self.train_ds,
            batch_size=self.batch_size,
            collate_fn=self._collate_batch,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.val_ds,
            batch_size=self.batch_size,
            collate_fn=self._collate_batch,
            num_workers=self.num_workers,
        )


    def _create_dataset(self, ds, split, n_rows):
        # Download and load N rows
        rows = list(stories_stream[split].take(n_rows))
        return Dataset.from_list(rows)

    def _collate_batch(self, batch):

        batch_text = [item["text"] for item in batch]

        # Tokenize texts and stack the resulting tensors
        batch_tokenized = self.tokenizer(
            batch_text,
            truncation=True,
            padding="longest",
            max_length=self.max_seq_length,
            return_tensors="pt",
        )

        return batch_tokenized

### Test the `DataModule`

In [22]:
data_module = DataModule(
    dataset_name='roneneldan/TinyStories',  # The dataset name
    n_train_rows=10,                       # For testing, load only 10 rows
    n_val_rows=10,                         # For testing, load only 10 validation rows
    batch_size=2,                          # Smaller batch size for testing
    max_seq_length=128,                    # Choose a max sequence length (e.g., 128)
    num_workers=0,                         # No need for multiple workers in a test scenario
    tokenizer=tokenizer,                   # Pass the tokenizer here
    random_seed=42                          # Optional, for reproducibility
)

# Set up the data module
data_module.setup(stage='fit')

# Get the first batch from the train dataloader
train_dataloader = data_module.train_dataloader()

# Iterate over the dataloader to get the first batch
first_batch = next(iter(train_dataloader))

# Print the first batch to inspect it
print(first_batch)

{'input_ids': tensor([[446, 371,  25, 272, 406, 451, 572, 398, 616, 272, 831, 322, 333, 326,
         758,  27, 325, 789, 318, 294, 291, 602,  83, 477,  98,  89,  97, 280,
         377, 353, 318, 883, 318, 294, 391, 301,  93,  27, 398, 461, 280, 902,
         276, 831, 322, 353, 326, 399,  25, 356, 348, 480, 430, 100, 272, 433,
          97, 299, 361, 326, 391, 327,  97, 319, 370, 491, 280, 326, 399, 278,
         343,  25, 350, 810,  25, 346, 616, 745, 831, 322,  27, 880, 314, 366,
         902, 318, 353, 516, 278, 430, 100, 625, 391, 327,  97, 610, 900, 399,
         508, 278, 343,  25, 350, 922,  25, 398,  25, 373, 481, 902, 276, 831,
         322, 278, 966, 101, 637, 391, 327,  97, 503,  65,  92, 576,  25, 380,
         391, 606],
        [455, 475, 272, 420,  25, 424, 294, 272, 406, 569, 572, 382,  82, 580,
          27, 382,  82, 580, 560, 280, 449, 848, 278, 377, 333, 276, 749,  27,
         382,  82, 580, 294, 272, 292, 416,  97,  85, 102, 569, 883, 292, 683,
         375, 614,

## Create `ModelModule`

## Run training