In [5]:
# Scratch cell: prints a fixed greeting (presumably a kernel smoke test -- nothing
# visible later in the notebook depends on it).
print("Hello")

Hello


In [6]:
import requests

In [7]:
import os


In [9]:
# TinyStories source location (small sample for now).
# NOTE: this cell only announces the step and records the URL -- no request is
# issued here, and `url` is not consumed by any cell visible in this notebook.
print("Downloading TinyStories dataset...")

# The URL is assembled from its parts: dataset repo + revision + file path.
_HF_DATASET_REPO = "https://huggingface.co/datasets/roneneldan/TinyStories"
_PARQUET_PATH = "TinyStories_all_data/data/train-00000-of-00004.parquet"
url = f"{_HF_DATASET_REPO}/resolve/main/{_PARQUET_PATH}"


Downloading TinyStories dataset...


In [10]:
# One block of three short stories, each terminated by "The end.".
# The block starts and ends with a newline so repeated copies stay separated.
_STORY_BLOCK = """
Once upon a time there was a little girl named Lily. Lily loved to play outside. One day she saw a big red ball. The ball was very bouncy.
Lily picked up the ball and threw it high. The ball went up and up. Then it came down fast. Lily caught the ball and laughed.
She played with the ball all day. When the sun went down, Lily went home. She was very happy. The end.

Tom had a small cat. The cat was black and white. The cat liked to sleep. Every day the cat would find a sunny spot.
The cat would curl up and close its eyes. Tom would pet the cat gently. The cat would purr loudly. Tom loved his cat very much.
One day the cat chased a bird. The bird flew away. The cat came back to Tom. Tom gave the cat some food. The end.

There was a boy named Sam. Sam liked to build things. He had many blocks. The blocks were different colors.
Sam built a tall tower with the blocks. The tower was very high. Sam was proud of his tower. Then his sister came.
She wanted to play too. Together they built an even bigger tower. It was the biggest tower ever. They were both happy. The end.
"""

# Repeat the block 50 times to get more data (the copies are identical).
sample_stories = _STORY_BLOCK * 50

# Summary statistics; the word count is a plain whitespace split, hence "approx".
_char_total = len(sample_stories)
_word_total = len(sample_stories.split())
_rule = "=" * 60

print("✓ Dataset created")
print(f"Total characters: {_char_total:,}")
print(f"Total words (approx): {_word_total:,}")

# Preview: the first 500 characters of the corpus between two rules.
print("\n" + _rule)
print("SAMPLE FROM DATASET:")
print(_rule)
print(sample_stories[:500])
print("...")


✓ Dataset created
Total characters: 54,050
Total words (approx): 10,950

SAMPLE FROM DATASET:

Once upon a time there was a little girl named Lily. Lily loved to play outside. One day she saw a big red ball. The ball was very bouncy.
Lily picked up the ball and threw it high. The ball went up and up. Then it came down fast. Lily caught the ball and laughed.
She played with the ball all day. When the sun went down, Lily went home. She was very happy. The end.

Tom had a small cat. The cat was black and white. The cat liked to sleep. Every day the cat would find a sunny spot.
The cat would
...


In [11]:
# Save the corpus to 'tiny_stories.txt' in the current working directory,
# overwriting any existing file.
# The encoding is pinned to UTF-8 so the bytes on disk do not depend on the
# platform's locale default.
with open('tiny_stories.txt', 'w', encoding='utf-8') as f:
    f.write(sample_stories)

print("\n✓ Saved to 'tiny_stories.txt'")




✓ Saved to 'tiny_stories.txt'


In [12]:
import torch

# Load the corpus from 'tiny_stories.txt'.
# The encoding is pinned so the set of characters read -- and therefore the
# vocabulary built below -- does not depend on the platform's locale default.
with open('tiny_stories.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Loaded {len(text):,} characters")

# Build character-level tokenizer
print("\n" + "="*60)
print("BUILDING TOKENIZER")
print("="*60)

# Vocabulary = every distinct character in the corpus; sorting makes the
# character -> id assignment deterministic for a given text.
chars = sorted(set(text))
vocab_size = len(chars)

print(f"\nVocabulary size: {vocab_size}")
print(f"Characters in vocab: {chars}")

# Two inverse lookup tables: character -> id and id -> character.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

print("\n" + "="*60)
print("ENCODING EXAMPLE")
print("="*60)

# Round-trip a short sample. Encoding raises KeyError if the sample contains
# a character that never occurs in the corpus.
sample = "The cat sat"
encoded = [char_to_idx[ch] for ch in sample]
decoded = ''.join(idx_to_char[i] for i in encoded)

print(f"Original text: '{sample}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{decoded}'")

# Only report success after the round trip has actually been checked.
assert decoded == sample, f"round trip mismatch: {decoded!r} != {sample!r}"
print("\n✓ Encoding/decoding works correctly!")

# Show character to ID mapping for sample
print("\n" + "="*60)
print("CHARACTER → ID MAPPING (sample)")
print("="*60)
for ch in sample[:5]:
    print(f"'{ch}' → {char_to_idx[ch]}")

# Encode entire dataset
print("\n" + "="*60)
print("ENCODING FULL DATASET")
print("="*60)

# One int64 id per character of the corpus, as a 1-D tensor.
data = torch.tensor([char_to_idx[ch] for ch in text], dtype=torch.long)

# Check the whole corpus survives encode -> decode before persisting it.
assert ''.join(idx_to_char[i] for i in data.tolist()) == text, \
    "full-dataset round trip failed"

print(f"Encoded dataset shape: {data.shape}")
print(f"First 100 tokens: {data[:100].tolist()}")

# Show what those tokens mean (tolist() yields plain ints usable as dict keys).
print("\nFirst 100 characters decoded:")
print(''.join(idx_to_char[i] for i in data[:100].tolist()))

# Save the encoded tensor together with both lookup tables and the vocab size
# so a later step can reload them from 'tokenized_data.pt'.
torch.save({
    'data': data,
    'char_to_idx': char_to_idx,
    'idx_to_char': idx_to_char,
    'vocab_size': vocab_size
}, 'tokenized_data.pt')

print("\n✓ Tokenized data saved to 'tokenized_data.pt'")

print("\n" + "="*60)
print("STEP 2 COMPLETE ✓")
print("="*60)
print("✓ Text converted to numbers")
print("✓ Can convert back to text")
print("✓ Ready for model training")

Loaded 54,050 characters

BUILDING TOKENIZER

Vocabulary size: 34
Characters in vocab: ['\n', ' ', ',', '.', 'E', 'H', 'I', 'L', 'O', 'S', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']

ENCODING EXAMPLE
Original text: 'The cat sat'
Encoded: [10, 19, 16, 1, 14, 12, 29, 1, 28, 12, 29]
Decoded: 'The cat sat'

✓ Encoding/decoding works correctly!

CHARACTER → ID MAPPING (sample)
'T' → 10
'h' → 19
'e' → 16
' ' → 1
'c' → 14

ENCODING FULL DATASET
Encoded dataset shape: torch.Size([54050])
First 100 tokens: [0, 8, 24, 14, 16, 1, 30, 26, 25, 24, 1, 12, 1, 29, 20, 23, 16, 1, 29, 19, 16, 27, 16, 1, 32, 12, 28, 1, 12, 1, 22, 20, 29, 29, 22, 16, 1, 18, 20, 27, 22, 1, 24, 12, 23, 16, 15, 1, 7, 20, 22, 33, 3, 1, 7, 20, 22, 33, 1, 22, 25, 31, 16, 15, 1, 29, 25, 1, 26, 22, 12, 33, 1, 25, 30, 29, 28, 20, 15, 16, 3, 1, 8, 24, 16, 1, 15, 12, 33, 1, 28, 19, 16, 1, 28, 12, 32, 1, 12, 1]

First 100 characters decoded:

Once upon a tim

In [13]:
import torch

# NOTE(review): this cell repeats the preceding tokenizer cell (In [12])
# statement for statement and overwrites the same 'tokenized_data.pt';
# one of the two can be deleted.

# Load the corpus from 'tiny_stories.txt'.
# The encoding is pinned so the set of characters read -- and therefore the
# vocabulary built below -- does not depend on the platform's locale default.
with open('tiny_stories.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Loaded {len(text):,} characters")

# Build character-level tokenizer
print("\n" + "="*60)
print("BUILDING TOKENIZER")
print("="*60)

# Vocabulary = every distinct character in the corpus; sorting makes the
# character -> id assignment deterministic for a given text.
chars = sorted(set(text))
vocab_size = len(chars)

print(f"\nVocabulary size: {vocab_size}")
print(f"Characters in vocab: {chars}")

# Two inverse lookup tables: character -> id and id -> character.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

print("\n" + "="*60)
print("ENCODING EXAMPLE")
print("="*60)

# Round-trip a short sample. Encoding raises KeyError if the sample contains
# a character that never occurs in the corpus.
sample = "The cat sat"
encoded = [char_to_idx[ch] for ch in sample]
decoded = ''.join(idx_to_char[i] for i in encoded)

print(f"Original text: '{sample}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{decoded}'")

# Only report success after the round trip has actually been checked.
assert decoded == sample, f"round trip mismatch: {decoded!r} != {sample!r}"
print("\n✓ Encoding/decoding works correctly!")

# Show character to ID mapping for sample
print("\n" + "="*60)
print("CHARACTER → ID MAPPING (sample)")
print("="*60)
for ch in sample[:5]:
    print(f"'{ch}' → {char_to_idx[ch]}")

# Encode entire dataset
print("\n" + "="*60)
print("ENCODING FULL DATASET")
print("="*60)

# One int64 id per character of the corpus, as a 1-D tensor.
data = torch.tensor([char_to_idx[ch] for ch in text], dtype=torch.long)

# Check the whole corpus survives encode -> decode before persisting it.
assert ''.join(idx_to_char[i] for i in data.tolist()) == text, \
    "full-dataset round trip failed"

print(f"Encoded dataset shape: {data.shape}")
print(f"First 100 tokens: {data[:100].tolist()}")

# Show what those tokens mean (tolist() yields plain ints usable as dict keys).
print("\nFirst 100 characters decoded:")
print(''.join(idx_to_char[i] for i in data[:100].tolist()))

# Save the encoded tensor together with both lookup tables and the vocab size
# so a later step can reload them from 'tokenized_data.pt'.
torch.save({
    'data': data,
    'char_to_idx': char_to_idx,
    'idx_to_char': idx_to_char,
    'vocab_size': vocab_size
}, 'tokenized_data.pt')

print("\n✓ Tokenized data saved to 'tokenized_data.pt'")

print("\n" + "="*60)
print("STEP 2 COMPLETE ✓")
print("="*60)
print("✓ Text converted to numbers")
print("✓ Can convert back to text")
print("✓ Ready for model training")

Loaded 54,050 characters

BUILDING TOKENIZER

Vocabulary size: 34
Characters in vocab: ['\n', ' ', ',', '.', 'E', 'H', 'I', 'L', 'O', 'S', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']

ENCODING EXAMPLE
Original text: 'The cat sat'
Encoded: [10, 19, 16, 1, 14, 12, 29, 1, 28, 12, 29]
Decoded: 'The cat sat'

✓ Encoding/decoding works correctly!

CHARACTER → ID MAPPING (sample)
'T' → 10
'h' → 19
'e' → 16
' ' → 1
'c' → 14

ENCODING FULL DATASET
Encoded dataset shape: torch.Size([54050])
First 100 tokens: [0, 8, 24, 14, 16, 1, 30, 26, 25, 24, 1, 12, 1, 29, 20, 23, 16, 1, 29, 19, 16, 27, 16, 1, 32, 12, 28, 1, 12, 1, 22, 20, 29, 29, 22, 16, 1, 18, 20, 27, 22, 1, 24, 12, 23, 16, 15, 1, 7, 20, 22, 33, 3, 1, 7, 20, 22, 33, 1, 22, 25, 31, 16, 15, 1, 29, 25, 1, 26, 22, 12, 33, 1, 25, 30, 29, 28, 20, 15, 16, 3, 1, 8, 24, 16, 1, 15, 12, 33, 1, 28, 19, 16, 1, 28, 12, 32, 1, 12, 1]

First 100 characters decoded:

Once upon a tim