# Arabic TinyStories (Translated) SLM

In [4]:
!pip install -U datasets transformers sentencepiece tqdm



In [5]:
# List available CUDA devices
import torch
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA not available")


GPU 0: NVIDIA GeForce RTX 4070
GPU 1: NVIDIA GeForce RTX 3080 Ti
GPU 2: NVIDIA GeForce RTX 3060


In [6]:
import os, math, torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline, AutoTokenizer
from tqdm.auto import tqdm

# Translate 1% as proof of concept; cache to parquet (Marian en->ar)
TRANSLATE_FRAC = 0.001
CACHE_TRAIN = "tini_ar_train.parquet"
CACHE_VAL = "tini_ar_val.parquet"
BATCH_SIZE_TRANSLATE = 2
MAX_LENGTH = 512
DEVICES = [0, 1]  # GPUs to use; CPU fallback
MODEL_NAME = "Helsinki-NLP/opus-mt-en-ar"


def load_and_translate():
    if os.path.exists(CACHE_TRAIN) and os.path.exists(CACHE_VAL):
        train_ds = Dataset.from_parquet(CACHE_TRAIN)
        val_ds = Dataset.from_parquet(CACHE_VAL)
        print("Loaded cached translated datasets.")
        return DatasetDict({"train": train_ds, "validation": val_ds})

    tiny = load_dataset("roneneldan/TinyStories")
    train_limit = max(1, int(len(tiny['train']) * TRANSLATE_FRAC))
    val_limit = max(1, int(len(tiny['validation']) * TRANSLATE_FRAC))

    available = torch.cuda.device_count()
    devices = [d for d in DEVICES if d < available] if torch.cuda.is_available() else []
    if not devices:
        devices = [-1]
    pipes = [pipeline("translation", model=MODEL_NAME, device=d) for d in devices]
    print(f"Using devices for translation: {devices}; train_limit={train_limit}, val_limit={val_limit}")

    def translate_split(split_ds, limit):
        texts = split_ds["text"][:limit]
        outputs_all = []
        pcount = len(pipes)
        for chunk_start in tqdm(range(0, len(texts), BATCH_SIZE_TRANSLATE), desc="translating"):
            chunk = texts[chunk_start:chunk_start+BATCH_SIZE_TRANSLATE]
            pipe = pipes[(chunk_start//BATCH_SIZE_TRANSLATE) % pcount]
            outs = pipe(chunk, max_length=MAX_LENGTH, truncation=True)
            outputs_all.extend([o["translation_text"] for o in outs])
        return Dataset.from_dict({"text": outputs_all})

    train_ds = translate_split(tiny['train'], train_limit)
    val_ds = translate_split(tiny['validation'], val_limit)
    train_ds.to_parquet(CACHE_TRAIN)
    val_ds.to_parquet(CACHE_VAL)
    return DatasetDict({"train": train_ds, "validation": val_ds})

dataset = load_and_translate()
print(dataset)


Using devices for translation: [0, 1]; train_limit=2119, val_limit=21


translating:   0%|          | 0/1060 [00:00<?, ?it/s]

Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 492 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 512 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 495 is bigger than 0.9 * max_length: 512. You

translating:   0%|          | 0/11 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21
    })
})


Tokenize with Arabic GPT-2 tokenizer and bin.

In [7]:
from tqdm.auto import tqdm
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

BIN_PREFIX = "tini_ar"
TRAIN_BIN = f"{BIN_PREFIX}_train.bin"
VAL_BIN = f"{BIN_PREFIX}_validation.bin"


def processing(sample_text):
    ids = tokenizer.encode(sample_text['text'], add_special_tokens=False)
    return {'ids': ids, 'len': len(ids)}

cols_to_remove = [c for c in dataset['train'].column_names if c != 'text']
if not (os.path.exists(TRAIN_BIN) and os.path.exists(VAL_BIN)):
    tokenized = dataset.map(processing, remove_columns=cols_to_remove, desc="tokenizing", num_proc=4)
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        filename = TRAIN_BIN if split == 'train' else VAL_BIN
        arr = np.memmap(filename, dtype=np.uint32, mode='w+', shape=(arr_len,))
        total_batches = 1 if len(dset) < 512 else 512
        idx = 0
        for b in tqdm(range(total_batches), desc=f"writing {filename}"):
            shard = dset.shard(num_shards=total_batches, index=b, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(shard['ids'])
            arr[idx: idx+len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()
else:
    print("Reusing existing bins")


tokenizing (num_proc=4):   0%|          | 0/2119 [00:00<?, ? examples/s]

tokenizing (num_proc=4):   0%|          | 0/21 [00:00<?, ? examples/s]

writing tini_ar_train.bin:   0%|          | 0/512 [00:00<?, ?it/s]

writing tini_ar_validation.bin:   0%|          | 0/1 [00:00<?, ?it/s]

Batches and model.

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass

def get_batch(split, block_size=128, batch_size=16):
    data = np.memmap(TRAIN_BIN if split=='train' else VAL_BIN, dtype=np.uint32, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    return x.to(device), y.to(device)

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__(); self.weight=nn.Parameter(torch.ones(ndim)); self.bias=nn.Parameter(torch.zeros(ndim)) if bias else None
    def forward(self,x): return F.layer_norm(x,self.weight.shape,self.weight,self.bias,1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__(); assert config.n_embd % config.n_head==0
        self.c_attn=nn.Linear(config.n_embd,3*config.n_embd,bias=config.bias)
        self.c_proj=nn.Linear(config.n_embd,config.n_embd,bias=config.bias)
        self.attn_dropout=nn.Dropout(config.dropout); self.resid_dropout=nn.Dropout(config.dropout)
        self.n_head=config.n_head; self.n_embd=config.n_embd
    def forward(self,x):
        B,T,C=x.size(); q,k,v=self.c_attn(x).split(self.n_embd,dim=2)
        k=k.view(B,T,self.n_head,C//self.n_head).transpose(1,2)
        q=q.view(B,T,self.n_head,C//self.n_head).transpose(1,2)
        v=v.view(B,T,self.n_head,C//self.n_head).transpose(1,2)
        att=(q@k.transpose(-2,-1))/math.sqrt(k.size(-1)); att=F.softmax(att,dim=-1); att=self.attn_dropout(att)
        y=att@v; y=y.transpose(1,2).contiguous().view(B,T,C); y=self.resid_dropout(self.c_proj(y)); return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__(); self.c_fc=nn.Linear(config.n_embd,4*config.n_embd,bias=config.bias)
        self.gelu=nn.GELU(); self.c_proj=nn.Linear(4*config.n_embd,config.n_embd,bias=config.bias); self.dropout=nn.Dropout(config.dropout)
    def forward(self,x): return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

class Block(nn.Module):
    def __init__(self, config):
        super().__init__(); self.ln1=LayerNorm(config.n_embd, config.bias); self.attn=CausalSelfAttention(config)
        self.ln2=LayerNorm(config.n_embd, config.bias); self.mlp=MLP(config)
    def forward(self,x): x=x+self.attn(self.ln1(x)); x=x+self.mlp(self.ln2(x)); return x

@dataclass
class GPTConfig:
    block_size:int; vocab_size:int; n_layer:int; n_head:int; n_embd:int; dropout:float=0.1; bias:bool=True

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__(); self.config=config
        self.transformer=nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head=nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight=self.lm_head.weight
        self.apply(self._init_weights)
        for pn,p in self.named_parameters():
            if pn.endswith('c_proj.weight'): nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2*config.n_layer))
    def _init_weights(self,m):
        if isinstance(m, nn.Linear): nn.init.normal_(m.weight, mean=0.0, std=0.02); nn.init.zeros_(m.bias) if m.bias is not None else None
        elif isinstance(m, nn.Embedding): nn.init.normal_(m.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        b,t=idx.size(); pos=torch.arange(0,t,device=idx.device)
        x=self.transformer.drop(self.transformer.wte(idx)+self.transformer.wpe(pos))
        for block in self.transformer.h: x=block(x)
        x=self.transformer.ln_f(x)
        if targets is not None:
            logits=self.lm_head(x); loss=F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            return logits, loss
        logits=self.lm_head(x[:,[-1],:]); return logits, None


Training.

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
config = GPTConfig(vocab_size=tokenizer.vocab_size, block_size=128, n_layer=6, n_head=6, n_embd=384)
model = GPT(config).to(device)

base_lr = 3e-4
batch_size = 16
block_size = config.block_size
gradient_accumulation_steps = 4
lr_scale = batch_size/8.0
learning_rate = base_lr * max(lr_scale, 1e-2)
min_lr = learning_rate*0.1
max_iters = 5000
warmup_steps = 500
eval_iters = 200

from torch.optim.lr_scheduler import LinearLR, SequentialLR, CosineAnnealingLR
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9,0.95), weight_decay=0.1, eps=1e-9)
scheduler_warmup = LinearLR(optimizer, total_iters=warmup_steps)
scheduler_decay = CosineAnnealingLR(optimizer, T_max=max_iters-warmup_steps, eta_min=min_lr)
scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_decay], milestones=[warmup_steps])
scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


def estimate_loss():
    out={}; model.eval()
    with torch.inference_mode():
        for split in ['train','validation']:
            losses=torch.zeros(eval_iters)
            for k in range(eval_iters):
                X,Y=get_batch(split, block_size=block_size, batch_size=batch_size)
                with torch.amp.autocast(device_type='cuda' if device=='cuda' else 'cpu', dtype=torch.float16 if device=='cuda' else torch.float32):
                    logits, loss = model(X, Y)
                losses[k]=loss.item()
            out[split]=losses.mean().item()
    model.train(); return out

def loss_to_ppl(val): return math.exp(val) if math.isfinite(val) else float('inf')

best_val=float('inf')
best_path='best_tini_ar.pt'
train_losses=[]; val_losses=[]; train_ppl=[]; val_ppl=[]

for step in tqdm(range(max_iters)):
    if step % eval_iters == 0 and step>0:
        losses=estimate_loss()
        t,v = losses['train'], losses['validation']
        train_losses.append(t); val_losses.append(v)
        train_ppl.append(loss_to_ppl(t)); val_ppl.append(loss_to_ppl(v))
        print(f"step {step}: train {t:.4f} (ppl {train_ppl[-1]:.2f}) val {v:.4f} (ppl {val_ppl[-1]:.2f})")
        if v < best_val:
            best_val=v; torch.save(model.state_dict(), best_path)
    X,Y=get_batch('train', block_size=block_size, batch_size=batch_size)
    with torch.amp.autocast(device_type='cuda' if device=='cuda' else 'cpu', dtype=torch.float16 if device=='cuda' else torch.float32):
        logits, loss = model(X,Y)
        loss = loss/gradient_accumulation_steps
    scaler.scale(loss).backward()
    if (step+1)%gradient_accumulation_steps==0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        scaler.step(optimizer); scaler.update(); optimizer.zero_grad(set_to_none=True)
    scheduler.step()


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


  0%|          | 0/5000 [00:00<?, ?it/s]



step 200: train 7.2096 (ppl 1352.32) val 7.0644 (ppl 1169.63)
step 400: train 5.8562 (ppl 349.39) val 5.7601 (ppl 317.39)




step 600: train 5.2702 (ppl 194.45) val 5.2235 (ppl 185.59)
step 800: train 4.8854 (ppl 132.35) val 4.8681 (ppl 130.08)
step 1000: train 4.5883 (ppl 98.33) val 4.6300 (ppl 102.52)
step 1200: train 4.2988 (ppl 73.61) val 4.4125 (ppl 82.47)
step 1400: train 3.4663 (ppl 32.02) val 3.5902 (ppl 36.24)
step 1600: train 0.9656 (ppl 2.63) val 0.9072 (ppl 2.48)
step 1800: train 0.4425 (ppl 1.56) val 0.4313 (ppl 1.54)
step 2000: train 0.2382 (ppl 1.27) val 0.2765 (ppl 1.32)
step 2200: train 0.1565 (ppl 1.17) val 0.1994 (ppl 1.22)
step 2400: train 0.1115 (ppl 1.12) val 0.1672 (ppl 1.18)
step 2600: train 0.0889 (ppl 1.09) val 0.1461 (ppl 1.16)
step 2800: train 0.0720 (ppl 1.07) val 0.1329 (ppl 1.14)
step 3000: train 0.0620 (ppl 1.06) val 0.1253 (ppl 1.13)
step 3200: train 0.0555 (ppl 1.06) val 0.1217 (ppl 1.13)
step 3400: train 0.0510 (ppl 1.05) val 0.1173 (ppl 1.12)
step 3600: train 0.0473 (ppl 1.05) val 0.1132 (ppl 1.12)
step 3800: train 0.0451 (ppl 1.05) val 0.1101 (ppl 1.12)
step 4000: train 0

Generation from best checkpoint.

In [10]:
model.load_state_dict(torch.load('best_tini_ar.pt', map_location=device))
model.eval()

def generate(prompt, max_new_tokens=50, temperature=0.8, top_k=50):
    idx = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt').to(device)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= config.block_size else idx[:, -config.block_size:]
            logits,_ = model(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v,_ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('inf')
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_id), dim=1)
    return tokenizer.decode(idx[0].tolist())

print(generate("في قديم الزمان كان هناك طفل صغير يحب القصص."))
print(generate("كانت الطفلة تمشي في الغابة وتسمع الطيور تغني."))


  model.load_state_dict(torch.load('best_tini_ar.pt', map_location=device))


في قديم الزمان كان هناك طفل صغير يحب القصص. يحب القصص قديم الزمان القصص الزمان القصص القصص القصص الزمان القصص قديم الزمان القصصاذ القصص قديم قديم قديم قديم مليء القصص قديم قديم قديم قديم قديم قديم قديم القصص القصص الزمان القصص قديم قديم القصص القصص القصص القصص قديم قديم قديم الزمان القصص قديم قديم قديم الزمان الزمان القصص
كانت الطفلة تمشي في الغابة وتسمع الطيور تغني. الطيور تغني. في الغابة في الطيور تغني في الغابة في الغابة تمشي في الغابة وتس وتس تمشي في الغابة وتس الطيور في الغابة الطيور الغابة. في الغابة الغابة في الغابة في الغابة الغابة الغابة الغابة في الغابة الطفلة الغابة الغابة الغابة في الغابة الغابة الغابة الغابة في الغابة
