In [None]:
from google.colab import drive
import os
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import wandb

In [None]:
import pandas as pd
from collections import Counter

In [None]:
import plotly.graph_objects as go

In [None]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfangyua[0m ([33mfangyua-univeristy-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import math
import inspect
from dataclasses import dataclass
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.amp import autocast, GradScaler

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
vocab = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', '+', '&', '*']
device = 'cuda' if torch.cuda.is_available() else 'cpu'
padding_token_index = 13
end_token_index = 12

In [None]:
# create a mapping from chars to ints
stoi = {ch:i for i, ch in enumerate(vocab)}
itos = {i:ch for i, ch in enumerate(vocab)}
encode = lambda s:[stoi[c] for c in s] # encoder: take a string, output a list of ints
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of ints, output a string

print(encode("1+2=3&"))
print(decode(encode("1+2=3&")))

[1, 11, 2, 10, 3, 12]
1+2=3&


In [None]:
print(device)

cuda


In [None]:
def get_batch(phase=None, batch_size=1024, block_size=25, mode='train'):

    if mode == 'train':
      # random choose a and b from set
      if phase != 6:
          a = np.random.randint(0, 10**(phase), batch_size)
          b = np.random.randint(0, 10**(phase), batch_size)
          c = a + b
      elif phase == 6:
          exp_a = np.random.choice(np.arange(1, 6), size=batch_size)
          exp_b = np.random.choice(np.arange(1, 6), size=batch_size)
          # print(exp)
          a = np.random.randint(10**(exp_a-1), 10**(exp_a), size=batch_size)
          b = np.random.randint(10**(exp_b-1), 10**(exp_b), size=batch_size)
          c = a + b
    else:
      if phase != 6:
          a = np.random.randint(0, 10**(phase), batch_size)
          b = np.random.randint(0, 10**(phase), batch_size)
          c = a + b
      elif phase == 6:
          exp_a = np.random.choice(np.arange(1, 6), size=batch_size)
          exp_b = np.random.choice(np.arange(1, 6), size=batch_size)
          # print(exp)
          a = np.random.randint(10**(exp_a-1), 10**(exp_a), size=batch_size)
          b = np.random.randint(10**(exp_b-1), 10**(exp_b), size=batch_size)
          c = a + b

    x_list, y_list = [], []
    for i, j, k in zip(a, b, c):
        # construct X: "i+j=k&"
        i_str = str(i)[::-1]
        j_str = str(j)[::-1]
        k_str = str(k)[::-1]
        x_str = f"{i_str}+{j_str}={k_str}&"
        # print(x_str)
        x_encoded = encode(x_str)
        x_padded = x_encoded + [padding_token_index] * (block_size - len(x_encoded))
        x_list.append(torch.tensor(x_padded, dtype=torch.int64))

        # construct Y: "k&"
        y_encoded = encode(x_str)[1:]
        y_encoded.append(end_token_index)
        y_padded = y_encoded + [padding_token_index] * (block_size - len(y_encoded))
        y_list.append(torch.tensor(y_padded, dtype=torch.int64))

    x_tensor = torch.stack(x_list).to(device)
    y_tensor = torch.stack(y_list).to(device)
    return x_tensor, y_tensor

In [None]:
get_batch(phase=6)[0].shape

torch.Size([1024, 25])

In [None]:
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias=True): # class constructor
        super().__init__()
        # nn.Parameter, pytorch optimize will update the value of this parameter during training
        self.weight = nn.Parameter(torch.ones(ndim)) # trainable parameter
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None # trainable parameter

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        assert n_embd % n_head == 0, "Embedding dimension must be divisible by the number of heads."

        # Store hyperparameters
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        self.block_size = block_size

        # Key, Query, Value projections
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # Output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)

        # T-5 PE
        # self.rel_pos_bias = T5RelativePositionBias(block_size, n_head)

        # Regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

                # Check for Flash Attention availability
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Causal mask for slow attention
            self.register_buffer(
                "bias",
                torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
            )

    def forward(self, x):
        B, T, C = x.size()  # Batch size, sequence length, embedding dimension

        # Compute Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)  # Split into Q, K, V (B, T, n_embd)

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_size)

        # Compute T5 relative position bias
        # self.rel_pos_bias = self.rel_pos_bias.to(device)  # Move to correct device
        # rel_bias = self.rel_pos_bias(T, device)  # Compute relative position bias
        # (1, num_heads, T, T)

        # Flash Attention or fallback to manual implementation
        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True
            )
        # else:
        # Manual attention with causal masking
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # Scaled dot product
        # # att = att + rel_bias  # Apply relative positional bias
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # Apply causal mask
        # att = F.softmax(att, dim=-1)  # Normalize attention scores
        # att = self.attn_dropout(att)
        # y = att @ v  # Apply attention weights to values (B, n_head, T, head_size)

        # Reshape back to original format
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # Reassemble heads

        # Output projection and residual dropout
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module): # FFN

    def __init__(self, n_embd, dropout, bias=True):
        super().__init__()
        self.c_fc    = nn.Linear(n_embd, 4 * n_embd, bias=bias)
        self.gelu    = nn.GELU() # nonlinear activation function
        self.c_proj  = nn.Linear(4 * n_embd, n_embd, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, n_embd, n_head, dropout, block_size, bias=True):
        super().__init__()
        # LayerNorm and CausalSelfAttention with explicit parameters
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = CausalSelfAttention(n_embd, n_head, dropout, block_size, bias=bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        self.mlp = MLP(n_embd, dropout, bias=bias)  # MLP with explicit parameters

    def forward(self, x):
        # Apply residual connection and pre-normalization
        x = x + self.attn(self.ln_1(x))  # Apply LayerNorm before attention
        x = x + self.mlp(self.ln_2(x))  # Apply LayerNorm before MLP
        return x


class GPT(nn.Module):

    def __init__(self, vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=True):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd), # token embeddings
            # wpe = nn.Embedding(block_size, n_embd), # positional embeddings CHANGE, t-5 positional embedding
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(n_embd, n_head, dropout, block_size, bias=bias) for _ in range(n_layer)]), # a stack of n_layer blocks
            ln_f = LayerNorm(n_embd, bias=bias), # final layer norm
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # projects the final transformer output to the vocab size

        # init all weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.cblock_size}"
        # pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb)# + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        logits = self.lm_head(x)

        loss = None

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=13)
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            # loss = None

        return logits, loss

In [None]:
eval_iters = 100

@torch.no_grad()
def estimate_loss(phase, models):
    out = {}
    models.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(phase, mode='train')
        padding_mask_x = (X != padding_token_index).long()
        logits, loss = models(X, Y)
        losses[k] = loss.item()
    out['train'] = losses.mean()
    models.train()
    return out

In [None]:
@torch.no_grad()
def generate(model, idx, max_new_tokens, temperature=1, top_k=None):
    """
    Generate a sequence of tokens given an initial sequence.

    Parameters:
        model (nn.Module): The model used for generation.
        idx (torch.Tensor or list): Initial sequence of indices (LongTensor of shape (b,t)).
        max_new_tokens (int): Number of new tokens to generate.
        temperature (float): Scaling factor for logits before softmax.
        top_k (int, optional): If specified, restricts sampling to top k tokens.

    Returns:
        torch.Tensor: The generated sequence.
    """
    idx = idx.unsqueeze(0) if idx.dim() == 1 else idx
    idx = torch.tensor(idx, device=model.device) if not isinstance(idx, torch.Tensor) else idx.to(model.device)

    for _ in range(max_new_tokens):
        # Ensure context length does not exceed model's block size
        idx_cond = idx if idx.size(1) <= model.block_size else idx[:, -model.block_size:]

        # Forward pass to get logits
        logits, _ = model(idx_cond)

        # Extract logits for the last token and apply temperature scaling
        logits = logits[:, -1, :] / temperature

        # Apply top-k filtering if necessary
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        # Convert logits to probabilities
        probs = F.softmax(logits, dim=-1)

        # Sample next token
        idx_next = torch.multinomial(probs, num_samples=1)

        if idx_next == end_token_index:
            break
        # Append sampled token to sequence

        idx = torch.cat((idx, idx_next), dim=1)

    return decode(idx.tolist()[0])


In [None]:
batch_size = 1024 # how many independent sequences will we process in parallel?
block_size = 25 # what is the maximum context length for predictions?
max_iters = 20000 # CHANGE the step size
# num_epochs = 100
eval_interval = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.0
# # torch.manual_seed(1337)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(1337)
bias = True # if using bias inside all Linear layers
vocab_size = len(vocab)

In [None]:
wandb.init(project="self-improve-transformer-train",
           config={
            "learning_rate": 5e-4,
            "batch_size": 1024,
            "block_size": 25,
            "optimizer": "AdamW",
            "n_embd": 384,
            "n_head": 6,
            "n_layer": 6,
            "dropout": 0.0,
            },
           name = "Trial 6 of finding 90%+ accuracy"
)

In [None]:
wandb.run.notes = "Trial 5 of finding "

In [None]:
# run this cell if you wanna initialize a new model
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=bias)
m = model.to(device)

In [None]:
# create a PyTorch optimizer for base model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-4,                  # 学习率 change to 1e-4 from 5e-4
    betas=(0.9, 0.99),        # 动量参数
    eps=1e-12,                # 数值稳定性参数
    weight_decay=0.1          # 权重衰减
)

total_steps = 20000       # 总训练步数 CHANGE
warmup_steps = 1000       # 预热阶段步数
decay_steps = 2000        # 衰减阶段步数
stable_steps = total_steps - warmup_steps - decay_steps  # 稳定阶段步数

# Warmup + Stable + Decay
def lr_lambda(step):
    if step < warmup_steps:
        # linear Warmup（0 to base_lr）
        return step / warmup_steps
    elif step < warmup_steps + stable_steps:
        # stable
        return 1.0
    else:
        # Cosine Decay）
        decay_ratio = (step - warmup_steps - stable_steps) / decay_steps
        return 0.5 * (1 + math.cos(math.pi * decay_ratio))  # cos 1 to 0


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

In [None]:
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

phase = 6
counter = 0
best_loss = float('inf')
val_loss_list = []

patience = 20

scaler = GradScaler('cuda')
for iter in tqdm(range(max_iters), desc="Training Progress"):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(phase, model)['train']
        print(f"step {iter}: loss {losses:.4f}, Best Loss so far: {best_loss}, Counter: {counter}")
        log_dict = {"Loss": losses}
        val_loss_list.append(round(losses.item(), 4))
        wandb.log(log_dict)

    # sample a batch of data
    xb, yb = get_batch(phase)

    # evaluate the loss
    with autocast(device_type="cuda", dtype=torch.bfloat16):
        logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    scheduler.step()

In [None]:
wandb.finish()

0,1
Loss,█▅▄▂▃▁▁▂▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss,1.12336


In [None]:

def accuracy_print_one(model, num_digits, need_print=False):
        correct = 0
        wrong = 0

        for j in range(100):
            a = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            b = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            c = a + b
            reversed_a = np.array([str(x)[::-1] for x in a])
            reversed_b = np.array([str(x)[::-1] for x in b])
            reversed_c = np.array([str(x)[::-1] for x in c])
            input = f"{reversed_a.item()}+{reversed_b.item()}="
            context = torch.tensor(encode(input), dtype=torch.long, device=device)

            output1 = generate(model=model, idx=context, max_new_tokens=35, top_k=1)


            if output1 == f"{reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}":
                correct += 1
                wrong = 0
            else:
                wrong = 1
                
            # for testing
            if need_print and wrong:
                print(f"   Input: {input}")
                print(f"  Output: {output1}")
                print(f"Expected: {reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}")
                print("-----------")

        acc = correct / 100
        print(f"Accuracy for {num_digits} digits addition: {acc} ")
        return acc
    
def get_avg_performance(models):
    dict_acc = {}
    for num_dig in range(1, 7):
        dict_acc[num_dig] = accuracy_print_one(models, num_dig, need_print=False)
    return dict_acc

In [None]:
avg_performance = get_avg_performance(model)

Accuracy for 1 digits addition: 1.0 
Accuracy for 2 digits addition: 1.0 
Accuracy for 3 digits addition: 1.0 
Accuracy for 4 digits addition: 1.0 
Accuracy for 5 digits addition: 1.0 
Accuracy for 6 digits addition: 0.34 


In [None]:
filename = f"accuracy_0.96_3w_iter.pt"
save_path = f"/content/drive/MyDrive/URPS/{filename}"
torch.save(model1.state_dict(), save_path)
print(f"Saved best model at {save_path}")

Saved best model at /content/drive/MyDrive/URPS/accuracy_0.96_3w_iter.pt


In [None]:
acc_list = []
for i in range(100):
    acc_list.append(accuracy_print_one(model1, 6, need_print=False))
print(sum(acc_list)/len(acc_list))

Accuracy for 6 digits addition: 0.8 
Accuracy for 6 digits addition: 0.82 
Accuracy for 6 digits addition: 0.87 
Accuracy for 6 digits addition: 0.77 
Accuracy for 6 digits addition: 0.83 
Accuracy for 6 digits addition: 0.75 
Accuracy for 6 digits addition: 0.83 
Accuracy for 6 digits addition: 0.86 
Accuracy for 6 digits addition: 0.82 
Accuracy for 6 digits addition: 0.88 
Accuracy for 6 digits addition: 0.82 
Accuracy for 6 digits addition: 0.82 
Accuracy for 6 digits addition: 0.87 
Accuracy for 6 digits addition: 0.83 
Accuracy for 6 digits addition: 0.8 


KeyboardInterrupt: 

In [None]:
def test_accuracy_on_6(model):
    return accuracy_print_one(model, 6, need_print=False)

In [None]:
def create_optimizer_and_scheduler(model):
    # AdamW
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=5e-4,              # learning rate
        betas=(0.9, 0.99),
        eps=1e-12,
        weight_decay=0.1
    )

    # LR Scheduler
    total_steps = 20000 # change to 30000
    warmup_steps = 1000
    decay_steps = 2000
    stable_steps = total_steps - warmup_steps - decay_steps

    def lr_lambda(step):
        if step < warmup_steps:
            return step / warmup_steps  # Linear warmup 0->1
        elif step < warmup_steps + stable_steps:
            return 1.0                  # Stable
        else:
            # Cosine decay from 1->0
            decay_ratio = (step - warmup_steps - stable_steps) / decay_steps
            return 0.5 * (1 + math.cos(math.pi * decay_ratio))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    return optimizer, scheduler

In [None]:
num_runs = 5
threshold = 0.85

for run_idx in range(num_runs):
    print(f"Start run {run_idx + 1}/{num_runs}")
    # initialize model, optimizer, scheduler
    model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias=bias)
    m = model.to(device)
    optimizer, scheduler = create_optimizer_and_scheduler(model)

    # training loop

    # print the number of parameters in the model
    print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

    phase = 6
    counter = 0
    best_loss = float('inf')
    val_loss_list = []

    patience = 20

    scaler = GradScaler('cuda')
    for iter in tqdm(range(max_iters), desc="Training Progress"):
        # every once in a while evaluate the loss on train and val sets
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses1 = estimate_loss(phase, model)['train']
            print(f"step {iter}: loss {losses1:.4f}, Best Loss so far: {best_loss}, Counter: {counter}")
            log_dict = {"Loss": losses1}
            val_loss_list.append(round(losses1.item(), 4))

            wandb.log(log_dict)

        # sample a batch of data
        xb, yb = get_batch(phase)

        # evaluate the loss
        with autocast(device_type="cuda", dtype=torch.bfloat16):
            logits1, loss1 = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)

        scaler.scale(loss1).backward()
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()
    print(f"Training finished for run {run_idx+1}.\nEvaluating 6-digit accuracy...")

    # 8) Evaluate final performance on 6-digit addition
    acc_on_6 = test_accuracy_on_6(model)

    # 9) If it's good enough, save to Google Drive (or local)
    if acc_on_6 > threshold:
        filename = f"run_{run_idx}_acc_{acc_on_6:.2f}.pt"
        save_path = f"/content/drive/MyDrive/URPS/{filename}"
        torch.save(model.state_dict(), save_path)
        print(f"Saved best model at {save_path}")

print("All runs complete.")

Start run 1/5
10.658304 M parameters


Training Progress:   0%|          | 0/20000 [00:03<?, ?it/s]

step 0: loss 2.6712, Best Loss so far: inf, Counter: 0





Error: You must call wandb.init() before wandb.log()

In [None]:
def accuracy_print(models, num_digits, need_print=False):
        correct = 0
        wrong = 0

        for j in range(100):
            a = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            b = np.random.randint(10**(num_digits-1), 10**(num_digits), size=1)
            c = a + b
            reversed_a = np.array([str(x)[::-1] for x in a])
            reversed_b = np.array([str(x)[::-1] for x in b])
            reversed_c = np.array([str(x)[::-1] for x in c])
            input = f"{reversed_a.item()}+{reversed_b.item()}="
            context = torch.tensor(encode(input), dtype=torch.long, device=device)

            output1 = generate(models[0], context, 100, top_k=1)
            output2 = generate(models[1], context, 100, top_k=1)
            output3 = generate(models[2], context, 100, top_k=1)
            output4 = generate(models[3], context, 100, top_k=1)
            output5 = generate(models[4], context, 100, top_k=1)
            output = [output1, output2, output3, output4, output5]

            counter = Counter(output)
            # 选取出现次数最多的字符串
            most_common_o, count = counter.most_common(1)[0]
            # print("most common output count:", count)

            if most_common_o == f"{reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}":
                correct += 1
                wrong = 0
            else:
                wrong = 1
            if need_print and wrong:
                print(f"   Input: {input}")
                print(f"  Output: {most_common_o}")
                print(f"Expected: {reversed_a.item()}+{reversed_b.item()}={reversed_c.item()}")
                print("-----------")

        acc = correct / 100
        print(f"Accuracy for {num_digits} digits addition: {acc} ")
        return acc

In [None]:
import subprocess

os.system('git config --global user.email "zifeibai@umich.edu"')
os.system('git config --global user.name "ZifeiBai"')

# 2️⃣ **Use Google Drive to store GitHub Token**
GITHUB_TOKEN_PATH = "/content/drive/MyDrive/URPS/github_token.txt"
if os.path.exists(GITHUB_TOKEN_PATH):
    with open(GITHUB_TOKEN_PATH, "r") as f:
        os.environ["GITHUB_TOKEN"] = f.read().strip()
else:
    print("❌ GitHub Token")
    exit(1)

# 3️⃣ **Set up GitHub remote repo**
GIT_PATH = "/content/drive/MyDrive/URPS/Git"
REPO_URL = f"https://{os.environ['GITHUB_TOKEN']}@github.com/ZifeiBai/URPS.git"

if not os.path.exists(GIT_PATH):
    print(f"📁 Creating directory: {GIT_PATH}")
    os.makedirs(GIT_PATH)

# 4️⃣ **If .git/ does not exsit， need to clone**
if not os.path.exists(os.path.join(GIT_PATH, ".git")):
    print("❌ Git repository not found. Cloning...")
    subprocess.run(f"rm -rf {GIT_PATH}", shell=True, check=True)
    subprocess.run(f"git clone {REPO_URL} {GIT_PATH}", shell=True, check=True)

# 5️⃣ **Enter Git repo**
os.chdir(GIT_PATH)
print("📂 Changed working directory to:", os.getcwd())


# 6️⃣ **Check Git status**
status_output = subprocess.run("git status", shell=True, capture_output=True, text=True)
print(status_output.stdout)

#  **Push to Git**
print("🚀 Adding files to Git...")
subprocess.run("git add .", shell=True, check=True)

print("📝 Committing changes...")
commit_output = subprocess.run('git commit -m "Auto update from Google Colab 2.6"', shell=True, capture_output=True, text=True)
print(commit_output.stdout)



print("📤 Pushing to GitHub...")
push_output = subprocess.run("git push origin main", shell=True, capture_output=True, text=True)
if "fatal" in push_output.stderr or "error:" in push_output.stderr:
    print("❌ Real Git Push Error:", push_output.stderr)
else:
    print("✅ Git Push Success!")

In [None]:
model = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model.to(device)
model.load_state_dict(state_dict)

# Set the model to evaluation mode.
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(14, 384)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=384, out_features=1152, bias=True)
          (c_proj): Linear(in_features=384, out_features=384, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=384, out_features=1536, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1536, out_features=384, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=384, out_features=14, bias=False)
)

In [None]:
avg_performance = get_avg_performance(model)

Accuracy for 1 digits addition: 1.0 
Accuracy for 2 digits addition: 1.0 
Accuracy for 3 digits addition: 1.0 
Accuracy for 4 digits addition: 1.0 
Accuracy for 5 digits addition: 1.0 
Accuracy for 6 digits addition: 0.92 


In [None]:
model1 = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model1.to(device)

checkpoint_path = "/content/drive/MyDrive/URPS/Models/accuracy_0.96_3w_iter.pt"
model1.load_state_dict(torch.load(checkpoint_path, map_location=device))
model1.eval()
acc_on_6 = test_accuracy_on_6(model1)
print(f"Accuracy on 6-digit addition of base model: {acc_on_6}")

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 6 digits addition: 0.89 
Accuracy on 6-digit addition of base model: 0.89


In [None]:
# Helper func
def generate_prompt_OOD(phase):
    """
    Return a list of 'num_prompts' strings for reversed addition
    with 'operand_length' digits each.
    """
    a = np.random.randint(10**(phase-1), 10**phase - 1)
    b = np.random.randint(10**(phase-1), 10**phase - 1)
    prompt_str = f"{str(a)[::-1]}+{str(b)[::-1]}="  # e.g. '12345+54321='

    return prompt_str

def length_filter(predictions, threshold=threshold):
    """
    Filter out predictions that are shorter than the threshold.
    """
    max_len = max(len(pred) for pred in predictions) # 7
    min_len = max_len - threshold # 6
    filtered = [(pred) for pred in predictions if len(pred) >= min_len]

    # print(f"Filtered out {len(predictions) - len(filtered)} short outputs; kept {len(filtered)}.")
    return filtered

def pad_sequence(seq_tensor, block_size, padding_idx):
        """
        Pad (or truncate) a single 1D seq_tensor to length block_size.
        """
        seq_list = seq_tensor.tolist()
        if len(seq_list) < block_size:
            seq_list += [padding_idx] * (block_size - len(seq_list))
        else:
            seq_list = seq_list[:block_size]
        return torch.tensor(seq_list, dtype=torch.long)

In [None]:
# length filter per model, return xb, yb with shape (filtered_batch_size, 24)
def generate_length_filter_OOD(
    model,
    num_samples=100,
    batch_size=1024,
    phase=6,
    threshold=1,
    max_new_tokens=35,
    stoi=stoi,
    itos=itos,
    device='cuda',
    block_size=25,
    padding_idx=padding_token_index,
    eos_idx=end_token_index
):

    all_x = []
    all_y = []

    num_batches = (num_samples + batch_size - 1) // batch_size

    for _ in tqdm(range(num_batches), desc="for each batch"): # for each batch

        prompts = [generate_prompt_OOD(phase) for _ in range(batch_size)] # prompts of size of batch_size=1024

        batch_predictions = []
        for prompt_str in prompts:
            prompt_ids = [stoi[ch] for ch in prompt_str]
            prompt_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=device)

            # single model generate predictions for each prompt
            out_str = generate(
                model=model,
                idx=prompt_tensor,
                max_new_tokens=35,
                top_k=1
            )

            out_ids = [stoi[ch] for ch in out_str if ch in stoi]

            # Add ending_tokens for future training
            if eos_idx is not None:
                out_ids.append(eos_idx)

            # Convert to tensor
            out_tensor = torch.tensor(out_ids, dtype=torch.long, device=device)
            batch_predictions.append(out_tensor)

        # length filtering on this batch
        filtered_batch = length_filter(batch_predictions, threshold=threshold)
        if not filtered_batch:
            continue
        # print(filtered_batch)
        # Pad/Truncate each sequence
        batch_encoded = [pad_sequence(seq, block_size, padding_idx) for seq in filtered_batch]

        # Build (x, y) by shifting
        xb = torch.stack([seq[:-1] for seq in batch_encoded])  # [B, block_size-1]
        yb = torch.stack([seq[1:] for seq in batch_encoded])   # [B, block_size-1]

        all_x.append(xb)
        all_y.append(yb)

    if not all_x:
        return None, None

    final_x = torch.cat(all_x, dim=0).to(device)
    final_y = torch.cat(all_y, dim=0).to(device)
    return final_x, final_y

In [None]:
model1 = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model1.to(device)

checkpoint_path = "/content/drive/MyDrive/URPS/Models/accuracy_0.96_3w_iter.pt"
model1.load_state_dict(torch.load(checkpoint_path, map_location=device))
model1.eval()
acc_on_6 = test_accuracy_on_6(model1)

  model1.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 6 digits addition: 0.9 
Accuracy on 6-digit addition of base model: 0.9


In [None]:
model2 = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model2.to(device)

checkpoint_path = "/content/drive/MyDrive/URPS/Models/model.pth"
model2.load_state_dict(torch.load(checkpoint_path, map_location=device))
model2.eval()
acc_on_6 = test_accuracy_on_6(model2)

  model2.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 6 digits addition: 0.91 


In [None]:
model3 = GPT(vocab_size, block_size, n_embd, n_layer, n_head, dropout, bias)
model3.to(device)

checkpoint_path = "/content/drive/MyDrive/URPS/Models/run_0_acc_0.95.pt"
model3.load_state_dict(torch.load(checkpoint_path, map_location=device))
model3.eval()
acc_on_6 = test_accuracy_on_6(model3)

  model3.load_state_dict(torch.load(checkpoint_path, map_location=device))


Accuracy for 6 digits addition: 0.88 


In [None]:
models = [model1, model2, model3]

In [None]:
# Helper functions
def generate_prompt_OOD(phase):
    """
    Return a list of 'num_prompts' strings for reversed addition
    with 'operand_length' digits each.
    """
    a = np.random.randint(10**(phase-1), 10**phase - 1)
    b = np.random.randint(10**(phase-1), 10**phase - 1)
    prompt_str = f"{str(a)[::-1]}+{str(b)[::-1]}="  # e.g. '12345+54321='

    return prompt_str

def length_filter(predictions, threshold=threshold):
    """
    Filter out predictions that are shorter than the threshold.
    """
    max_len = max(len(pred) for pred in predictions) # 7
    min_len = max_len - threshold # 6
    filtered = [(pred) for pred in predictions if len(pred) >= min_len]

    # print(f"Filtered out {len(predictions) - len(filtered)} short outputs; kept {len(filtered)}.")
    return filtered

def pad_sequence(seq_tensor, block_size, padding_idx):
        """
        Pad (or truncate) a single 1D seq_tensor to length block_size.
        """
        seq_list = seq_tensor.tolist()
        if len(seq_list) < block_size:
            seq_list += [padding_idx] * (block_size - len(seq_list))
        else:
            seq_list = seq_list[:block_size]
        return torch.tensor(seq_list, dtype=torch.long)

def majority_vote_filter(pred_tensors, vote_threshold=0.6):
    """
    pred_tensors: list of Tensors, each is 1D token IDs, all passed length filter.
    vote_threshold: e.g. 0.6 => need at least ceil(0.6*k) identical strings
    Returns best_tensor if there's a winning string, else None.
    """
    if not pred_tensors:
        return None

    num_models = len(pred_tensors)
    needed_votes = math.ceil(vote_threshold * num_models) # ceil(5 * 0.6) = 3, ceil(3 * 0.6) = 2

    # decode each tensor -> string
    string_list = []
    for t in pred_tensors:
        string_list.append(decode(t.tolist())) # string_list contains all the predicted strings

    freq = {}
    for s in string_list:
        freq[s] = freq.get(s, 0) + 1

    # find most frequent string
    # best_str is the most frequent string
    # best_count is how many times it appears
    best_str, best_count = None, 0
    for text, count in freq.items():
        if count > best_count:
            best_str = text
            best_count = count

    # compare the frequency with the threshold
    if best_count >= needed_votes:
        best_ids = encode(best_str)
        return torch.tensor(best_ids, dtype=torch.long)
    else:
        return None


In [None]:
def generate_multi_model_length_vote_with_counter(
    models,                # list of models [M1, M2, ...]
    num_samples=100,
    batch_size=1024,
    phase=6,                 # digit length for reversed addition
    block_size=25,
    padding_idx=padding_token_index,
    max_new_tokens=50,
    threshold=1,         # length-filter threshold
    vote_threshold=0.6,    # 5 model 选 3 model
    eos_idx=end_token_index,
    device=device
):

    all_x = []
    all_y = []

    num_models = len(models)
    num_batches = (num_samples + batch_size - 1) // batch_size
    kept_count = 0

    for _ in range(num_batches):
        # generate 'batch_size' prompts of digit length 6
        prompts = [generate_prompt_OOD(phase) for _ in range(batch_size)]

        for prompt_str in prompts: # iterate through all 1024 prompts

            # encode and convert prompt_str into tensor
            prompt_ids = encode(prompt_str)
            prompt_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=device)

            # collect each model's output for this prompt
            model_outputs = []

            for model_s in models:
                out_str = generate(
                    model=model_s,
                    idx=prompt_tensor,
                    max_new_tokens=max_new_tokens,
                    top_k=1
                )
                # re-encode the generated string
                out_ids = encode(out_str)
                if eos_idx is not None:
                    out_ids.append(eos_idx)

                out_tensor = torch.tensor(out_ids, dtype=torch.long, device=device)
                model_outputs.append(out_tensor) # len(model_outputs) == number of models

            # length filter
            # remove any model outputs that length < 6
            filtered = length_filter(model_outputs, threshold=1)
            if not filtered:
                continue

            # majority vote
            best_tensor = majority_vote_filter(filtered, vote_threshold=0.6)
            if best_tensor is None:
                continue

            # pad/truncate
            seq_padded = pad_sequence(best_tensor, block_size, padding_idx)

            # build (x, y)
            xb = seq_padded[:-1]
            yb = seq_padded[1:]
            all_x.append(xb)
            all_y.append(yb)
            kept_count += 1

    final_x = torch.stack(all_x, dim=0).to(device)
    final_y = torch.stack(all_y, dim=0).to(device)

    print(f"Total final pairs kept: {kept_count}")
    return final_x, final_y

In [None]:
x, y = generate_multi_model_length_vote_with_counter(models, num_samples=50000)
self_improvement_data_6_digits = {
    "xb": x,
    "yb": y
}
save_path = "/content/drive/MyDrive/URPS/Data/self_improvement_data_6_digits.pt"
torch.save(self_improvement_data_6_digits, save_path)


Total final pairs kept: 49314


In [193]:
data = torch.load("/content/drive/MyDrive/URPS/Data/self_improvement_data_6_digits.pt")
xb = data["xb"]
yb = data["yb"]
print(xb)
print(yb)

tensor([[ 8,  5,  1,  ..., 13, 13, 13],
        [ 9,  4,  9,  ..., 12, 13, 13],
        [ 4,  9,  2,  ..., 13, 13, 13],
        ...,
        [ 6,  7,  8,  ..., 13, 13, 13],
        [ 3,  1,  2,  ..., 13, 13, 13],
        [ 4,  0,  5,  ..., 12, 13, 13]], device='cuda:0')
tensor([[ 5,  1,  5,  ..., 13, 13, 13],
        [ 4,  9,  5,  ..., 13, 13, 13],
        [ 9,  2,  8,  ..., 13, 13, 13],
        ...,
        [ 7,  8,  0,  ..., 13, 13, 13],
        [ 1,  2,  2,  ..., 13, 13, 13],
        [ 0,  5,  4,  ..., 13, 13, 13]], device='cuda:0')


  data = torch.load("/content/drive/MyDrive/URPS/Data/self_improvement_data_6_digits.pt")
