In [5]:
import torch
from torch import nn

bs, c, h = 2, 32, 64

x = torch.rand((bs, c, h), dtype=torch.float)
w1 = nn.Linear(h, h*2) # (h, c)

y = w1(x)
y.shape

torch.Size([2, 32, 128])

In [21]:
import torch

seq_len = 10

attention_mask = torch.tril(torch.ones((seq_len, seq_len), dtype = torch.bool)).view(1, 1, seq_len, seq_len)
attention_mask = attention_mask.to(torch.bfloat16)
attention_mask = (1.0 - attention_mask) * torch.finfo(torch.float16).min
print(attention_mask)

seqlen = 5
start_pos = 2
if seqlen > 1:
    mask = torch.full((seqlen, seqlen), float("-inf"))
    mask = torch.triu(mask, diagonal=1)
    # When performing key-value caching, we compute the attention scores
    # only for the new sequence. Thus, the matrix of scores is of size
    # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
    # j > cache_len + i, since row i corresponds to token cache_len + i.
    mask = torch.hstack(
        [torch.zeros((seqlen, start_pos)), mask]
    )
print(mask)

tensor([[[[    -0., -65536., -65536., -65536., -65536., -65536., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0., -65536., -65536., -65536., -65536., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0., -65536., -65536., -65536., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0., -65536., -65536., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0.,     -0., -65536., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0.,     -0.,     -0., -65536.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0.,     -0.,     -0.,     -0.,
           -65536., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0.,     -0.,     -0.,     -0.,
               -0., -65536., -65536.],
          [    -0.,     -0.,     -0.,     -0.,     -0.,     -0.,     -0.,
               -0.,   

In [23]:
import torch
import torch.nn as nn

class SampleModel(nn.Module):
    def __init__(self):
        super(SampleModel, self).__init__()
        self.linear = nn.Linear(1024, 1024*4)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(1024*4, 1024)
    
    def forward(self, x):
        x = self.linear(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

In [29]:
import time 
torch.set_default_device("cuda:0")

model = SampleModel()

x = torch.randn(10, 1024, 1024)
start = time.time()
for i in range(100):
    y = model(x)
print("without comppile:", time.time() - start)

model =  torch.compile(model)
start = time.time()
for i in range(100):
    y = model(x)
print("witt comppile:", time.time() - start)


without comppile: 0.721881628036499




witt comppile: 8.020703315734863


In [5]:
import contextlib
import torch

from torch.cuda import Stream


s = Stream()

torch.manual_seed(42)
t1_cpu_pinned = torch.randn(1024**2 * 5, pin_memory=True)
t2_cpu_paged = torch.randn(1024**2 * 5, pin_memory=False)
t3_cuda = torch.randn(1024**2 * 5, device="cuda:0")

assert torch.cuda.is_available()
device = torch.device("cuda", torch.cuda.current_device())


# The function we want to profile
def inner(pinned: bool, streamed: bool):
    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
        if pinned:
            t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
        else:
            t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
        t_star_cuda_h2d_event = s.record_event()
    # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
    #  done in the other stream
    t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
    t3_cuda_h2d_event = torch.cuda.current_stream().record_event()
    t_star_cuda_h2d_event.synchronize()
    t3_cuda_h2d_event.synchronize()


# Our profiler: profiles the `inner` function and stores the results in a .json file
def benchmark_with_profiler(
    pinned,
    streamed,
) -> None:
    torch._C._profiler._set_cuda_sync_enabled_val(True)
    wait, warmup, active = 1, 1, 2
    num_steps = wait + warmup + active
    rank = 0
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(
            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
        ),
    ) as prof:
        for step_idx in range(1, num_steps + 1):
            inner(streamed=streamed, pinned=pinned)
            if rank is None or rank == 0:
                prof.step()
    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")

In [6]:
benchmark_with_profiler(streamed=True, pinned=True)

STAGE:2024-12-11 19:41:31 21742:21742 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-12-11 19:41:31 21742:21742 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-12-11 19:41:31 21742:21742 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [5]:
from transformers import AutoTokenizer

tokenizer_path: str = "/home/duyong/model-zoos/meta-llama/Meta-Llama-3.1-8B-Instruct-oooooooold/",
tokenizer = AutoTokenizer.from_pretrained(model=tokenizer_path, padding_side="left", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default
model_inputs = tokenizer(
    ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")

TypeError: from_pretrained() missing 1 required positional argument: 'pretrained_model_name_or_path'

In [1]:
import torch

bin_file = '/home/duyong/model-zoos/meta-llama/Meta-Llama-3.1-8B-Instruct-oooooooold/original/consolidated.00.pth'
state = torch.load(bin_file, map_location='cpu')
state.keys()


dict_keys(['tok_embeddings.weight', 'layers.0.attention.wq.weight', 'layers.0.attention.wk.weight', 'layers.0.attention.wv.weight', 'layers.0.attention.wo.weight', 'layers.0.feed_forward.w1.weight', 'layers.0.feed_forward.w3.weight', 'layers.0.feed_forward.w2.weight', 'layers.0.attention_norm.weight', 'layers.0.ffn_norm.weight', 'layers.1.attention.wq.weight', 'layers.1.attention.wk.weight', 'layers.1.attention.wv.weight', 'layers.1.attention.wo.weight', 'layers.1.feed_forward.w1.weight', 'layers.1.feed_forward.w3.weight', 'layers.1.feed_forward.w2.weight', 'layers.1.attention_norm.weight', 'layers.1.ffn_norm.weight', 'layers.2.attention.wq.weight', 'layers.2.attention.wk.weight', 'layers.2.attention.wv.weight', 'layers.2.attention.wo.weight', 'layers.2.feed_forward.w1.weight', 'layers.2.feed_forward.w3.weight', 'layers.2.feed_forward.w2.weight', 'layers.2.attention_norm.weight', 'layers.2.ffn_norm.weight', 'layers.3.attention.wq.weight', 'layers.3.attention.wk.weight', 'layers.3.atten

In [8]:
import json
from pathlib import Path
from zllm.worker.llama31 import Transformer, ModelArgs

ckpt_dir = '/home/duyong/model-zoos/meta-llama/Meta-Llama-3.1-8B-Instruct-oooooooold/original'
with open(Path(ckpt_dir) / "params.json", "r") as f:
    params = json.loads(f.read())

model_args: ModelArgs = ModelArgs(
    max_seq_len=4096,
    max_batch_size=16,
    flash=False,
    paged=False,
    **params,
)
model = Transformer(model_args)
state_dict =  model.state_dict()

row_parallel_ = ["tok_embeddings"]
col_parallel = ["ln_f.weight", "ln_f.bias"]

for key, value in state_dict.items():
    print(f"{key}: {value.shape}")


tok_embeddings.weight: torch.Size([128256, 4096])
layers.0.attention.wq.weight: torch.Size([4096, 4096])
layers.0.attention.wk.weight: torch.Size([1024, 4096])
layers.0.attention.wv.weight: torch.Size([1024, 4096])
layers.0.attention.wo.weight: torch.Size([4096, 4096])
layers.0.feed_forward.w1.weight: torch.Size([14336, 4096])
layers.0.feed_forward.w2.weight: torch.Size([4096, 14336])
layers.0.feed_forward.w3.weight: torch.Size([14336, 4096])
layers.0.attention_norm.weight: torch.Size([4096])
layers.0.ffn_norm.weight: torch.Size([4096])
layers.1.attention.wq.weight: torch.Size([4096, 4096])
layers.1.attention.wk.weight: torch.Size([1024, 4096])
layers.1.attention.wv.weight: torch.Size([1024, 4096])
layers.1.attention.wo.weight: torch.Size([4096, 4096])
layers.1.feed_forward.w1.weight: torch.Size([14336, 4096])
layers.1.feed_forward.w2.weight: torch.Size([4096, 14336])
layers.1.feed_forward.w3.weight: torch.Size([14336, 4096])
layers.1.attention_norm.weight: torch.Size([4096])
layers.1.

In [2]:
import torch

x = torch.randn(1, 2, 8)
print(x)


tensor([[[ 0.5607,  0.5485, -0.6634, -0.4783, -0.5522,  0.0477, -0.0076,
          -1.6286],
         [ 1.4370,  0.7416,  0.1552,  1.0217,  0.2171, -0.1649, -0.0697,
          -0.0686]]])


In [7]:
x_c = torch.view_as_complex(x.reshape(1, 2, -1, 2))
print(x_c)

x_o = torch.view_as_real(x_c).flatten(2)
print(x_o)

tensor([[[ 0.5607+0.5485j, -0.6634-0.4783j, -0.5522+0.0477j, -0.0076-1.6286j],
         [ 1.4370+0.7416j,  0.1552+1.0217j,  0.2171-0.1649j, -0.0697-0.0686j]]])
tensor([[[ 0.5607,  0.5485, -0.6634, -0.4783, -0.5522,  0.0477, -0.0076,
          -1.6286],
         [ 1.4370,  0.7416,  0.1552,  1.0217,  0.2171, -0.1649, -0.0697,
          -0.0686]]])


In [19]:
dim = 8
max_len = 4
theta = 100.0

freqs = (1.0 / (theta**torch.arange(0, dim, 2)[:(dim//2)]).float() / dim)
print(freqs)

t = torch.arange(max_len, dtype=torch.float32)
print(t)

freqs = torch.outer(t, freqs)
print(freqs)

freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
print(freqs_cis)

tensor([1.2500e-01, 1.2500e-05, 1.2500e-09, 1.2500e-13])
tensor([0., 1., 2., 3.])
tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.2500e-01, 1.2500e-05, 1.2500e-09, 1.2500e-13],
        [2.5000e-01, 2.5000e-05, 2.5000e-09, 2.5000e-13],
        [3.7500e-01, 3.7500e-05, 3.7500e-09, 3.7500e-13]])
tensor([[1.0000+0.0000e+00j, 1.0000+0.0000e+00j, 1.0000+0.0000e+00j,
         1.0000+0.0000e+00j],
        [0.9922+1.2467e-01j, 1.0000+1.2500e-05j, 1.0000+1.2500e-09j,
         1.0000+1.2500e-13j],
        [0.9689+2.4740e-01j, 1.0000+2.5000e-05j, 1.0000+2.5000e-09j,
         1.0000+2.5000e-13j],
        [0.9305+3.6627e-01j, 1.0000+3.7500e-05j, 1.0000+3.7500e-09j,
         1.0000+3.7500e-13j]])


In [21]:
import torch

# 定义径向距离和角度
r = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float)
theta = torch.tensor([0.0, torch.pi / 2, torch.pi], dtype=torch.float)

# 将极坐标转换为笛卡尔坐标
cartesian_coords = torch.polar(r, theta)

print(cartesian_coords)

tensor([ 1.0000e+00+0.0000e+00j, -8.7423e-08+2.0000e+00j,
        -3.0000e+00-2.6227e-07j])


In [None]:
def precompute_freqs_cis_ntk(dim: int, end: int, theta: float = 10000.0, alpha: int= 16):
    theta = theta * alpha ** (dim / (dim-2))
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    t = torch.arange(end, device=freqs.device)
    freqs = torch.outer(t, freqs).float()
    freqs_cos = torch.cos(freqs)  # real
    freqs_sin = torch.sin(freqs)  # imaginary
    return freqs_cos, freqs_sin