In [48]:
from transformers import LlamaForCausalLM
import torch

if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
    device = 'mps'
    print("MPS is available and will be used.")

token = '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'
model = 'meta-llama/Llama-2-7b-hf'
model = LlamaForCausalLM.from_pretrained(model, torch_dtype=torch.float16, token=token, proxies={
    "http": "http://127.0.0.1:7890",
    "https": "http://127.0.0.1:7890",
})


MPS is available and will be used.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [49]:
model.seqlen = 2048
print("model_seq_len:", model.seqlen)
print("model.config.hidden_size:", model.config.hidden_size)
layers = model.model.layers
print("model.model.layers", model.model.layers)
layers[0] = layers[0].to(device)
print("layers[0]", layers[0])


model_seq_len: 2048
model.config.hidden_size: 4096
model.model.layers ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
      (act_fn): SiLUActivation()
    )
    (input_layernorm): LlamaRMSNorm()
    (post_attention_layernorm): LlamaRMSNorm()
  )
)
layers[0] LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_

In [46]:
import torch.nn as nn
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)
        
    def forward(self, x):
        x = self.linear(x)
        return x
    
cache = {'i': 0, 'attention_mask': None}
inps = torch.zeros((4, model.seqlen, model.config.hidden_size)).to(device)
class Catcher(nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module
        
    def forward(self, inp, **kwargs):
        inps[cache['i']] = inp
        cache['i'] += 1
        cache['attention_mask'] = kwargs['attention_mask']
        cache['position_ids'] = kwargs['position_ids']
        raise ValueError


In [66]:
token = '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'

def get_ptb_new(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train', token=token)
    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test', token=token)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, token=token)
    print("traindata['sentence'] ", traindata['sentence'][0] ) 
    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc
nsamples = 1
seed = 42
seqlen = 2048

dataloader, testloader = get_ptb_new(nsamples, seed, seqlen, "meta-llama/Llama-2-7b-hf")

traindata['sentence']  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter


In [61]:
for batch in dataloader:
    print(batch[0].shape)

torch.Size([1, 2048])


In [63]:
for batch in dataloader:
    try:
        model(batch[0])
    except ValueError:
        pass

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, mps:0 and cpu!

In [47]:
layers[0] = Catcher(layers[0])
print(layers[0])


Catcher(
  (module): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
      (act_fn): SiLUActivation()
    )
    (input_layernorm): LlamaRMSNorm()
    (post_attention_layernorm): LlamaRMSNorm()
  )
)


In [27]:
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(
            child, layers=layers, name=name + '.' + name1 if name != '' else name1
        ))
    return res

res = find_layers(model)

seq = [list(res.keys())]
print(seq)
for names in seq:
    print(names)
    subset = {n: res[n] for n in names}
    print(subset)

[['0.module']]
['0.module']
{'0.module': Linear(in_features=10, out_features=1, bias=True)}
