In [2]:
# pip install transformers accelerate pandas matplotlib scikit-learn numpy bitsandbytes intel-extension-for-pytorch

In [3]:
import torch, time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import manifold
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch.nn as nn

In [4]:
# Hugging Face model id of the checkpoint loaded by the tokenizer and model cells.
model_path = "ibm-granite/granite-3b-code-base"
# Passed as `device_map` when the model is loaded.
device = "cpu"

In [5]:
# Load the tokenizer published alongside the checkpoint named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [6]:
# Plain English sentence. In the recorded output, words that follow a space
# carry a leading 'Ġ', and "fox" / "jumped" are split into sub-word pieces.
tokenizer.tokenize("the quick brown fox jumped over the lazy dog.")

['the',
 'Ġquick',
 'Ġbrown',
 'Ġf',
 'ox',
 'Ġjump',
 'ed',
 'Ġover',
 'Ġthe',
 'Ġlazy',
 'Ġdog',
 '.']

In [7]:
# Small Python snippet. In the recorded output, newlines show up as 'Ċ', and the
# indentation before the closing quotes becomes one long whitespace token.
tokenizer.tokenize("""
def add(x, y):
    return x + y
                   """)

['Ċ',
 'def',
 'Ġadd',
 '(',
 'x',
 ',',
 'Ġy',
 '):',
 'ĊĠĠĠ',
 'Ġreturn',
 'Ġx',
 'Ġ+',
 'Ġy',
 'ĊĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ']

In [8]:
# Business-domain request. In the recorded output, "insurance" and "Roger" are
# each split into several sub-word pieces.
tokenizer.tokenize("can you help me create a new car insurance policy quote for Roger?")

['can',
 'Ġyou',
 'Ġhelp',
 'Ġme',
 'Ġcreate',
 'Ġa',
 'Ġnew',
 'Ġcar',
 'Ġins',
 'urance',
 'Ġpolicy',
 'Ġquote',
 'Ġfor',
 'ĠR',
 'og',
 'er',
 '?']

In [9]:
# Medical-domain question. In the recorded output, "salmonella" and "diagnosis"
# are each split into several sub-word pieces.
tokenizer.tokenize("what is date of patient's salmonella diagnosis")

['what',
 'Ġis',
 'Ġdate',
 'Ġof',
 'Ġpatient',
 "'s",
 'Ġsal',
 'mon',
 'ella',
 'Ġdi',
 'agnosis']

In [10]:
# Load the causal LM with 8-bit weights (the printed structure shows the
# projection layers as Linear8bitLt).
# Note: device "meta" does not load weights, so a real device is used here.
quant = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    quantization_config=quant
    )
# Switch to inference mode; eval() returns the module, so the cell also
# displays the model structure.
model.eval()

  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: AutocastCPU
  previous kernel: registered at /pytorch/aten/src/ATen/autocast_mode.cpp:327
       new kernel: registered at /opt/workspace/ipex-cpu-dev/csrc/cpu/autocast/autocast_mode.cpp:112 (function operator())


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
  CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (o_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (up_proj): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (down_proj): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2560,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2560,), eps=1e-05)
      )
    )
    (nor

In [11]:
# Toy network used to illustrate Jacobians: two bias-free linear layers (3 -> 3 -> 1).
# The layer weights are drawn at random, so seed the global RNG first; otherwise the
# numbers shown by this and the following cells change on every Restart & Run All.
torch.manual_seed(0)
tiny = nn.Sequential(
    nn.Linear(3, 3, bias=False, device="cpu"),
    nn.Linear(3, 1, bias=False, device="cpu"),
)
tiny

Sequential(
  (0): Linear(in_features=3, out_features=3, bias=False)
  (1): Linear(in_features=3, out_features=1, bias=False)
)

In [12]:
# Random 3-element probe vector, pushed through the toy network.
# NOTE(review): `input` shadows the Python builtin of the same name; the Jacobian
# cell below reuses this variable, so any rename has to change both cells together.
input = torch.randn(3)
tiny(input)

tensor([0.0425], grad_fn=<SqueezeBackward4>)

In [13]:
# Jacobian of the toy network's output with respect to `input`. Both layers are
# bias-free linear maps, so this should equal the product of the two weight
# matrices (a 1 x 3 row, as in the recorded output).
torch.autograd.functional.jacobian(tiny, input)

tensor([[ 0.0881, -0.0843, -0.0606]])

In [14]:
# Direct child modules of the causal-LM wrapper. Per the printed structure above,
# the first child is the inner `LlamaModel` decoder.
t = list(model.children())

In [15]:
# Token-embedding table of the inner decoder. Addressed by attribute name rather
# than by child position, which matches how the decoder is referenced when it is
# wrapped later (`model.model`) and does not depend on module registration order.
embed = model.model.embed_tokens

In [16]:
# Embed token ids 0..48999 in one batch and report the largest / smallest component.
# NOTE(review): the embedding table has 49152 rows, so the last 152 ids are not
# included here — confirm whether 49000 is intentional.
x = embed(torch.arange(49000, dtype=torch.long))
(x.max(), x.min())

(tensor(0.8750, dtype=torch.float16, grad_fn=<MaxBackward1>),
 tensor(-0.3008, dtype=torch.float16, grad_fn=<MinBackward1>))

In [17]:
# Shape of the embedding matrix; the recorded output is 49152 rows by 2560 columns,
# matching the `Embedding(49152, 2560, ...)` line in the printed model structure.
embed.weight.shape

torch.Size([49152, 2560])

In [None]:
from transformers import LlamaModel
class WrapLM(nn.Module):
    """Run a Llama decoder stack directly on input embeddings.

    The wrapper skips the token-embedding lookup and exposes the decoder as a
    plain ``tensor -> tensor`` function, which is the call shape expected by
    utilities such as ``torch.autograd.functional.jacobian``.

    Args:
        llm: The inner decoder model (e.g. ``model.model`` of a causal LM). It
            must provide ``config``, ``layers``, ``norm``, ``rotary_emb`` and
            ``_update_causal_mask``.
    """

    def __init__(self, llm: LlamaModel):
        super().__init__()
        self.llm = llm
        # forward() reads num_hidden_layers from the config, so keep a reference;
        # without it the layer loop fails with AttributeError.
        self.config = llm.config
        self.layers = llm.layers
        self.norm = llm.norm
        self.rotary_emb = llm.rotary_emb

    def forward(self, emb: torch.Tensor) -> torch.Tensor:
        """Apply every decoder layer and the final norm to ``emb``.

        Args:
            emb: Input embeddings; dimension 1 is used as the sequence length
                (assumed layout ``(batch, seq_len, hidden)`` — TODO confirm).

        Returns:
            The normalised hidden states produced by the last decoder layer.
        """
        inputs_embeds = emb

        # No KV cache is used, so positions simply count up from zero.
        seq_len = inputs_embeds.shape[1]
        cache_position = torch.arange(seq_len, device=inputs_embeds.device)
        position_ids = cache_position.unsqueeze(0)

        # NOTE(review): `_update_causal_mask` is a private helper of the wrapped
        # model; its availability and signature depend on the installed
        # transformers version — confirm when upgrading.
        causal_mask = self.llm._update_causal_mask(
            None,  # attention_mask
            inputs_embeds,
            cache_position,
            None,  # past_key_values
            False,  # output_attentions
        )

        hidden_states = inputs_embeds

        # Rotary position embeddings are computed once and shared by all layers.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=None,
                output_attentions=False,
                use_cache=False,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )
            # Each layer returns a tuple whose first element is the new hidden state.
            hidden_states = layer_outputs[0]

        return self.norm(hidden_states)

In [33]:

# Wrap the inner decoder (`model.model` is the LlamaModel inside the causal-LM
# wrapper, per the printed structure) so it can be called on embeddings directly.
wm = WrapLM(model.model)

In [34]:
# Embedding of a single token id, given as a batch of one sequence of length one;
# the recorded output is a 3-D float16 tensor.
embed(torch.LongTensor([[1000]]))

tensor([[[-0.0151,  0.0092, -0.0164,  ..., -0.0131,  0.0002, -0.0076]]],
       dtype=torch.float16, grad_fn=<EmbeddingBackward0>)

In [35]:
# Single-token smoke test of the wrapped decoder.
# NOTE(review): the NameError recorded under this cell mentions a bare `llm`, which
# the WrapLM.forward shown above never references — presumably an older class
# definition was still live in the kernel. Re-run the class cell and the
# `wm = WrapLM(...)` cell before this one.
emb = embed(torch.LongTensor([[1000]]))
# Call the module itself rather than .forward() so registered hooks are honoured.
wm(emb)

NameError: name 'llm' is not defined