In [2]:
%pip install -qU transformers bitsandbytes optimum accelerate torch python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Importing the package first makes a missing python-dotenv install fail right
# here with an ImportError instead of a less obvious magic error below.
import dotenv
# %reload_ext (rather than %load_ext) keeps this cell safe to re-run in a live kernel.
%reload_ext dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face access token used
# when downloading the checkpoint later — confirm which variable is expected.
%dotenv 

In [25]:
# Login helpers from huggingface_hub. Neither is called in the visible cells.
# NOTE(review): authentication presumably comes from the environment populated
# by the %dotenv cell instead — confirm, or drop the unused import.
from huggingface_hub import notebook_login, login

# Interactive alternative: asks for a Hub token inside the notebook.
# notebook_login()

In [26]:
import accelerate
import torch
import transformers

# Report the versions of the core libraries so the run can be reproduced later.
for label, module in (
    ("Transformers", transformers),
    ("Accelerate", accelerate),
    ("PyTorch", torch),
):
    print(f"{label} version: {module.__version__}")

Transformers version: 4.36.2
Accelerate version: 0.26.1
PyTorch version: 2.1.2+cu121


In [27]:
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Hub identifier of the checkpoint used throughout the notebook.
checkpoint = "meta-llama/Llama-2-7b-hf"

# Build the architecture from its config alone, inside init_empty_weights, so
# the layer layout can be inspected before any real weights are loaded.
config = AutoConfig.from_pretrained(checkpoint)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Let accelerate decide where each module goes; a LlamaDecoderLayer is never
# split across devices.
# NOTE(review): no dtype is passed here while the weights are later loaded as
# float16 — the placement may be more conservative than necessary; confirm
# whether passing dtype=torch.float16 still leaves enough GPU headroom.
device_map = infer_auto_device_map(
    model,
    no_split_module_classes=["LlamaDecoderLayer"],
)
device_map

OrderedDict([('model.embed_tokens', 0),
             ('model.layers.0', 0),
             ('model.layers.1', 0),
             ('model.layers.2', 0),
             ('model.layers.3', 0),
             ('model.layers.4', 0),
             ('model.layers.5', 0),
             ('model.layers.6', 0),
             ('model.layers.7', 0),
             ('model.layers.8', 0),
             ('model.layers.9', 0),
             ('model.layers.10', 0),
             ('model.layers.11', 0),
             ('model.layers.12', 0),
             ('model.layers.13', 0),
             ('model.layers.14', 0),
             ('model.layers.15', 0),
             ('model.layers.16', 0),
             ('model.layers.17', 0),
             ('model.layers.18', 'cpu'),
             ('model.layers.19', 'cpu'),
             ('model.layers.20', 'cpu'),
             ('model.layers.21', 'cpu'),
             ('model.layers.22', 'cpu'),
             ('model.layers.23', 'cpu'),
             ('model.layers.24', 'cpu'),
             ('mo

In [28]:
# Keep the output projection on GPU 0, alongside the embeddings and the early
# decoder layers.
device_map["lm_head"] = 0

# Move the last two GPU-resident decoder layers to the CPU (presumably to free
# GPU memory for lm_head and activations — confirm). The inferred map keys
# Llama's blocks as "model.layers.N"; a "model.decoder.layers.N" prefix matches
# nothing in this model and would leave the placement unchanged.
for layer_index in (16, 17):
    device_map[f"model.layers.{layer_index}"] = "cpu"

# Load the real weights in half precision following the adjusted placement.
# `offload_folder` is where weights are written if anything has to spill to
# disk; `offload_state_dict` lets the loader stage the CPU state dict on disk
# to limit peak host RAM while loading.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map=device_map,
    offload_folder="offload",
    offload_state_dict=True,
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [30]:
# Stream decoded text to stdout as tokens are produced; special tokens are
# dropped from the printed text.
streamer = TextStreamer(tokenizer, skip_special_tokens=True)

# Alternative prompts kept for quick experiments; swap one in to try it.
# input_text = "Hello my dog is cute and"
# input_text="When you look at the USA through a European's eyes, there is something to"
input_text = "Here is a script written in Python to find the RMS for a given sequence of numbers:"
# input_text="The difference between Harry Potter and Hunger Games is"
# input_text="Piglet, Pooh's best friend, likes to"

# The embedding layer is placed on GPU 0 in the device map, so the token ids
# are moved to CUDA before generation.
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Sampling is stochastic: fix the seed so a fresh Restart & Run All reproduces
# the same continuation on the same hardware/software stack.
SEED = 42
torch.manual_seed(SEED)

# Restrict scaled-dot-product attention on CUDA to the FlashAttention kernel
# (the math and memory-efficient fallbacks are disabled).
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    # Bind the result to a real name rather than `_`, which would shadow
    # IPython's "last output" variable for the rest of the session.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        streamer=streamer,
    )

# Non-streaming alternative: decode the full sequence in one go.
# print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Here is a script written in Python to find the RMS for a given sequence of 

numbers:

\begin{code}
import numpy as np
import matplotlib.pyplot as plt

def rms(x):
   return np.sqrt(np.mean(np.square(x)))

x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])

plt.plot(x, label='Original')
plt.plot(rms(x), label='RMS')
plt.xlabel('Values')
plt.ylabel('RMS')
plt.legend()
plt.show()
\end{code}

<a href="https://i.stack.imgur.com/6Fv7J.png" rel="nofollow noreferrer"><IMAGE></a>

The function rms is defined as:

\begin{code}
def rms(x
