# *About*

Forked from: https://colab.research.google.com/drive/1FxlUb_H6Xirhkx4RszAgHeb2uDW7oKIH by https://www.youtube.com/@vrsen

Authored by: RonanKMcGovern of https://research.trelis.com and https://huggingface.co/Trelis

# Install

In [None]:
# Workaround taken from the Stack Overflow thread on Colab shell commands not working:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale

_FORCED_ENCODING = "UTF-8"


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return _FORCED_ENCODING


# Replace the stdlib lookup so every caller of locale.getpreferredencoding gets UTF-8.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
# Install/upgrade the notebook's dependencies via pip (-q: quiet output, -U: upgrade).
# bitsandbytes: presumably required for the BitsAndBytesConfig 4-bit load used later.
!pip install -q -U bitsandbytes
# transformers and accelerate are installed straight from their GitHub repositories
# rather than from a released package.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# NOTE(review): einops, xformers and datasets are not imported in the visible cells;
# confirm they are still needed.
!pip install -q -U einops
!pip install -q -U safetensors
!pip install -q -U torch
!pip install -q -U xformers
!pip install -q -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━

# Import

In [None]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TextStreamer

### Mounting Google Drive (below) is optional. If you skip it, remove the `cache_dir` argument from the model and tokenizer `from_pretrained` calls.

In [None]:
# Optional: attach Google Drive so the Hugging Face cache directory can live on it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import os

# Directory on the mounted Drive that is passed as cache_dir to the from_pretrained calls.
cache_dir = "/content/drive/My Drive/huggingface_cache"

# exist_ok=True turns this into a no-op when the directory is already present.
os.makedirs(name=cache_dir, exist_ok=True)


# Load Quantized Model

In [None]:
# Hugging Face Hub repository to load (sharded model by RonanKMcGovern).
# Change the model here to load something else.
model_id = "Trelis/Llama-2-7b-chat-hf-sharded-bf16-5GB"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Pass the same cache_dir as the tokenizer and model below so that every file
# fetched for this model goes through one cache location.
config = transformers.AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    cache_dir=cache_dir,
)
# NOTE(review): unclear whether init_device really helps or how it interacts with device_map.
config.init_device = 'cuda:0'

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# For inference use device_map='auto'; for training use device_map={"": 0}.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
    cache_dir=cache_dir,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Debugging aid: list the attribute and method names available on the tokenizer object.
print(dir(tokenizer))




In [None]:
# Show the text form of the tokenizer's end-of-sequence token.
eos_text = tokenizer.eos_token
print("Current EOS token:", eos_text)


Current EOS token: </s>


In [None]:
# Re-initialize the tokenizer through the same cache_dir as the first load, so
# this cell does not replace it with a copy fetched into the default cache.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
tokenizer.pad_token = tokenizer.eos_token

sample_string = 'hello'

# Tokenize the sample string into a list of token ids
tokens = tokenizer.encode(sample_string)

# Count the number of tokens
token_count = len(tokens)

# Fetch the BOS and EOS token ids and decode them back to their text form
BOS_token_id = tokenizer.bos_token_id
EOS_token_id = tokenizer.eos_token_id
BOS_token = tokenizer.decode([BOS_token_id])
EOS_token = tokenizer.decode([EOS_token_id])

# Print the first and last encoded ids next to the BOS/EOS token text.
# NOTE: tokens[-1] is just the last id of the encoded sample; it equals the EOS
# id only if encode() appended one, so compare it with EOS_token_id before
# reading the second line as "the sequence ends with EOS".
print(f"Beginning of the sequence: {tokens[0]} (BOS token: {BOS_token})")
print(f"End of the sequence: {tokens[-1]} (EOS token: {EOS_token})")
print(f"EOS token: {EOS_token}")

# The message wording is kept as-is; the count refers to sample_string.
print(f"The number of tokens in the stringified JSON object is: {token_count}")

Beginning of the sequence: 1 (BOS token: <s>)
End of the sequence: 22172 (EOS token: </s>)
EOS token: </s>
The number of tokens in the stringified JSON object is: 2


In [None]:
# Print the loaded model's configuration (includes the quantization settings).
print(model.config)


LlamaConfig {
  "_name_or_path": "Trelis/Llama-2-7b-chat-hf-sharded-bf16-5GB",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "init_device": "cuda:0",
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.32.0.dev0",
  "use

In [None]:
def stream(
    user_prompt: str = 'Tell me about Thumbelina.',
    system_prompt: str = 'You are a helpful assistant that provides accurate and concise responses',
    max_new_tokens: int = 500,
) -> None:
    """Generate a reply to `user_prompt` and print it to stdout as it is produced.

    The system and user prompts are stripped of surrounding whitespace and
    wrapped in the [INST] / <<SYS>> chat markers before tokenization. Uses the
    module-level `tokenizer` and `model`.

    Args:
        user_prompt: The user's message.
        system_prompt: Instruction text placed inside the <<SYS>> section.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        None. The generated ids are discarded; the text is emitted by the streamer.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}"

    # Move the inputs to the device the model reports rather than a hard-coded
    # "cuda:0", so the function follows wherever the weights were placed.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [None]:
# Run the demo generation; the generated text is printed to stdout by the streamer.
stream()

<s> [INST]<<SYS>>
You are a helpful assistant that provides accurate and concise responses
<</SYS>>


Tell me about Thumbelina.[/INST]

Thumbelina is a classic fairy tale character created by Hans Christian Andersen in 1835. She is a tiny girl, no bigger than a thumb, who lives in a flower pot. The story begins with a young couple, Prince and Princess, who are unable to have children. One day, while the Prince is out, a fairy comes to the flower pot where Thumbelina is living and tells her that she is the daughter of the Prince and Princess.
Thumbelina grows up and meets a young man named Cornelius, who is also looking for a bride. They fall in love, but their families do not approve of their relationship due to their size difference. Thumbelina and Cornelius face many challenges and obstacles, but ultimately their love prevails.
The story of Thumbelina teaches us about the power of love and acceptance, and how even the smallest of creatures can make a big impact in the world. It is a 