# Zephyr 7B with SHAP

In [1]:
import os
from pathlib import Path

# rattle specific: expose only the last 3 GPUs to this kernel.
# This must run before torch or transformers is imported.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4,5,6"})

import shap
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    pipeline,
)

# Surface a missing GPU early instead of failing later during model loading.
if not torch.cuda.is_available():
    print("Warning: CUDA not available!")

  torch.utils._pytree._register_pytree_node(


In [2]:
# --- Configuration -----------------------------------------------------------
# Value passed as `device_map` when loading the model and building the pipeline.
gpus = "sequential"
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Adapters are expected under <parent of cwd>/data/trained_adapters/<model_name>.
base_path = Path.cwd().parent
data_path = base_path / "data"
model_path = data_path / "trained_adapters" / model_name

# Sampling is enabled below, so seed torch to make a fresh "Run All" repeatable.
seed = 42
torch.manual_seed(seed)

# Sampling settings, kept in one place so the standalone config and the
# model's own generation config cannot drift apart.
sampling_params = dict(
    max_new_tokens=32,
    temperature=0.7,
    top_k=20,
    top_p=0.95,
    do_sample=True,
)
generation_config = GenerationConfig(**sampling_params)

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

# --- Base model --------------------------------------------------------------
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally; presumably not required for this checkpoint - confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=gpus,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=quantization_config,
)

# Apply only the explicit sampling overrides to the model's generation config.
# Copying a full GenerationConfig dict onto `model.config` would also write its
# unset fields (e.g. bos/eos/pad token ids = None) over the checkpoint's values.
model.generation_config.update(**sampling_params)

# --- Tokenizer and pipeline --------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    trust_remote_code=True
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map=gpus
)

# --- Fine-tuned adapter ------------------------------------------------------
# Wrap the base model with the trained PEFT adapter and make the existing
# pipeline generate with the adapted model.
adapter_name = model_path / "checkpoint-zora_instruct"
model = PeftModel.from_pretrained(model, adapter_name)
pipe.model = model


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [4]:
# Smoke test: run the pipeline on a short prompt and display the text of the
# first returned generation.
generations = pipe("hello")
generations[0]["generated_text"]

'hello world.SDG 3 - Good Health and Well-being: Aims to ensure healthy lives and promote well-being for all at all ages.SD'