In [None]:
# Notebook by Daniel Zakrisson at Scaleout - www.scaleoutsystems.com
# 
# This process loads large models (i.e. models split across several .bin files); small models
# use a different process (see separate example).
# 
# Assumes models are downloaded and available in a sub directory, 
# e.g. using 'git clone https://huggingface.co/AI-Sweden-Models/gpt-sw3-6.7b-v2'
# 
# There also needs to be a sub folder named "offload" if the device map (see below) uses any disk.

import torch
from transformers import pipeline, AutoModelForCausalLM

# Select the first GPU when PyTorch can see one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"

# Empty the GPU cache and report how much GPU memory is already taken before the model is
# loaded. A tiny tensor is placed on the GPU first so that CUDA is initialized and its own
# overhead is included in the numbers; subtract them from the GPU budget in the device map.
# The CUDA calls are guarded because torch.ones(1).cuda() raises when no GPU is available.
if cuda_available:
    torch.cuda.empty_cache()
    torch.ones(1).cuda()
    print(torch.cuda.memory_reserved(0))   # bytes reserved by PyTorch's caching allocator on GPU 0
    print(torch.cuda.memory_allocated(0))  # bytes currently occupied by tensors on GPU 0
else:
    print("CUDA is not available - skipping GPU memory check")

In [None]:
# Model initiation is done differently than with smaller models, 
# explicitly initialize empty weights and then tie the weights to avoid 
# loading all weights twice (double memory usage). Accelerate is used to split model across devices.

from transformers import AutoConfig
from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map

# Local directory holding the downloaded checkpoint (config, tokenizer and weight shards).
checkpoint = 'gpt-sw3-6.7b-v2'
config = AutoConfig.from_pretrained(checkpoint)

# Build the model structure from the config only: inside init_empty_weights() no real weight
# memory is allocated. Tie the weights afterwards so shared parameters are not counted twice.
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
model.tie_weights()

# Create a device map to explicitly set max memory usage on devices. Avoid GPU out of memory by
# subtracting the memory already in use on the GPU (printed in the first cell) from the GPU budget.
# Try adding dtype='float16' to reduce memory usage (but check performance!)
#
# no_split_module_classes keeps every block that contains a residual connection on a single
# device; splitting such a block across devices fails at inference time. The list is taken from
# the model class when it declares one (None means "no restriction", the previous behaviour).
device_map = infer_auto_device_map(
    model,
    max_memory={0: "11GiB", "cpu": "30GiB"},
    no_split_module_classes=getattr(model, "_no_split_modules", None),
)

# If you get an Out Of Memory error in the following steps (or when inferencing), try to
# explicitly move one or several layers from the GPU to CPU/disk.
import json

#device_map["transformer.h.9"] = "cpu" 
#device_map["transformer.h.10"] = "cpu" 

#print(json.dumps(device_map, indent=2))

In [None]:
import os

from accelerate import load_checkpoint_and_dispatch

# Folder for weights that the device map places on disk. It is created here so the cell also
# works on a fresh checkout, without a manual "mkdir offload" step.
offload_dir = 'offload'
os.makedirs(offload_dir, exist_ok=True)

# Load the checkpoint into the empty model and place each part on the device chosen in
# device_map. `checkpoint` is the same local directory the config was read from.
model = load_checkpoint_and_dispatch(
    model,
    checkpoint,
    device_map=device_map,
    offload_folder=offload_dir,
    offload_state_dict=True,
)
#model.hf_device_map # print the device map to check what is loaded on different devices (GPUs, CPU and disk)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same local checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

# Chat-style prompt. The string must open with exactly three quotes: a fourth quote would
# become a literal '"' at the start of the prompt, which .strip() does not remove.
prompt = """
<|endoftext|><s>
User:
Vad är sentimentet i följande mening? Den nya hemsidan är ganska snygg
<s>
Bot:
""".strip()

# Move the encoded prompt to the device selected in the first cell ("cuda:0" when a GPU is present).
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sample up to 100 new tokens; [0] picks the single sequence of the batch.
output = model.generate(
    inputs=inputs["input_ids"],
    do_sample=True,
    max_new_tokens=100,
    temperature=0.75,
    top_p=0.95,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id  # use the EOS token id for padding
)[0]

# Decode the full output sequence back to text and show it.
generated_text = tokenizer.decode(output)
print(generated_text)