In [40]:
def streaming_generate(model, prompt: str):
        from threading import Thread
        from transformers import TextIteratorStreamer
        from transformers import GenerationConfig

        tokenized = tokenizer(prompt, return_tensors="pt")
        input_ids = tokenized.input_ids
        input_ids = input_ids.to(model.device)
        
        tokens_for_summary = 50
        output_tokens = input_ids.shape[1] + tokens_for_summary

        generation_config = GenerationConfig(
            do_sample=True,
            temperature=1.0,
            max_new_tokens=output_tokens,
        )

        streamer = TextIteratorStreamer(
            tokenizer, skip_special_tokens=True,
        )
        generate_kwargs = dict(
            input_ids=input_ids,
            generation_config=generation_config,
#             return_dict_in_generate=True,
#             eos_token_id=tokenizer.eos_token_id,
#             pad_token_id=tokenizer.eos_token_id,
#             bos_token_id=tokenizer.bos_token_id,
#             attention_mask=tokenized.attention_mask,
#             output_scores=True,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            streamer=streamer,
        )

        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()
        for new_text in streamer:
            yield new_text

        thread.join()

In [None]:
def load_adapter(falcon, lora_apply_dir=None, lora_config=None, ddp=None):
    if lora_apply_dir is None:
        model = get_peft_model(falcon, lora_config)
    else:
        if ddp:
            device_map = {'': 0}
        else:
            if torch.cuda.device_count() > 1:
                device_map = "auto"
            else:
                device_map = {'': 0}

        print('Device map for lora:', device_map)

        model = PeftModel.from_pretrained(
            falcon, lora_apply_dir, device_map=device_map,
            torch_dtype=torch.float32, is_trainable=True)

        print(lora_apply_dir, 'loaded')

    return model

In [None]:
import torch
import time

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig

model_id = "/mnt/artifacts/falcon_40b_model"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Falcon requires you to allow remote code execution. This is because the model uses a new architecture that is not part of transformers yet.
# The code is provided by the model authors in the repo.

model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, quantization_config=bnb_config, device_map="auto", cache_dir='/mnt/artifacts/falcon_40b/')

In [None]:
# Set the Falcon tokenizer
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from peft import LoraConfig, get_peft_model,PeftModel

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
        ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [None]:
# Load the adapter
model.config.use_cache = True
# Please check the location and change accordingly. Your checkpoint name may be different
model = load_adapter(model, lora_apply_dir='/mnt/artifacts/outputs_sample_8bit/checkpoint-125/')

In [None]:
from datasets import load_dataset
from random import randint

# Load dataset from the hub
test_dataset = load_dataset("samsum", split="test")

In [None]:
# select a random test sample
sample = test_dataset[randint(0, len(test_dataset))]

# format sample
prompt_template = f"Summarize the chat dialogue:\n{{dialogue}}\n---\nSummary:\n"

test_sample = prompt_template.format(dialogue=sample["dialogue"])

print(test_sample)

In [None]:
input_ids = tokenizer(test_sample, return_tensors="pt").input_ids
input_ids = input_ids.to('cuda')

In [34]:
#set the tokens for the summary evaluation
from transformers import GenerationConfig
tokens_for_summary = 50
output_tokens = input_ids.shape[1] + tokens_for_summary

start_time = time.time()
generation_config = GenerationConfig(
            do_sample=True,
            max_new_tokens=output_tokens,
            pad_token_id=tokenizer.pad_token_id,
        )

with torch.no_grad():
#     outputs = model.generate(inputs=input_ids, do_sample=True, pad_token_id=tokenizer.pad_token_id, max_length=output_tokens)
      outputs = model.generate(inputs=input_ids, generation_config=generation_config)  
end_time = time.time()
gen_text = tokenizer.batch_decode(outputs)[0]
print(gen_text)

Summarize the chat dialogue:
Carmen: how are you feeling, Viola? it is so so close...
Alfred: My dearest Viola <3
Viola: I think as one's feeling before the wedding - a little bit light in the stomach! ive got some things to organize still!
Carmen: i will be on friday night, i could give you a helping hand :))
Viola: Thanks darling, i will let you know x
Carmen: (Y) my number just in case +00123456789
Viola: (Y) <3
---
Summary:
1. Carmen asks Viola that how Viola is feeling for her wedding, which is getting closer.

I feel i'm not getting the chat dialogue here.
>>COMMENT<< @Lampros - I am not sure, if I am getting the same chat dialogue as your answer - could be a bit different.>>COMMENT<< I can understand the chat and my answer matches what you wrote>>COMMENT<< Ohh. That's not the output i received.. Any way, thanks a lot.>>ANSWER<< You did ask a bit different question - you posted a slightly different chat log without any explanation.

I assume that the last line should say "that's 

In [None]:
print(f'\nTook {round(end_time - start_time, 3)} s') 

In [None]:
# Stream the output
# for text in streaming_generate(model,test_sample):
#     print(text, end="", flush=True)