# Install the necessary dependencies

In [1]:
#install the dependencies
!pip install -qU transformers accelerate einops xformers bitsandbytes

[0m

# Login to Hugging Face

In [3]:
# from huggingface_hub import login
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Initialize and load a pre-trained model for language modeling using the Hugging Face Transformers library.

In [2]:
#import necessary libraries to get the model running
import torch
from torch import cuda, bfloat16
import transformers

model_name = 'tiiuae/falcon-40b'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto'
)
model.eval()
print(f"Model loaded on {device}")

(…)iuae/falcon-40b/resolve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

(…)40b/resolve/main/configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-40b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.




(…)lcon-40b/resolve/main/modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-40b:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


(…)esolve/main/pytorch_model.bin.index.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

pytorch_model-00001-of-00009.bin:   0%|          | 0.00/9.50G [00:00<?, ?B/s]

pytorch_model-00002-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00003-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00004-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00005-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00006-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00007-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00008-of-00009.bin:   0%|          | 0.00/9.51G [00:00<?, ?B/s]

pytorch_model-00009-of-00009.bin:   0%|          | 0.00/7.58G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

(…)-40b/resolve/main/generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Model loaded on cuda:0


# Create the tokenizer

In [3]:
#create a tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

(…)n-40b/resolve/main/tokenizer_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

(…)e/falcon-40b/resolve/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

(…)40b/resolve/main/special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

# Convert specific token sequences into their numerical representations a.k.a (token IDs) using a tokenizer

In [4]:
from transformers import StoppingCriteria, StoppingCriteriaList
#convert token sequences into their corresponding token IDs
stop_token_ids = [tokenizer.convert_tokens_to_ids(x) for x in [['Human', ':'], ['AI', ':']]]
stop_token_ids

[[23431, 37], [17362, 37]]

# Convert each integer value in the stop_token_ids list into a PyTorch LongTensor and then move the resulting tensors to a specified device

In [5]:
import torch
#converting each integer value in stop_token_ids into a PyTorch LongTensor and moving it to a specified device
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([23431,    37], device='cuda:0'),
 tensor([17362,    37], device='cuda:0')]

# Define the Stopping criteria for Falcon 40b

In [6]:
from transformers import StoppingCriteria, StoppingCriteriaList
# a custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Set up a the text generation pipeline using the Hugging Face Transformers library.

In [7]:
#create the model pipeline
generate_text = transformers.pipeline(
    model=model, #pass the model
    tokenizer=tokenizer, #pass the tokenizer
    return_full_text=True,  #to return the original query, making it easier for prompting.
    task='text-generation', #task
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  #to eliminate unnecessary conversations
    temperature=0.3,  #for 'randomness' of model outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  #max number of tokens to generate in the output
    repetition_penalty=1.1  #without this output begins repeating (make sure to experiment with this)
)

# Generate the output

In [8]:
#generate output
res = generate_text("Explain to me the difference between centrifugal force and centripetal force.")
print(res[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Explain to me the difference between centrifugal force and centripetal force.
Centrifugal force is a fictitious force that acts on an object moving in a circular path. It is directed away from the center of rotation. Centripetal force is the real force that acts on an object moving in a circular path. It is directed toward the center of rotation.
This page was last updated June 27, 2015.


In [9]:
#generate sequential output
sequences = generate_text("Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [10]:
#print the sequential output
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.
Daniel: Hello, Girafatron!
Girafatron: Hello, Daniel.
Daniel: How are you today?
Girafatron: I am fine, thank you.
Daniel: What is your favorite food?
Girafatron: I enjoy eating leaves from the tallest trees.
Daniel: What is your favorite color?
Girafatron: I like the color yellow.
Daniel: What is your favorite movie?
Girafatron: I like the movie "Giraffes Gone Wild."
Daniel: What is your favorite television show?
Girafatron: I like the show "Giraffes Gone Wild."
Daniel: What is your favorite book?
Girafatron: I like the book "Giraffes Gone Wild."
Daniel: What is your favorite song?
Girafatron: I like the song "Giraffes Gone Wild."
Daniel: What is your favorite band?
Girafatron: I like the band "Giraffes Gone Wild."
Daniel: What is your favorite sport?
Girafatron: I like the sport "Gir