In [2]:
from pathlib import Path
from omnibelt import human_readable_number
import torch
torch.set_default_device('cuda')
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

In [3]:
def count_parameters(model, trainable_only=False):
	"""Count the scalar parameters held by a model.

	Args:
		model: Object exposing ``parameters()`` that yields tensors
			(e.g. a ``torch.nn.Module``).
		trainable_only: When True, only parameters whose ``requires_grad``
			flag is set are counted (handy for adapter/LoRA fine-tuning).
			Defaults to False, which counts every parameter.

	Returns:
		int: Sum of ``numel()`` over the selected parameters.
	"""
	return sum(
		p.numel()
		for p in model.parameters()
		if not trainable_only or p.requires_grad
	)

In [4]:
# Load the phi-1.5 checkpoint; torch_dtype="auto" lets the library pick the weight dtype.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
# torch_dtype is a model-weight option and has no meaning for a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Show the total parameter count of the loaded model in compact form.
n_params = count_parameters(model)
print(human_readable_number(n_params))

In [34]:
# Tokenize the benchmark prompt (no attention mask is requested) and make sure the
# tensors live on the same device as the model instead of relying on the default device.
inputs = tokenizer('''Imagine a world where two distinct genres, such as cyberpunk and renaissance romance, have been seamlessly blended. Describe a day in the life of a character living in this unique world.''', return_tensors="pt", return_attention_mask=False).to(model.device)

# Time the generate() call with CUDA events recorded around it.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
outputs = model.generate(**inputs, max_length=500)
end_event.record()
# Wait for the GPU so both events have completed before reading the elapsed time.
torch.cuda.synchronize()
elapsed_time = start_event.elapsed_time(end_event)  # milliseconds

# New tokens = full output length minus prompt length.
total_generated_tokens = len(outputs[0]) - len(inputs['input_ids'][0])
tokens_per_second = total_generated_tokens / elapsed_time * 1000
# Fixed-point formatting: the previous '.2g' spec rounded to two significant digits
# and switched to scientific notation (e.g. 1.3e+02) for speeds of 100 tps or more.
print(f'Speed: {tokens_per_second:.1f} tps')

Speed:  34 tokens per second


In [35]:
# Convert the generated token ids back to strings and show the first sequence.
decoded = tokenizer.batch_decode(outputs)
text = decoded[0]
print(text)

Imagine a world where two distinct genres, such as cyberpunk and renaissance romance, have been seamlessly blended. Describe a day in the life of a character living in this unique world.

Answer: In this world, a character named Alex wakes up to the sound of a melodic symphony, composed entirely of futuristic electronic beats. As Alex steps outside, they are greeted by a breathtaking sight - a cityscape adorned with vibrant neon lights and towering skyscrapers. The streets are filled with people dressed in futuristic attire, their movements synchronized to the rhythm of the music.

Alex's day begins with a visit to a futuristic art gallery, where they encounter a mesmerizing installation that combines elements of cyberpunk and renaissance romance. The artwork, created by a renowned artist, depicts a couple locked in an embrace, their bodies intertwined with intricate electronic patterns. The colors and textures evoke a sense of passion and longing, blurring the lines between the two ge

In [32]:
# Generated-token count: length of the first output sequence minus the prompt length.
prompt_length = len(inputs['input_ids'][0])
total_generated_tokens = len(outputs[0]) - prompt_length

In [31]:
# Display the current value of total_generated_tokens (bare last expression of the cell).
# NOTE(review): the saved output of this cell is a CUDA tensor, whereas the
# len(...) - len(...) expression used elsewhere in this notebook yields a plain int;
# the output presumably stems from an earlier version of the computation - re-run to refresh.
total_generated_tokens

tensor([-24653,    243,   -495,   -310,   -234,  -6810, -27462,    489,   -384,
           145,  -9575, -29854,    210, -48417, -19161,    489,     77,    -87,
        -33181, -31563,    487, -38873,  -4392,    243,   -610,    213,    238,
          -704,    214,    243,  -1595,  -2377,    213,     72,  -3248,   -495,
           487], device='cuda:0')

In [18]:
# out = pipeline('text-generation', model="microsoft/phi-1_5", trust_remote_code=True)
# out

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Both the tokenizer and the model come from the same hub checkpoint.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(llama_checkpoint)
model = AutoModelForCausalLM.from_pretrained(llama_checkpoint)

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2023-09-27 11:43:36.450276: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Configuration cell: every tunable value used by the loading, training and
# merging cells below is defined here as a module-level name.

# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name (also the path the trained adapter is saved to)
new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# (name of a torch attribute; resolved with getattr(torch, ...) in the loading cell)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

In [3]:
# Fetch the training split of the instruction dataset (preprocessing could be added here)
dataset = load_dataset(dataset_name, split="train")

# Turn the dtype name from the config cell into the corresponding torch attribute
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bitsandbytes quantization settings handed to from_pretrained below
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# When running 4-bit with float16 compute, print a hint if the GPU's major
# compute capability is 8 or higher
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("\n".join([
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
        ]))

# Base model, loaded with the quantization config and the configured device map
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map=device_map
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Tokenizer: reuse EOS as the padding token and pad on the right
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token


Downloading readme:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Tokenize the benchmark prompt (no attention mask is requested) and move the tensors
# to the model's device: no default CUDA device is set in this session, so the
# tokenizer output would otherwise not be placed next to the model explicitly.
inputs = tokenizer('''Imagine a world where two distinct genres, such as cyberpunk and renaissance romance, have been seamlessly blended. Describe a day in the life of a character living in this unique world.''', return_tensors="pt", return_attention_mask=False).to(model.device)

# Time the generate() call with CUDA events recorded around it.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
outputs = model.generate(**inputs, max_length=500)
end_event.record()
# Wait for the GPU so both events have completed before reading the elapsed time.
torch.cuda.synchronize()
elapsed_time = start_event.elapsed_time(end_event)  # milliseconds

# New tokens = full output length minus prompt length.
total_generated_tokens = len(outputs[0]) - len(inputs['input_ids'][0])
tokens_per_second = total_generated_tokens / elapsed_time * 1000
# Fixed-point formatting: the previous '.2g' spec rounded to two significant digits
# and switched to scientific notation (e.g. 1.3e+02) for speeds of 100 tps or more.
print(f'Speed: {tokens_per_second:.1f} tps')



Speed:  3.3 tps


In [5]:
# Convert the generated token ids back to strings and show the first sequence.
decoded = tokenizer.batch_decode(outputs)
text = decoded[0]
print(text)

<s> Imagine a world where two distinct genres, such as cyberpunk and renaissance romance, have been seamlessly blended. Describe a day in the life of a character living in this unique world.

In the city of Cygnus, a sprawling metropolis nestled between towering mountains and a glimmering lake, the sun rises over the skyline, casting a warm orange glow over the bustling streets. The air is thick with the hum of holographic advertisements and the distant thrum of hovercraft engines.

Our protagonist, a young woman named Aria, stirs in her cozy apartment, nestled in the heart of the city's lower levels. She rubs the sleep from her eyes and reaches for her cybernetic implant, a sleek, silver device that allows her to interface with the city's vast network of computers and databases.

Aria is a data analyst, specializing in the retrieval and interpretation of historical records from the Renaissance era. Her work is in high demand, as the city's wealthy elite are eager to learn more about t

In [None]:

# Load LoRA configuration (values come from the config cell)
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters.
# per_device_eval_batch_size and gradient_checkpointing are declared in the config
# cell; they are forwarded here so those settings are no longer silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters; the dataset's "text" column is the training text
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model under the name configured as new_model
trainer.model.save_pretrained(new_model)

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir results/runs

In [6]:
# Optionally silence library warnings
# logging.set_verbosity(logging.CRITICAL)

# Run a text-generation pipeline on whatever `model` / `tokenizer` are currently
# bound in the kernel, wrapping the question in [INST] ... [/INST] markers
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
first_completion = result[0]
print(first_completion['generated_text'])

<s>[INST] What is a large language model? [/INST]  A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody. These models are typically trained on vast amounts of text data, such as books, articles, and websites, and are designed to learn the patterns and structures of language.

Large language models are often used in natural language processing (NLP) tasks such as language translation, text summarization, and language generation. They are also used in chatbots, virtual assistants, and other applications where language understanding and generation is required.

Some of the key features of large language models include:

1. Deep learning architecture: Large language models are typically built using deep learning architectures such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks,


In [None]:
# Load the base checkpoint again in float16, attach the adapter saved under
# `new_model`, and fold the adapter weights into the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer with the same padding setup as used for training
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-generation pipeline backed by the Mistral-7B base checkpoint
pipe = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-v0.1",
)

2023-10-06 15:20:15.160929: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Prompt: variable list, causal edges with one-line explanations, then partially
# filled-in probability ranges that the model is asked to continue.
# NOTE(review): no generation-length argument is passed to the pipeline, so the
# default length limit applies - confirm it leaves room for new tokens after this long prompt.
causal_prompt = '''Space Debris Increase (SDI)
Number of Satellites Launched (NSL)
Satellite Size (SS)
International Space Regulations (ISR)
Technological Advancements (TA)
Orbital Path Density (OPD)
Satellite Functionality (SF)

ISR -> NSL: Observable
International space regulations influence the number of satellites that can be launched.
TA -> NSL: Observable
Technological advancements influence the number of satellites that can be launched.
NSL -> OPD: Observable
The number of satellites launched affects the density of satellites in orbital paths.
ISR -> SS: Observable
International space regulations influence the size of satellites that can be launched.
TA -> SS: Observable
Technological advancements influence the size of satellites being made and launched.
SS -> SF: Observable
The size of satellites influences their functionality.
OPD -> SDI: Observable
The density of satellites in orbital paths influences the increase in space debris.
SF -> SDI: Observable
The functionality of satellites influences the increase in space debris.

Fill in the following probabilities. For each one make sure to provide a range of reasonable values. For example `0.6-0.8` for an event that we can expect to occur 60%-80% of the time.


marginal probability of TA=1: [0.3, 0.5]
marginal probability of SS=1: [0.1, 0.12]
marginal probability of NSL=1: [0.7, 0.8]
marginal probability of ISR=1: [0.'''

# The pipeline returns a list of results; print the text of the first one.
out = pipe(causal_prompt)
completion = out[0]['generated_text']
print(completion)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Space Debris Increase (SDI)
Number of Satellites Launched (NSL)
Satellite Size (SS)
International Space Regulations (ISR)
Technological Advancements (TA)
Orbital Path Density (OPD)
Satellite Functionality (SF)

ISR -> NSL: Observable
International space regulations influence the number of satellites that can be launched.
TA -> NSL: Observable
Technological advancements influence the number of satellites that can be launched.
NSL -> OPD: Observable
The number of satellites launched affects the density of satellites in orbital paths.
ISR -> SS: Observable
International space regulations influence the size of satellites that can be launched.
TA -> SS: Observable
Technological advancements influence the size of satellites being made and launched.
SS -> SF: Observable
The size of satellites influences their functionality.
OPD -> SDI: Observable
The density of satellites in orbital paths influences the increase in space debris.
SF -> SDI: Observable
The functionality of satellites influences