<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/deepseek_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Checkpoint used for the plain (non-fine-tuned) completion demo.
model_name = "deepseek-ai/deepseek-llm-7b-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Start from the checkpoint's own generation settings, then reuse EOS as the pad id.
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Prompt to be continued by the model.
text = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"

# Tokenize to PyTorch tensors and place them on the model's device before generating.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(max_new_tokens=100, **inputs)

# Decode the first returned sequence, dropping special tokens, and show it.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


In [3]:
# Same completion prompt as the demo above; relies on `tokenizer` and `model`
# already being defined in the kernel.
text = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"

# Tokenize to PyTorch tensors, move them to the model's device, and generate
# at most 100 new tokens.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(max_new_tokens=100, **inputs)

# Decode the first returned sequence without special tokens and print it.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is a vector of real numbers, and the attention function is a function that maps the query and key-value pairs to the output.

The attention function is a key component of the attention mechanism, which is a type of neural network architecture that allows the network to focus on specific parts of the input data. The attention mechanism is used in various applications, such as machine translation, image captioning, and question answering.

The attention function is typically implemented using a neural network architecture, where the


In [1]:
!pip install datasets -q
!pip install bitsandbytes -q
!pip install -U bitsandbytes -q

In [None]:
# Install the colab-env helper (quiet mode) so it can be imported below.
!pip install colab-env --quiet

import warnings
# Suppress only the warning whose message matches the "pipelines sequentially on GPU" text.
warnings.filterwarnings("ignore", message="You seem to be using the pipelines sequentially on GPU")

# NOTE(review): colab_env is never referenced by name in this cell, so the import is
# presumably needed for its side effects (likely making stored variables visible to
# os.getenv) — confirm. Keep it ahead of the os.getenv call below.
import colab_env
import os

# Hub token read from the environment; os.getenv yields None when the variable is unset.
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

# Log in to the Hugging Face Hub with that token, passing add_to_git_credential=True.
login(
  token=access_token_write,
  add_to_git_credential=True
)

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from datasets import load_dataset
import bitsandbytes
from peft import LoraConfig, PeftModel  # Import LoraConfig from peft

# Load the tokenizer and the base model with 4-bit (NF4) quantization
model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The preprocessing step pads to a fixed length, which requires a pad token.
# Fall back to EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Generation defaults (not training settings): take the checkpoint's generation
# config and reuse EOS as the pad id. This is set on the base model *before* it is
# wrapped by PEFT so the attribute lives on the model that actually generates.
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Imported here (script-style cell) because the top import block does not provide them.
from peft import get_peft_model, prepare_model_for_kbit_training

# Standard PEFT preparation step for training on top of a k-bit quantized model.
# NOTE(review): with its defaults this also turns on gradient checkpointing — pass
# use_gradient_checkpointing=False if the extra recompute time is not wanted.
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA update matrices
    lora_alpha=32,  # Scaling factor for the LoRA updates
    target_modules=["q_proj", "k_proj", "v_proj"],  # Attention projections that receive adapters
    lora_dropout=0.05,  # Dropout probability for the LoRA layers
    bias="none",  # Do not train bias terms
    task_type="CAUSAL_LM",  # Wrap as a causal-LM PEFT model rather than a generic one
)

# Apply LoRA: the quantized base weights stay frozen and only the adapters are trained.
model = get_peft_model(model, lora_config)

# Load the Spider dataset
spider_dataset = load_dataset("spider")

# Preprocessing function
def preprocess_function(examples, max_length=512, tok=None):
    """Build causal-LM training rows from a batch of Spider examples.

    Each row is the prompt ``"Question: <question> SQL: "`` immediately followed
    by the target SQL (plus EOS when the tokenizer defines one), truncated and
    right-padded to ``max_length``. ``labels`` mirrors ``input_ids`` but holds
    -100 on prompt and padding positions, so the loss is computed only on the
    SQL tokens. (Tokenizing prompt and target as two separately padded
    sequences would leave the labels misaligned with the inputs.)

    Args:
        examples: Batch mapping with parallel ``"question"`` and ``"query"`` lists.
        max_length: Fixed length of every returned sequence.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; each is a
        list of ``max_length``-long integer lists, one per example.
    """
    tok = tokenizer if tok is None else tok

    # Pad with the pad token when available, otherwise with EOS.
    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = tok.eos_token_id
    eos_ids = [] if tok.eos_token_id is None else [tok.eos_token_id]

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    prompts = [f"Question: {q} SQL: " for q in examples["question"]]
    if not prompts:
        return batch

    # Prompts keep the tokenizer's default special tokens; targets are tokenized
    # without them so the SQL continues the prompt directly.
    prompt_rows = tok(prompts)["input_ids"]
    target_rows = tok(list(examples["query"]), add_special_tokens=False)["input_ids"]

    for prompt_ids, target_ids in zip(prompt_rows, target_rows):
        prompt_ids = list(prompt_ids)
        target_ids = list(target_ids) + eos_ids

        ids = (prompt_ids + target_ids)[:max_length]
        # -100 is the index ignored by the loss: no gradient from prompt tokens.
        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        n_pad = max_length - len(ids)
        batch["input_ids"].append(ids + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * len(ids) + [0] * n_pad)
        batch["labels"].append(labels + [-100] * n_pad)

    return batch

# Apply preprocessing to every split of the dataset, one batch at a time
tokenized_spider = spider_dataset.map(preprocess_function, batched=True)

# Pick the mixed-precision mode to match the bfloat16 compute dtype used when the
# model was quantized; fall back to fp16 only where bf16 is unavailable.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # 4 accumulated steps of batch size 1 per optimizer update
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="epoch",
    push_to_hub=True,
    hub_model_id="frankmorales2020/deepseek-llm-7b-base-spider",
    report_to="none"
)

# Define the Trainer: train on the "train" split, evaluate on "validation"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_spider["train"],
    eval_dataset=tokenized_spider["validation"],
)

# Fine-tune the model
trainer.train()

# Save and push the fine-tuned model to Hugging Face Hub
trainer.save_model()
trainer.push_to_hub()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Epoch,Training Loss,Validation Loss
