In [1]:
import os

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.trainer import TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM, get_peft_model, PeftModel
from trl import SFTTrainer, DPOTrainer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = load_dataset(
    path="lvwerra/stack-exchange-paired",
    split = "train"
)

dataset = dataset.shuffle(seed=42).select(range(1000))

Resolving data files: 100%|██████████| 72/72 [00:00<00:00, 520.76it/s]


In [None]:
dataset[0]['question']

In [None]:
with open('hf_token.key', 'r') as f:
    hf_token = f.read()

base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model = "llama-3-8b-stack-exchange"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
# Adding a special token for pad token so that eos token can be recognized 
# (https://github.com/unslothai/unsloth/issues/416)
# https://github.com/huggingface/transformers/issues/22794
# https://github.com/huggingface/transformers/issues/23230
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
tokenizer.padding_side = "right"
tokenizer.model_max_length = 512

In [None]:
def format_chat_template(row):
    row_json = [
        {"role" : "user", "content": row['question']},
        {"role" : "assistant", "content": row['response_j']}
    ]

    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

In [None]:
dataset = dataset.map(
            format_chat_template,
            num_proc=8
        )

dataset = dataset.train_test_split(test_size=0.1)

In [None]:
# QLoRA Config for 4-bit quntization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# # For 8 bit quantization
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)

In [None]:
# Load Model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

model.config.use_cache=False
model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

In [None]:
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type = "CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

model = get_peft_model(model, peft_config)

In [None]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    bf16=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=0.1,
    learning_rate=2e-4,
    logging_steps=5,
    logging_strategy="steps",
    log_level="info",
    save_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    group_by_length=True,
    report_to="none",
    seed=42
)

In [None]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
    packing= False
)

In [None]:
trainer.train()

### Saving the pre-trained model

In [None]:
trainer.model.save_pretrained(new_model)

### Merging the base model with the adapter to get full model

In [None]:
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model = "llama-3-8b-stack-exchange"

In [None]:
# Load Model
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,    
    trust_remote_code=True,
    torch_dtype = torch.bfloat16,
    device_map={"":torch.cuda.current_device()}
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model, padding='max_length', truncation=True, token = hf_token)
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
tokenizer.padding_side = "right"
tokenizer.model_max_length = 512

Merge adapter with the base model

In [None]:
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [None]:
model = model.merge_and_unload()
model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

In [None]:
model.save_pretrained("llama-3-8b-stack-exchange-sft")
tokenizer.save_pretrained("llama-3-8b-stack-exchange-sft")

### Load merged Model and Tokenizer for Inference

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-8b-stack-exchange-sft",
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("llama-3-8b-stack-exchange-sft")

In [None]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [None]:
model.config.use_cache = True

messages = [
    {
        "role": "user",
        "content": "Background: \n\nMy DB need to store YYYY-MM-DD and HH:MM:SS data generated from a Machine. Data will be inserted every few minutes, **every day** will have **Thousands of** records\n\nQuestions: \n\nShould I separate Date column into another table with DateID and DateName?\n\nWhat about time? HH:MM:SS, should it be another table or just a column?\n\nHow about query performance?? Should I index Date and Time Column with FK?\n\nWhat's the best practices for Date and Time stamp?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.repetition_penalty = 1.5

outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    num_return_sequences=1,
    generation_config=generation_config
)

text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print(text.split("assistant")[1])

### Direct Preference Optimization

In [None]:
# Tokenizer from pre-trained model
tokenizer = AutoTokenizer.from_pretrained("llama-3-8b-stack-exchange-sft")

In [None]:
# QLoRA Config for 4-bit quntization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# # For 8 bit quantization
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)

In [None]:
# Load Model
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-8b-stack-exchange-sft",
    low_cpu_mem_usage=True,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

model.config.use_cache=False
# model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

In [None]:
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type = "CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

In [None]:
def format_chat_template_dpo(row):
    row_json = [
        {"role" : "user", "content": row['question']}
    ]

    prompt = tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=True)
    chosen = row['response_j'] + tokenizer.eos_token
    rejected = row['response_k'] + tokenizer.eos_token

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected
    }

In [None]:
def get_stack_exchange_paired(sanity_check=False, cache_dir=None, num_proc=24):
    
    dataset = load_dataset(
        path="lvwerra/stack-exchange-paired",
        split = "train"
    )

    dataset = dataset.shuffle(seed=42).select(range(1000))

    original_columns=dataset.column_names
    
    dataset = dataset.map(
                    format_chat_template_dpo,
                    num_proc=24,
                    remove_columns=original_columns
                )

    dataset = dataset.train_test_split(test_size=0.1)
    
    return dataset

In [None]:
dataset = get_stack_exchange_paired()

In [None]:
training_arguments = TrainingArguments(
    output_dir="llama-3-8b-stack-exchange-dpo",
    overwrite_output_dir=True,
    bf16=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=0.1,
    learning_rate=2e-4,
    logging_steps=5,
    logging_strategy="steps",
    log_level="info",
    save_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    remove_unused_columns=False,
    warmup_steps=10,
    report_to="none",
    seed=42
)

In [None]:
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_prompt_length=tokenizer.model_max_length,
    max_length=tokenizer.model_max_length
)

In [None]:
trainer.train()

In [None]:
trainer.model.save_pretrained("llama-3-8b-stack-exchange-dpo")

### Merging the base model with the adapter to get full model

In [None]:
base_model = "llama-3-8b-stack-exchange-sft"
new_model = "llama-3-8b-stack-exchange-dpo"

In [None]:
# Load Model
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,    
    trust_remote_code=True,
    torch_dtype = torch.bfloat16,
    device_map={"":torch.cuda.current_device()}
)

In [None]:
# Tokenizer from pre-trained model
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [None]:
model = model.merge_and_unload()

In [None]:
model.save_pretrained("llama-3-8b-stack-exchange-dpo-merged")
tokenizer.save_pretrained("llama-3-8b-stack-exchange-dpo-merged")

### Loading the merged model for inference

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    "llama-3-8b-stack-exchange-dpo-merged",
    torch_dtype = torch.bfloat16,
    device_map={'':torch.cuda.current_device()}
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.81s/it]


In [5]:
tokenizer = AutoTokenizer.from_pretrained("llama-3-8b-stack-exchange-dpo-merged")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

In [13]:
model.config.use_cache = True

messages = [
    {
        "role": "user",
        "content": "How can I write a Select query for multiple columns in SQL?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

generation_config = model.generation_config
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.repetition_penalty = 1.5

outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    num_return_sequences=1,
    generation_config=generation_config
)

text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print(text.split("assistant")[1])

<|end_header_id|>

You use the `SELECT` keyword and separate each column with commas:

```
  SELECT col1, 
         col2,
         etc
    FROM table_name;

```  

If you want to select all of them (as opposed to selecting some), just type out an asterisk: 

```
   SELECT *  
     from mytable;   

```

Edit:
---

The OP asked about [the syntax](http://msdn.microsoft.com/en-us/library/ms189499%28v=sql.105%29.aspx) which is used when there are more than one value specified.

> The ALL or ANY operator must be preceded by =, < >, <= >=!= <> but not IN. You cannot combine these operators as follows : any!<>

So if we have this data set :

| id | val |
---------+-----
      0       A   
        -3 B    
          +4 C     
           NULL D      
             E      

Then using "IN" will work fine ([SQL Fiddle Demo here ](https://www.sqlfiddle.com/#!6/d41d8ced59daa)

[![enter image description here][10]][5]

But trying it like so won't give us what most people expect... **Any** means that 

In [14]:
print(text)

<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

How can I write a Select query for multiple columns in SQL?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

You use the `SELECT` keyword and separate each column with commas:

```
  SELECT col1, 
         col2,
         etc
    FROM table_name;

```  

If you want to select all of them (as opposed to selecting some), just type out an asterisk: 

```
   SELECT *  
     from mytable;   

```

Edit:
---

The OP asked about [the syntax](http://msdn.microsoft.com/en-us/library/ms189499%28v=sql.105%29.aspx) which is used when there are more than one value specified.

> The ALL or ANY operator must be preceded by =, < >, <= >=!= <> but not IN. You cannot combine these operators as follows : any!<>

So if we have this data set :

| id | val |
---------+-----
      0       A   
        -3 B    
          +4 C     
           NULL D      
             E      

Then using "IN" will work fine ([SQL Fiddle Demo here