In [None]:
!pip install torch==2.0.1
!pip install transformers==4.32.1
!pip install datasets==2.14.4
!pip install peft==0.5.0
!pip install bitsandbytes==0.41.1
!pip install trl==0.7.1

Collecting transformers==4.32.1
  Using cached transformers-4.32.1-py3-none-any.whl (7.5 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.1)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.2
    Uninstalling transformers-4.40.2:
      Successfully uninstalled transformers-4.40.2
Successfully installed tokenizers-0.13.3 transformers-4.32.1
Collecting datasets==2.14.4
  Using cached datasets-2.14.4-py3-none-any.whl (519 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.4)
  Using cached dill-0.3.7-py3-none-any.whl (115 kB)
Collecting xxhash (from datasets==2.14.4)
  Using cached xxhash-3.4.1-cp310

In [None]:
import json
import re
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Directory handed to the trainer as `output_dir`; the inference section reloads from it.
OUTPUT_DIR = 'experiments'
# First CUDA device when torch can see a GPU, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub identifier of the base checkpoint that is loaded and fine-tuned.
MODEL_NAME = "NousResearch/Llama-2-7b-hf"

### Data Loading & Preprocessing

In [None]:
DEFAULT_SYSTEM_PROMPT = "Below is an instruction that describes a task. Write a response that appropriately completes the request"

def format_prompt(query, response, system_prompt = DEFAULT_SYSTEM_PROMPT):
  """Build one training example in the notebook's Alpaca-like template.

  Args:
    query: Text placed under the ``### Input:`` header.
    response: Text placed under the ``### Response:`` header.
    system_prompt: Text placed under the ``###Instruction:`` header.

  Returns:
    The three sections joined by single newlines, followed by the literal
    suffix ``" </s>"``.
  """
  # f-string interpolation keeps the formatting of non-string values intact.
  template_lines = (
      "###Instruction:",
      f"{system_prompt}",
      "### Input:",
      f"{query}",
      "### Response:",
      f"{response}",
  )
  return "\n".join(template_lines) + " </s>"

def generate_alpaca_prompt(example):
    """Map one dataset row to its formatted training text.

    Args:
        example: Row mapping that provides the 'instruction' and 'output' keys.

    Returns:
        A dict with a single 'text' key holding the formatted prompt.
    """
    prompt_text = format_prompt(example['instruction'], example['output'])
    return {'text': prompt_text}

# Read the CSV, keep only the instruction/output columns and wrap them in a Huggingface Dataset
dataset = Dataset.from_pandas(
    pd.read_csv('CS412_HW4_Step1_Data.csv')[['instruction', 'output']]
)

# Add the Alpaca-formatted `text` column, then expose `output` under the name `response`
dataset = dataset.map(generate_alpaca_prompt)
dataset = dataset.rename_column('output', 'response')
dataset

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'response', 'text'],
    num_rows: 200
})

### Loading Model & Tokenizer (4 Bit Quantized)

In [None]:
def get_quantized_model_and_tokenizer(model_id):
    """Load a causal LM in 4-bit NF4 quantization together with its tokenizer.

    Args:
        model_id: Checkpoint identifier passed to both `from_pretrained` calls.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer pads on the right and uses
        a padding token that is distinct from EOS whenever one is available.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_safetensors=True,
        quantization_config=bnb_config,
        # NOTE(review): trust_remote_code allows checkpoint-supplied Python to run;
        # confirm it is actually required for the checkpoints used here.
        trust_remote_code=True,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Do not reuse EOS as the padding token: the language-modeling collator masks
    # every label equal to pad_token_id, so with pad == eos the "</s>" appended to
    # each training example would never contribute to the loss and the model
    # would not learn to stop. Prefer the unknown token; fall back to EOS only
    # when the tokenizer defines no unknown token.
    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
        if tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
        else:
            tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer

# Load the 4-bit quantized base model and its tokenizer for MODEL_NAME.
model, tokenizer = get_quantized_model_and_tokenizer(MODEL_NAME)

# Disable the generation KV cache while training
# (presumably because it conflicts with gradient checkpointing — TODO confirm).
model.config.use_cache = False
model.gradient_checkpointing_enable()

# peft helper applied to the quantized model before the trainer attaches the LoRA adapters.
model = prepare_model_for_kbit_training(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

## Training

In [None]:
### LoRA Configuration
# Adapter rank, scaling factor and dropout, followed by the projection layers that receive adapters.
lora_r, lora_alpha, lora_dropout = 16, 32, 0.05
lora_target_modules = ["q_proj", "v_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

### Training Arguments
# OUTPUT_DIR comes from the configuration cell at the top of the notebook;
# it is intentionally not redefined here so there is a single source of truth.

training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    weight_decay=0.001,
    num_train_epochs=2,
    warmup_ratio=0.03,
    # No intermediate checkpoints; the model is saved explicitly after training.
    save_strategy="no",
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    logging_strategy='steps',
    # A float below 1 is read as a fraction of the total number of training steps.
    logging_steps=0.1,
    save_safetensors=True,
    # The plain "constant" schedule ignores warmup_ratio entirely;
    # "constant_with_warmup" makes the configured warmup take effect.
    lr_scheduler_type="constant_with_warmup",
    seed=0,
)

### Trainer Initialization
# Supervised fine-tuning trainer: LoRA adapters described by `peft_config` are
# trained on the `text` column of `dataset`.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
)



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [None]:
# Run the fine-tuning loop on Llama 2; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.6667
20,1.1858
30,1.0337
40,1.0835
50,0.9837
60,1.1412
70,0.8993
80,0.8477
90,0.9312
100,0.9367


TrainOutput(global_step=100, training_loss=1.070954637527466, metrics={'train_runtime': 331.8128, 'train_samples_per_second': 1.205, 'train_steps_per_second': 0.301, 'total_flos': 1400974124187648.0, 'train_loss': 1.070954637527466, 'epoch': 2.0})

In [None]:
# Persist the fine-tuned model through the trainer. No path is given, so the trainer
# picks the destination (presumably the `output_dir` of the training arguments —
# the inference section reloads the adapter and tokenizer from OUTPUT_DIR).
trainer.save_model()

### Inference (Restart the Notebook before running this part)

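* **Runtime -> Restart Session**, then continue from the import/configuration cell below (it must be re-run because the restart clears all variables).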

In [None]:
import json
import re
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Directory from which the fine-tuned adapter and tokenizer are loaded below.
OUTPUT_DIR = 'experiments'
# First CUDA device when torch can see a GPU, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub identifier of the base checkpoint the adapter is applied to.
MODEL_NAME = "NousResearch/Llama-2-7b-hf"

#### Loading Finetuned Model & Merging the Adapters

In [None]:
# Load the base model in fp16 (not quantized), attach the adapter stored in
# OUTPUT_DIR, and merge the adapter into the base model in a single expression.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        load_in_4bit=False,
        use_safetensors=True,
        trust_remote_code=True,
    ),
    OUTPUT_DIR,
    device_map='auto',
).merge_and_unload()

# The tokenizer is read from the same directory as the adapter.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
from transformers import GenerationConfig

DEFAULT_SYSTEM_PROMPT = "Below is an instruction that describes a task. Write a response that appropriately completes the request"

def format_prompt_inference(query, system_prompt = DEFAULT_SYSTEM_PROMPT):
  """Build the generation prompt: the training template up to the response slot.

  The layout mirrors the template used for the fine-tuning examples in this
  notebook (sections separated by a single newline, and a newline after
  ``### Response:``), so the model is prompted with exactly the prefix it saw
  during training. The "###Instruction:" header deliberately keeps the same
  spelling as the training template.

  Args:
    query: Text placed under the ``### Input:`` header.
    system_prompt: Text placed under the ``###Instruction:`` header.

  Returns:
    The prompt string, ending right where the response is expected to begin.
  """
  return f"###Instruction:\n{system_prompt}\n### Input:\n{query}\n### Response:\n"


def llama_inference(model, tokenizer, text):
  """Generate the model's response to a single instruction.

  Decoding is plain greedy search with a repetition penalty. The previous
  configuration mixed `penalty_alpha` (a contrastive-search setting that is
  ignored once `do_sample=True`) with sampling at a near-zero temperature,
  which only approximated greedy decoding while staying non-deterministic.

  Args:
    model: Causal LM exposing `generate` and `device`.
    tokenizer: Tokenizer matching `model`.
    text: Instruction inserted into the inference prompt template.

  Returns:
    The decoded continuation only (prompt tokens and special tokens removed).
  """
  prompt = format_prompt_inference(text)
  # Put the inputs on the model's own device instead of a hard-coded device string.
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  prompt_length = inputs["input_ids"].shape[1]
  generation_config = GenerationConfig(
      do_sample=False,
      repetition_penalty=1.2,
      max_new_tokens=256,
      pad_token_id=tokenizer.eos_token_id,
  )
  with torch.inference_mode():
      outputs = model.generate(**inputs, generation_config=generation_config)

  # Drop the echoed prompt tokens so that only newly generated text is decoded.
  return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


def save_predictions_to_csv(model, tokenizer, dataset, output_file):
    """Run inference on every row of `dataset` and write the results to a CSV file.

    Args:
        model: Model forwarded to `llama_inference`.
        tokenizer: Tokenizer forwarded to `llama_inference`.
        dataset: Iterable of rows that provide an 'instruction' key.
        output_file: Destination path of the CSV (columns: Instruction, Prediction).
    """
    records = [
        {
            'Instruction': example['instruction'],
            'Prediction': llama_inference(model, tokenizer, example['instruction']),
        }
        for example in tqdm(dataset)
    ]

    pd.DataFrame(records).to_csv(output_file, index=False)
    print('Predictions saved as csv')

In [None]:
# Read the test CSV, keep only the instruction column and wrap it in a Huggingface Dataset
dataset = Dataset.from_pandas(
    pd.read_csv('CS412_HW4_Test_Data.csv')[['instruction']]
)

# Generate a prediction for every test instruction and store the results
save_predictions_to_csv(
    model,
    tokenizer,
    dataset,
    output_file='CS412_HW4_Step2_Predictions.csv',
)

100%|██████████| 29/29 [06:38<00:00, 13.74s/it]

Predictions saved as csv



