In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
!pip install -U transformers[torch] datasets
!pip install -q bitsandbytes trl peft accelerate
!pip install flash-attn --no-build-isolation





In [None]:
from transformers import BitsAndBytesConfig

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Meta-Llama-3-8B"

# The instruction dataset to use (a CSV path; it is read with pandas in a later cell).
# NOTE(review): the path is relative and presumably relies on Google Drive being
# mounted under the working directory; no mount step is shown in this notebook.
dataset_name = "drive/MyDrive/project_cs685/dataset/training_dataset.csv"

# Fine-tuned model name: the directory later passed to save_pretrained().
# A later cell rebuilds this same path from `trained_model_id`.
new_model = "drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/causal-reasoning-finetuned"

# Device placement map.
# NOTE(review): the from_pretrained call in this notebook passes device_map="auto"
# rather than this variable — confirm which placement is intended.
device_map = {"": 0}


In [None]:


################################################################################
# bitsandbytes parameters
################################################################################
# Every setting below is passed to BitsAndBytesConfig, so editing a value here
# changes how the base model is quantized when it is loaded.

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models, given as the name of a torch dtype.
# This is "bfloat16" to match the dtype the quantization config has always been
# built with (and the bf16 training flag set further down).
bnb_4bit_compute_dtype = "bfloat16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

# Resolve the dtype name above to the actual torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings applied while the checkpoint is loaded
quantization_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with the quantization settings above.
# device_map="auto" is kept as-is; the `device_map` variable defined in the
# configuration cell is not used by this call.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
import pandas as pd

In [None]:
# Read the training CSV from the path configured in `dataset_name`.
training_dataset_df = pd.read_csv(dataset_name)

In [None]:
# Preview the first rows of the loaded frame.
training_dataset_df.head()

Unnamed: 0,question_id,desc_id,given_info,question,answer,meta,reasoning,background
0,5232,nonsense14-arrowhead-nde-modelNone-spec6-q1,For those who are not plizz and are not flurnt...,If we disregard the mediation effect through f...,yes,"{'story_id': 'nonsense14', 'graph_id': 'arrowh...",{'step0': 'Let V2 = snerp; X = plizz; V3 = flu...,"Imagine a self-contained, hypothetical world w..."
1,1816,nonsense11-arrowhead-nde-modelNone-spec2-q1,"For those who are not phuffy and are not jida,...",If we disregard the mediation effect through j...,yes,"{'story_id': 'nonsense11', 'graph_id': 'arrowh...",{'step0': 'Let V2 = choty; X = phuffy; V3 = ji...,"Imagine a self-contained, hypothetical world w..."
2,11496,nonsense20-confounding-marginal-modelNone-spec...,The overall probability of zowpxi is 55%. For ...,Is fluxy less likely than not fluxy overall?,no,"{'story_id': 'nonsense20', 'graph_id': 'confou...",{'step0': 'Let V1 = brippo; X = zowpxi; Y = fl...,"Imagine a self-contained, hypothetical world w..."
3,4238,nonsense13-diamond-nie-modelNone-spec6-q1,"For those who are not fritz, the probability o...",Does fritz negatively affect zibbo through flu...,yes,"{'story_id': 'nonsense13', 'graph_id': 'diamon...",{'step0': 'Let X = fritz; V3 = glopp; V2 = flu...,"Imagine a self-contained, hypothetical world w..."
4,5005,nonsense14-frontdoor-nie-modelNone-spec5-q0,"For those who are not plizz, the probability o...",Does plizz positively affect brifft through fl...,no,"{'story_id': 'nonsense14', 'graph_id': 'frontd...",{'step0': 'Let V1 = snerp; X = plizz; V3 = flu...,"Imagine a self-contained, hypothetical world w..."


In [None]:
# Task instruction placed under the "### Instruction:" header of every prompt.
instruction_text = "Refer to the given context to respond the question with either 'yes' or 'no'."

# Earlier instruction wordings, kept for reference:
#"If you're unsure of the answer, simply state 'na' without guessing."
#"Use the following context to answer the question. The answer will be 'yes' or 'no'. If you don't know the answer, just say 'na', don't try to make up an answer."


def create_prompt_formats(sample):
    """
    Build the training prompt for one dataset row and store it under "text".

    The prompt consists of five sections separated by blank lines: a fixed
    introduction, the instruction (module-level ``instruction_text``), the
    input context (background, given info and question joined by single
    spaces), the response (the row's answer) and a closing end marker.

    :param sample: Row from the instruction dataset; its "background",
        "given_info", "question" and "answer" entries are read
    :return: The same ``sample`` object, with the prompt added under "text"
    """
    # Context the model must reason over, in the order it appears in the prompt
    context = "{} {} {}".format(
        sample["background"], sample["given_info"], sample["question"]
    )

    sections = (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        "### Instruction:\n{}".format(instruction_text),
        "Input:\n{}".format(context),
        "### Response:\n{}".format(sample["answer"]),
        "### End",
    )

    # Skip empty sections and separate the remaining ones with a blank line
    sample["text"] = "\n\n".join(filter(None, sections))
    return sample


In [None]:
from datasets import load_dataset, Dataset
from datasets import Features, Value

In [None]:
# Wrap the pandas frame as a Hugging Face Dataset (tagged as the "train" split)
# and run create_prompt_formats over every row to add the "text" prompt column.
dataset = (
    Dataset.from_pandas(training_dataset_df, split="train")
    .map(create_prompt_formats)
)

Map:   0%|          | 0/9856 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

In [None]:
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Base checkpoint on the Hugging Face hub
model_id = "NousResearch/Meta-Llama-3-8B"

# Folder name for the fine-tuned weights
trained_model_id = "causal-reasoning-finetuned"

# Full save location: project model directory plus the folder name above
new_model = f"drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/{trained_model_id}"


In [None]:
# Load the tokenizer for the checkpoint named by `model_id`.
# NOTE(review): `tokenizer` is assigned again in a later cell (loaded via
# `model_name`, with padding configured), so this instance is replaced before training.
tokenizer = AutoTokenizer.from_pretrained(model_id)



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Directory handed to TrainingArguments as output_dir.
# The TrainingArguments parameter cell assigns this same value again.
output_dir = "drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/results"

In [None]:
# Show the dataset's column schema; "text" is the column added by create_prompt_formats.
dataset.features

{'question_id': Value(dtype='int64', id=None),
 'desc_id': Value(dtype='string', id=None),
 'given_info': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'meta': Value(dtype='string', id=None),
 'reasoning': Value(dtype='string', id=None),
 'background': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
# Print the first three formatted prompts, each followed by a dashed rule.
for row in dataset.select(range(3)):
    print(row["text"], "------------------", sep="\n")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Refer to the given context to respond the question with either 'yes' or 'no'.

Input:
Imagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Plizz has a direct effect on flurnt and brifft. Snerp has a direct effect on flurnt and brifft. Flurnt has a direct effect on brifft. Snerp is unobserved. For those who are not plizz and are not flurnt, the probability of brifft is 47%. For those who are not plizz and are flurnt, the probability of brifft is 56%. For those who are plizz and are not flurnt, the probability of brifft is 62%. For those who are plizz and are flurnt, the probability of brifft is 7%. For those who are not plizz and are not snerp, the probability of flurnt is 75%. For those who are not plizz and are snerp, the probability of flurnt is 63%. For those who are plizz 

In [None]:
# Load LLaMA tokenizer.
# trust_remote_code is deliberately left off: enabling it permits code shipped
# in the model repository to run locally, and nothing in this notebook relies on
# a custom tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Append padding after the sequence rather than before it
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
################################################################################
# TrainingArguments parameters
################################################################################
# Plain values only; they are handed to TrainingArguments in the next cell.

# Output directory where the model predictions and checkpoints will be stored
output_dir = "drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/results"

# Number of training epochs
num_train_epochs = 1

# Batch size per GPU for training
per_device_train_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Optimizer to use
optim = "paged_adamw_32bit"

# Save checkpoint every X update steps
save_steps = 50

# Log every X update steps
logging_steps = 50

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Learning rate schedule (constant a bit better than cosine)
# NOTE(review): in Transformers the plain "constant" schedule appears to take no
# warmup steps, so warmup_ratio above would have no effect with it;
# "constant_with_warmup" is the variant that applies warmup — confirm which is intended.
lr_scheduler_type = "constant"

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3


In [None]:

# Bundle the hyperparameters from the parameter cell into a TrainingArguments object.
training_args = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_steps=save_steps,
    logging_steps=logging_steps,
    report_to="tensorboard",
    # --- run length and batching ---
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,  # originally set to 8
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # --- optimizer and learning-rate schedule ---
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    # --- numeric precision ---
    fp16=fp16,
    bf16=bf16,
)


################################################################################
# QLoRA parameters
################################################################################

# Rank of the LoRA update matrices (LoRA attention dimension)
lora_r = 64

# Scaling factor (alpha) for the LoRA update
lora_alpha = 16

# Dropout probability applied in the LoRA layers
lora_dropout = 0.1
################################################################################


# Adapter configuration for a causal language model. No target_modules are
# given, so the choice of adapted layers is left to the library.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)



In [None]:


################################################################################
# SFT parameters
################################################################################

# Maximum sequence length; None hands the decision to SFTTrainer.
# NOTE(review): check the library's default limit against the prompt lengths,
# since the answer sits at the end of each prompt.
max_seq_length = None

# Whether to pack several short examples into one input sequence
packing = False

# Device placement map (single GPU 0). It is not passed to SFTTrainer below.
device_map = {"": 0}

# Supervised fine-tuning on the "text" column of the prepared dataset, using the
# already-loaded quantized model and the LoRA adapter configuration.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

# Run training and keep the object returned by train()
train_result = trainer.train()



Map:   0%|          | 0/9856 [00:00<?, ? examples/s]

Step,Training Loss
50,0.9642
100,0.3153
150,0.2427
200,0.1707
250,0.1479
300,0.1433
350,0.1359
400,0.1325
450,0.1293
500,0.1317




In [None]:
# To clear out cache for unsuccessful run
#torch.cuda.empty_cache()

In [None]:
# Display the path the trained model will be saved to.
new_model

'drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/causal-reasoning-finetuned'

In [None]:
# Save trained model to the `new_model` directory.
# NOTE(review): a peft_config was passed to SFTTrainer, so trainer.model is
# presumably a PEFT-wrapped model and this presumably writes only the adapter
# weights and config rather than the full base model — confirm.
# Only the model is saved here; the tokenizer is not written by this call.
trainer.model.save_pretrained(new_model)

