In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import os

In [2]:
from peft import get_peft_model, LoraConfig
import torch

In [3]:
HF_access_token = "hf_xxxx"
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          token=HF_access_token, 
                                          padding_side="left",
                                          add_eos_token=True,
                                          model_max_length=256)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", token=HF_access_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 13,207,552 || all params: 3,225,957,376 || trainable%: 0.4094




In [5]:
data_folder = "cs685"  # Folder with text files containing lecture notes, papers, etc.
texts = []

# Load and preprocess text data
for file in os.listdir(data_folder):
    if os.path.isdir(file) or file == ".ipynb_checkpoints":
        continue
    with open(os.path.join(data_folder, file), "r", encoding="utf-8") as f:
        texts.append(f.read())

In [6]:
# Tokenize the texts
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
inputs = tokenizer(texts, return_tensors="pt", padding="max_length", truncation=True)
inputs

{'input_ids': tensor([[128000,  94317,   5127,  ...,   1095,    596,    636],
        [128000,   1169,    596,  ...,    779,    584,    649],
        [128000,  94317,   1095,  ...,   1690,   5627,   1314],
        ...,
        [128000,  94317,   1095,  ...,   2017,    279,  28223],
        [128000,  94317,   1095,  ...,   2017,    279,  28223],
        [128000,    198,   6014,  ...,   2763,    315,   4860]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
from datasets import Dataset
labels = inputs["input_ids"].clone()  # We use the same tokens as labels

dataset = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': labels
})

In [None]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_steps=500,
    num_train_epochs=10,
    learning_rate=1e-4,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.971
20,2.7761
30,2.3337
40,1.9882
50,1.5556
60,1.343
70,0.9093


In [None]:
output_dir = './fine_tuned_llama_model'
model.save_pretrained(output_dir, safe_serialization=False)
tokenizer.save_pretrained(output_dir)

In [82]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3072, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=3072, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (defaul

In [83]:
def ask_model(prompt, max_tokens=350, temperature=0.7, repetition_penalty=1.2):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,      # Maximum number of tokens in the response
            temperature=temperature,        # Controls randomness in response
            repetition_penalty=repetition_penalty  # Discourages repeated phrases
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [84]:
ask_model("In the context of transfer learning, what does the process of fine-tuning specifically refer to?\
          a) Adjusting the learning rate during the pretraining phase \
          b) Modifying the architecture of the pretrained model to the better suit a specific task \
          c) training a pretrained model for a few additional epochs on a task-specific dataset \
          d) Replacing the transformer attention mechanism for specific tasks")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"In the context of transfer learning, what does the process of fine-tuning specifically refer to?          a) Adjusting the learning rate during the pretraining phase           b) Modifying the architecture of the pretrained model to the better suit a specific task           c) training a pretrained model for a few additional epochs on a task-specific dataset           d) Replacing the transformer attention mechanism for specific tasks\nThis question is about understanding  the concept of  transfer  learning and how  fine-tuning  a pre-trained model can be implemented. The correct option C states that in the process of  fine-tuning, the  pretrained  model is trained for a few more epochs on a task-specific dataset. This would update the model's parameters to make them more task-specific while keeping the rest of the model the same as in the pre-trained state. Option A is incorrect because it refers to something that happens during the  pre-training  phase. Option B is also incorrect si

In [85]:
ask_model(" Explain how the prompt tuning method discussed in class allows us to solve multiple different NLP tasks \
within a single batch")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


' Explain how the prompt tuning method discussed in class allows us to solve multiple different NLP tasks within a single batch\nPrompt tuning  is  allowed  because  large  language  models  are  able  to  capture\ncomplex  relationships  between  words  and  can  be  trained  to  predict  novel  tokens.\nThis  ability  of  language  models  to  handle  outofvocabulary  tokens  enables  us  to\ninstruct  the  model  to  predict  novel  tokens  that  are  specific  to  certain  nlp  tasks,\nwhile  keeping  the  rest  of  the  architecture  and  training  procedure  same  as  before\nbecause  the  cross-entropy  loss  function  used  during  training  no  longer  has  open-voyage\ntokens. We also don’t have to update or retraining all the models that were mentioned earlier such\nas sentiment analysis, question answering and text summarization since we are just updating the\nlanguage model that was-trained on general purpose text without any nlp task specification.\nalso note that all the

In [89]:
ask_model("when were the midterm grades released?")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'when were the midterm grades released? 11/8 update:  midterms are finally over for most students. 11 /8 update : uh so like i said before many of you have your own\nInstructors for CS 685 Spring 2023\nO n. 11 /8, we had 93 some students take a quiz to make up the first assignment um today we ﬁnalized all those quizzes and graded them uh as always we’re doing our best\nto get these grades out as fast as possible while also allowing time for us to grade the homeworks that are due um on February whatever uh 16 or 17 so you’ll have an idea\nof your homework grades pretty soon uh okay so with that let’s um see uh today we’re going to talk about oh God this is a very important\ntopic um alignment ﬁrst from an instructor perspective um so we don’t have a template for our new ﬂip class um but uh we do have a\nteam that is working on creating a template for the ﬂip version of our old class CS 15 in fact we’ve been using this template to build the\nnew CS 685 um instance that we’re on right now

In [None]:
ask_model("What are encoder-decoder models?")