Install necessary packages.
transformers contains the model and many necessary pre-processing tools
peft.git contains components for Parameter efficient fine-tuning
wandb for tracking performance
bitsandbytes and accelerate.git are supports for quantitizing the model into 4-bit

In [None]:
!pip install -q transformers==4.30 einops
!pip install git+https://github.com/huggingface/accelerate
!pip install transformers[torch]
!pip install git+https://github.com/huggingface/peft.git
!pip install datasets bitsandbytes wandb trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-ssgfyw97
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-ssgfyw97
  Resolved https://github.com/huggingface/accelerate to commit 80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9
  Installing build dependen

Load the model with 4-bit quantitisation

In [None]:
import torch
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig


model_name = "tiiuae/falcon-rw-1b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading (…)figuration_falcon.py:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-rw-1b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)n/modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-rw-1b:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Load dataset and preprocess it

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("covid_qa_deepset")
dataset = dataset['train'].train_test_split(test_size=606, train_size=1413)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def convert_answer(row):
  answer = row['answers']['text'][0]
  row['answers'] = answer
  return row

def shorten_context(row):
  #if sequence of context is longer than 512, it will be cut off
  if len(torch.split(row["context"], " ")) > 512:
    input_seq = tokenizer.encode("sumarize: " + row["context"], return_tensors="pt", max_length=512, truncation=True)
    summarized_seq = model.generate(input_seq, min_length = 100, max_length = 512)
    row["context"] = tokenizer.decode(summarized_seq[0])
  return row

def output_text(row):
  row['text'] = "### Context: " + row['context'] +  "### Question: " + row['question'] + "### Answer: " + row['answers']
  return row

train_data = dataset["train"]
train_data.add_column(name="text", column=train_data['answers'])
train_data = train_data.map(convert_answer)
train_data = train_data.map(shorten_context)
train_data = train_data.map(output_text)
train_data.remove_columns('document_id')
train_data.remove_columns('is_impossible')
train_data.remove_columns('id')

Flattening the indices:   0%|          | 0/1413 [00:00<?, ? examples/s]

Map:   0%|          | 0/1413 [00:00<?, ? examples/s]

Map:   0%|          | 0/1413 [00:00<?, ? examples/s]

Map:   0%|          | 0/1413 [00:00<?, ? examples/s]

Dataset({
    features: ['document_id', 'context', 'question', 'is_impossible', 'answers', 'text'],
    num_rows: 1413
})

In [14]:
print(dataset)
print(dataset.column_names)

DatasetDict({
    train: Dataset({
        features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
        num_rows: 1413
    })
    test: Dataset({
        features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
        num_rows: 606
    })
})
{'train': ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'], 'test': ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers']}


In [24]:
dataset['train']['answers'][0]['text']

['among older adults, especially males, with pre-existing diseases.']

In [15]:
dataset['train']['answers'][0]['text'][0]

'among older adults, especially males, with pre-existing diseases.'

Configure LoRa finetuning

In [10]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

Create training arguments

In [11]:
from transformers import TrainingArguments



training_arguments = TrainingArguments(

    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs = 1,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=5,
    max_steps = 500,
    optim = "paged_adamw_32bit",
    output_dir = "./results",
    lr_scheduler_type = "constant",
    warmup_ratio = 0.03,
    max_grad_norm = 0.3,
    save_steps = 10,
    group_by_length=True,
    report_to=None
)

Load the supervise fine-tuning trainer

In [17]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/1413 [00:00<?, ? examples/s]

In [18]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

Finally, train the model

In [20]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [22]:
import wandb

wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mbobbyngoson204[0m ([33mintelligentsystem[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.10
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20230921_022106-ql8tuta9[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mtwilight-hill-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/intelligentsystem/uncategorized[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/intelligentsystem/uncategorized/runs/ql8tuta9[0m


In [23]:
trainer.train()

Step,Training Loss
5,2.1843
10,2.184
15,2.0238
20,2.032
25,1.8857
30,1.7904
35,1.6556
40,1.5275
45,1.4518
50,1.241


KeyboardInterrupt: ignored