<a href="https://colab.research.google.com/github/fatemafaria142/User-Guided-Approach-for-Science-Exam-Question-Answering/blob/main/TinyLlama_fine_tuning_ScienceQA_text_only_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install accelerate peft bitsandbytes transformers trl datasets

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.7.9-py3-none-any.whl (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m44.2 MB/s[0m eta [36m0:00:00

# **Load the required packages**

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

### **Dataset Link:** https://huggingface.co/datasets/sciq

In [3]:
dataset="tasksource/ScienceQA_text_only"
model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
output_model="tinyllama-ScienceQA-v1"

# **Dataset preparation**

In [5]:
def prepare_train_data(data_id):
    data = load_dataset(data_id, split="train")
    data_df = data.to_pandas()
    data_df["text"] = data_df[["question", "solution"]].apply(lambda x: "<|im_start|>User\n" + x["question"] + " <|im_end|>\n<|im_start|>Assistant\n" + x["solution"] + "<|im_end|>\n", axis=1)
    data = Dataset.from_pandas(data_df)
    return data

In [6]:
data = prepare_train_data(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/576k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/619k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6508 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2144 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [7]:
data

Dataset({
    features: ['question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution', 'text'],
    num_rows: 6508
})

In [8]:
data[0]

{'question': 'Which tense does the sentence use?\nMona will print her name with care.',
 'choices': ['present tense', 'future tense', 'past tense'],
 'answer': 1,
 'hint': '',
 'task': 'closed choice',
 'grade': 'grade2',
 'subject': 'language science',
 'topic': 'verbs',
 'category': 'Verb tense',
 'skill': 'Is the sentence in the past, present, or future tense?',
 'lecture': 'Present tense verbs tell you about something that is happening now.\nMost present-tense verbs are regular. They have no ending, or they end in -s or -es.\nTwo verbs are irregular in the present tense, to be and to have. You must remember their forms.\nPast tense verbs tell you about something that has already happened.\nMost past-tense verbs are regular. They end in -ed.\nSome verbs are irregular in the past tense. You must remember their past-tense forms.\nFuture tense verbs tell you about something that is going to happen.\nAll future-tense verbs use the word will.\nPresent | Past | Future\nwalk, walks | walke

## **We have to model the Model (not the base version)**

In [9]:
def get_model_and_tokenizer(mode_id):

    tokenizer = AutoTokenizer.from_pretrained(mode_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    model.config.use_cache=False
    model.config.pretraining_tp=1
    return model, tokenizer

In [10]:
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# **Setting up the LoRA**

In [11]:
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [18]:
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=250,
        fp16=True,
        # push_to_hub=True
    )

In [19]:
trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )

Map:   0%|          | 0/6508 [00:00<?, ? examples/s]

In [20]:
trainer.train()

Step,Training Loss
10,2.4302
20,1.8096
30,1.6252
40,1.4527
50,1.3624
60,1.2438
70,1.2665
80,1.2542
90,1.2298
100,1.1449


TrainOutput(global_step=250, training_loss=1.2281870651245117, metrics={'train_runtime': 220.6928, 'train_samples_per_second': 4.531, 'train_steps_per_second': 1.133, 'total_flos': 727223482847232.0, 'train_loss': 1.2281870651245117, 'epoch': 0.15})

# **Merging the LoRA with the base model**

In [21]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

model_path = "/content/tinyllama-ScienceQA-v1/checkpoint-250"

peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

model = peft_model.merge_and_unload()

In [22]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

# **Inference from the LLM for one examples**

In [23]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [25]:
def generate_response(user_input):

    prompt = formatted_prompt(user_input)

    inputs = tokenizer([prompt], return_tensors="pt")
    generation_config = GenerationConfig(
        penalty_alpha=0.6,
        do_sample=True,
        top_k=5,
        temperature=0.5,
        repetition_penalty=1.2,
        max_new_tokens=12,
        pad_token_id=tokenizer.eos_token_id
    )
    start_time = perf_counter()

    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

    outputs = model.generate(**inputs, generation_config=generation_config)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()  # Remove leading and trailing whitespaces
    print(decoded_output)
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time, 2)} seconds")

In [27]:
generate_response(user_input='Which word would you find on a dictionary page with the following guide words? file - four')

<|im_start|>user
Which word would you find on a dictionary page with the following guide words? file - four<|im_end|>
<|im_start|>assistant:
Look for words that end in four. The first two
Time taken for inference: 0.51 seconds


# **Inference from the LLM for list of examples**

In [28]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [29]:
from transformers import GenerationConfig
from time import perf_counter

def generate_responses(user_inputs):
    for user_input in user_inputs:
        prompt = formatted_prompt(user_input)

        inputs = tokenizer([prompt], return_tensors="pt")
        generation_config = GenerationConfig(
            penalty_alpha=0.6,
            do_sample=True,
            top_k=5,
            temperature=0.5,
            repetition_penalty=1.2,
            max_new_tokens=12,
            pad_token_id=tokenizer.eos_token_id
        )

        start_time = perf_counter()
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs, generation_config=generation_config)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()  # Remove leading and trailing whitespaces
        decoded_output = decoded_output.replace('<', '')  # Remove the '<' symbol
        decoded_output = decoded_output.replace('>', '')  # Remove the '>' symbol
        print(decoded_output)
        output_time = perf_counter() - start_time
        print(f"Time taken for inference: {round(output_time, 2)} seconds\n")


In [30]:
# Example usage
user_inputs = [
    'Does this passage describe the weather or the climate? There was rain and sleet in Sioux Falls, South Dakota, last weekend.',
    'Which is a simple sentence?',
    'Which tense does the sentence use? Mr. Norman signed his name on the letter.',]
generate_responses(user_inputs)


|im_start|user
Does this passage describe the weather or the climate? There was rain and sleet in Sioux Falls, South Dakota, last weekend.|im_end|
|im_start|assistant:
This passage describes a change of weather. The weather changed
Time taken for inference: 0.55 seconds

|im_start|user
Which is a simple sentence?|im_end|
|im_start|assistant:
The first sentence uses the verb tell. The second sentence
Time taken for inference: 0.53 seconds

|im_start|user
Which tense does the sentence use? Mr. Norman signed his name on the letter.|im_end|
|im_start|assistant:
The second verb is in present continuous tense, which
Time taken for inference: 0.65 seconds

