<a href="https://colab.research.google.com/github/fatemafaria142/From-Text-to-Code-Generation-Pipeline-Incorporating-Various-LLMs/blob/main/Code_Generation_using_TinyLlama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install accelerate peft bitsandbytes transformers trl datasets

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.7.9-py3-none-any.whl (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m53.8 MB/s[0m eta [36m0:00:00

# **Load the required packages**

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

### **Dataset Link:** https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k

In [3]:
dataset="sahil2801/CodeAlpaca-20k"
model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
output_model="tinyllama-Code-v1"

# **Dataset preparation**

In [4]:
def prepare_train_data(data_id, num_samples=500):
    data = load_dataset(data_id, split="train")

    # Take the first 1000 rows
    data = data.select(range(min(num_samples, len(data))))

    data_df = data.to_pandas()
    data_df["text"] = data_df[["instruction", "input", "output"]].apply(lambda x: "user\n" + x["instruction"] + "  \n input\n" + x["input"] + "\nassistant\n" + x["output"] + "\n", axis=1)
    data = Dataset.from_pandas(data_df)

    return data



In [5]:
data = prepare_train_data(dataset, num_samples=500)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.06M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
data

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 500
})

In [7]:
data[0]

{'instruction': 'Create an array of length 5 which contains all even numbers between 1 and 10.',
 'input': '',
 'output': 'arr = [2, 4, 6, 8, 10]',
 'text': 'user\nCreate an array of length 5 which contains all even numbers between 1 and 10.  \n input\n\nassistant\narr = [2, 4, 6, 8, 10]\n'}

In [8]:
data[2]

{'instruction': 'Write a replace method for a string class which replaces the given string with a given set of characters.',
 'input': 'string = "Hello World!"\nreplace_with = "Greetings!"',
 'output': 'def replace(self, replace_with):\n    new_string = ""\n    for char in self:\n        if char == " ":\n            new_string += replace_with\n        else:\n            new_string += char\n    return new_string',
 'text': 'user\nWrite a replace method for a string class which replaces the given string with a given set of characters.  \n input\nstring = "Hello World!"\nreplace_with = "Greetings!"\nassistant\ndef replace(self, replace_with):\n    new_string = ""\n    for char in self:\n        if char == " ":\n            new_string += replace_with\n        else:\n            new_string += char\n    return new_string\n'}

In [9]:
data[24]

{'instruction': 'Modify the code to perform the mathematical expression “x to the power of y”.',
 'input': '# Calculate x to the power of y\n# TODO',
 'output': '# Calculate x to the power of y\nx_pow_y = x**y',
 'text': 'user\nModify the code to perform the mathematical expression “x to the power of y”.  \n input\n# Calculate x to the power of y\n# TODO\nassistant\n# Calculate x to the power of y\nx_pow_y = x**y\n'}

## **We have to model the Model (not the base version)**

In [10]:
def get_model_and_tokenizer(mode_id):

    tokenizer = AutoTokenizer.from_pretrained(mode_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    model.config.use_cache=False
    model.config.pretraining_tp=1
    return model, tokenizer

In [11]:
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# **Setting up the LoRA**

In [12]:
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [19]:
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=250,
        fp16=True,
        # push_to_hub=True
    )

In [20]:
trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [21]:
trainer.train()

Step,Training Loss
10,1.5626
20,1.4822
30,1.3788
40,1.0409
50,0.9676
60,0.9626
70,0.9938
80,0.8483
90,0.9326
100,0.7968


TrainOutput(global_step=250, training_loss=0.9028351135253906, metrics={'train_runtime': 201.0908, 'train_samples_per_second': 4.973, 'train_steps_per_second': 1.243, 'total_flos': 644088615100416.0, 'train_loss': 0.9028351135253906, 'epoch': 2.0})

# **Merging the LoRA with the base model**

In [22]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

model_path = "/content/tinyllama-Code-v1/checkpoint-250"

peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

model = peft_model.merge_and_unload()

In [23]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

# **Inference from the LLM for some examples**

In [39]:
def formatted_prompt(instruction, input)-> str:
    return f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>input\n{input}<|im_end|>\n<|im_start|>assistant:"

In [44]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(instruction, user_input):

    prompt = formatted_prompt(instruction, user_input)

    inputs = tokenizer([prompt], return_tensors="pt")
    generation_config = GenerationConfig(
        penalty_alpha=0.6,
        do_sample=True,
        top_k=5,
        temperature=0.5,
        repetition_penalty=1.2,
        max_new_tokens=120,
        pad_token_id=tokenizer.eos_token_id
    )
    start_time = perf_counter()

    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

    outputs = model.generate(**inputs, generation_config=generation_config)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()  # Remove leading and trailing whitespaces
    print(decoded_output)
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time, 2)} seconds")

#### **Example No:1**

In [55]:
instruction = 'Create an array of length 5 which contains all even numbers between 1 and 10.'
user_input = ''
generate_response(instruction=instruction, user_input=user_input)

<|im_start|>user
Create an array of length 5 which contains all even numbers between 1 and 10.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant:
[2, 4, 6, 8, 10]
"""
This code creates a list containing all the even numbers between 1 to 10.
Input:
len(arr) = 5
arr = [2, 4, 6, 8, 10]
Output:
[[2], [4], [6], [8], [10]]
"""
Time taken for inference: 6.31 seconds

<|im_start|>user
Create an array of length 5 which contains all even numbers between 1 and 10.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant:
[2, 4, 6, 8, 10]
"""
This is a list containing all the even numbers between 1 to 10.
The output can be seen below. """

# Create the list
even_list = [2, 4, 6, 8, 10]
print(even_list)
Time taken for inference: 3.11 seconds

<|im_start|>user
Create an array of length 5 which contains all even numbers between 1 and 10.<|im_end|>
<|im_start|>input
string = " A B C D "<|im_end|>
<|im_start|>assistant:
arr = [i for I in range(1, 11)]   # create a list with odd numbers f

#### **Example No:2**

In [57]:
instruction = 'Write a replace method for a string class which replaces the given string with a given set of characters.'
user_input = 'string = "Hello World!" replace_with = "Greetings!"'
generate_response(instruction=instruction, user_input=user_input)

<|im_start|>user
Write a replace method for a string class which replaces the given string with a given set of characters.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant:
class String(object):
    def __init__(self, s = "", *chars):
        self.s = s
        self._setChars(*chars)
        
    def __str__(self):
        return "".join(c for c in self.s if c in self._setChars)
    
    @property
    def chars(self):
        return list(self.s)
  
def findSetOfCharacters(string, charactersToFind):
    """Returns True if all characters in the string are found in the given set
Time taken for inference: 4.78 seconds

<|im_start|>user
Write a replace method for a string class which replaces the given string with a given set of characters.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant: 
class String:
    def __init__(self, str):
        self.str = str
        
    def replace(self, char1, char2):
        return ''.join([char1]+list(map(lambda c:c if c != char2 el

#### **Example No:3**

In [56]:
instruction =  'Formulate an equation to calculate the height of a triangle given the angle, side lengths and opposite side length.'
user_input = ''
generate_response(instruction=instruction, user_input=user_input)

<|im_start|>user
Formulate an equation to calculate the height of a triangle given the angle, side lengths and opposite side length.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant: To find the height of a triangle using its sides and the known value of the base, we can use the formula H = sqrt((a + b)²/6). Here's how it works.
  
  Input:
    Angle (in radians): 45
    Side1 (length):       2
    Side2 (length):        3
    OppositeSideLength (length):    8

Output:
  
  Calculated Height is 7.096937018629999

Note: This assumes
Time taken for inference: 5.41 seconds

<|im_start|>user
Formulate an equation to calculate the height of a triangle given the angle, side lengths and opposite side length.<|im_end|>
<|im_start|>input
<|im_end|>
<|im_start|>assistant: 
height = (s1 + s2) / 2 * cos(angle) - s3
where s1, s2, s3 are side lengths. angle is the measure of the angle between the two sides of the triangle.  
output
4
assistant: 
height = ((s1 + s2)/2)*cos(-angle)-s3
wher