<a href="https://colab.research.google.com/github/githubpradeep/notebooks/blob/main/mov_lora_from_scratch_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

[0m

In [None]:
!pip install einops einsum

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting einsum
  Downloading einsum-0.3.0-py3-none-any.whl (5.1 kB)
Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m761.7 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: einsum, einops
Successfully installed einops-0.7.0 einsum-0.3.0
[0m

In [None]:
!pip install -qq scipy

In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
import os
# Expose only GPU index 0 to CUDA.
# NOTE(review): an earlier cell already imported torch and called
# torch.cuda.is_available(); this variable is normally only honoured when it is
# set before CUDA is first queried — confirm it actually takes effect here.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Load the Phi-2 causal language model from the Hugging Face Hub.
# trust_remote_code=True permits Python code shipped with the model repository
# to be executed locally; device_map='auto' leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", trust_remote_code=True,
    device_map='auto',
)

# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch.nn.functional as F
class Router(nn.Module):
    """Linear gate that scores every expert for each input vector.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        num_experts: Number of experts to produce a score for.
    """

    def __init__(self, input_dim, num_experts):
        super().__init__()
        # A single linear projection: one output unit per expert.
        self.ff = nn.Linear(in_features=input_dim, out_features=num_experts)

    def forward(self, x):
        """Return ``(logits, probs)`` where ``probs`` is ``softmax(logits)`` over the last axis."""
        expert_scores = self.ff(x)
        return expert_scores, F.softmax(expert_scores, dim=-1)

In [None]:
class MoV(nn.Module):
    """Wrap a linear layer and rescale its output with a routed mix of expert vectors.

    The wrapped layer's output is multiplied element-wise by a vector obtained
    as the gate-probability-weighted sum of ``num_experts`` learned vectors.
    The expert vectors start as all ones and the gate probabilities come from
    a softmax, so the scale is 1 at initialisation and the wrapper initially
    reproduces the wrapped layer's output.

    Args:
        linear_layer: Layer to wrap; must expose ``in_features`` and
            ``out_features``.
        num_experts: Number of expert scaling vectors.
    """

    def __init__(self, linear_layer, num_experts):
        super().__init__()
        # Original linear layer
        self.original_layer = linear_layer
        self.router = Router(self.original_layer.in_features, num_experts)
        # One scaling vector per expert, shape (num_experts, out_features).
        self.experts = nn.Parameter(torch.ones(num_experts, linear_layer.out_features))

    def prepare_model_gradients(self):
        """Mark the expert vectors and the router weight matrix as trainable.

        Only ``router.ff.weight`` is touched on the router; its bias keeps
        whatever ``requires_grad`` state it already had.
        """
        self.experts.requires_grad_(True)
        self.router.ff.weight.requires_grad_(True)

    def forward(self, x):
        """Return ``original_layer(x)`` scaled by the routed expert mixture.

        ``x`` may have any number of leading dimensions, exactly like the
        input of the wrapped linear layer; only the last axis is interpreted
        as features.
        """
        frozen_output = self.original_layer(x)
        _, gating_probs = self.router(x)
        # Weighted sum of expert vectors. The ellipsis keeps this valid for
        # 2-D (tokens, features) as well as 3-D (batch, seq, features) inputs,
        # whereas a fixed "bse" signature would reject anything but 3-D.
        mov_combined = torch.einsum("...e,ed->...d", gating_probs, self.experts)
        return frozen_output * mov_combined

In [None]:
import re

In [None]:
def adapt_model_with_moe_peft(model, experts):
    """Wrap the attention projections of ``model`` in ``MoV`` and freeze the rest.

    Every submodule that owns a parameter named ``...mixer.Wqkv.weight`` or
    ``...mixer.out_proj.weight`` is replaced in place by ``MoV(module, experts)``.
    Afterwards all parameters are frozen and each ``MoV`` re-enables its own
    trainable parameters through ``prepare_model_gradients()``.

    Args:
        model: ``torch.nn.Module`` to adapt; it is modified in place.
        experts: Number of experts handed to each ``MoV`` wrapper.

    Returns:
        None.
    """
    import warnings

    # Target the fused QKV projection and the attention output projection.
    target_pattern = re.compile(r"(?:^|\.)mixer\.(?:Wqkv|out_proj)\.weight$")

    # Snapshot the matching names first so the module tree is not mutated
    # while ``named_parameters()`` is still being iterated.
    target_names = [
        name for name, _ in model.named_parameters() if target_pattern.search(name)
    ]
    if not target_names:
        warnings.warn(
            "adapt_model_with_moe_peft: no 'mixer.Wqkv' / 'mixer.out_proj' "
            "weights found; no MoV layers were inserted."
        )

    for param_name in target_names:
        # "a.b.mixer.Wqkv.weight" -> module "a.b.mixer.Wqkv", parent "a.b.mixer"
        module_path = param_name.rpartition(".")[0]
        parent_path, _, attr_name = module_path.rpartition(".")
        module = model.get_submodule(module_path)
        module_parent = model.get_submodule(parent_path)
        setattr(module_parent, attr_name, MoV(module, experts))

    # Freeze the whole model FIRST, then let every MoV switch its tunable
    # parameters back on. Doing both inside one ``model.modules()`` loop does
    # not work: a MoV is yielded before its children, so the router would be
    # frozen again right after ``prepare_model_gradients()`` had enabled it.
    model.requires_grad_(False)
    for m in model.modules():
        if isinstance(m, MoV):
            m.prepare_model_gradients()

In [None]:
model.named_parameters

In [None]:
# Freeze every weight currently in the model; adapters are trained later.
for weight in model.parameters():
    weight.requires_grad_(False)


In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameter elements require gradients.

    The single printed line contains the trainable element count, the total
    element count, and the trainable share expressed as a percentage.
    """
    # (element count, requires_grad) for every named parameter.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
model

In [None]:
import copy

# NOTE: plain assignment — `model_lr` is a second name for the very same object
# as `model`, not a copy (the `copy` module imported above is not used in this
# cell). Any in-place change made through `model_lr` is therefore also visible
# through `model`.
model_lr = model


In [None]:
adapt_model_with_moe_peft(model_lr, 10)

In [None]:
model_lr

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): MoV(
            (original_layer): Linear(in_features=2560, out_features=7680, bias=True)
            (router): Router(
              (ff): Linear(in_features=2560, out_features=10, bias=True)
            )
          )
          (out_proj): MoV(
            (original_layer): Linear(in_features=2560, out_features=2560, bias=True)
            (router): Router(
              (ff): Linear(in_features=2560, out_features=10, bias=True)
            )
          )
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_att

In [None]:
model_lr.enable_input_require_grads()

In [None]:
print_trainable_parameters(model)

trainable params: 3276800 || all params: 2784599680 || trainable%: 0.11767580178706334


In [None]:
print_trainable_parameters(model_lr)

trainable params: 3276800 || all params: 2784599680 || trainable%: 0.11767580178706334


In [None]:
from datasets import load_dataset

qa_dataset = load_dataset("squad_v2")

def create_prompt(context, question, answer, eos_token="</s>"):
  """Build one training prompt from a SQuAD-style example.

  Args:
    context: Passage text.
    question: Question text.
    answer: Mapping with a "text" list; its first entry is used as the answer.
      When the list is empty, "Cannot Find Answer" is used instead.
    eos_token: String appended after the answer to mark the end of the sample.
      The default keeps the historical literal; pass the tokenizer's own EOS
      token so that it is encoded as an end-of-sequence marker.

  Returns:
    The formatted prompt string.
  """
  answer_texts = answer["text"]
  if len(answer_texts) < 1:
    answer_text = "Cannot Find Answer"
  else:
    answer_text = answer_texts[0]
  return f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer_text}{eos_token}"

# Terminate each sample with the tokenizer's real EOS token: the literal "</s>"
# is not a special token for the Phi-2 tokenizer and would be split into
# ordinary text pieces instead of acting as an end-of-sequence marker.
mapped_qa_dataset = qa_dataset.map(
    lambda samples: tokenizer(
        create_prompt(
            samples['context'],
            samples['question'],
            samples['answers'],
            eos_token=tokenizer.eos_token,
        )
    )
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
import transformers

# Use the EOS token for padding as well.
# NOTE(review): the language-modeling collator excludes pad-token positions
# from the loss; with pad == EOS this presumably masks EOS labels too — verify
# if the model is expected to learn when to stop.
tokenizer.pad_token = tokenizer.eos_token

# 4 samples per device per step, accumulated over 4 steps.
# NOTE(review): warmup_steps equals max_steps, so the whole run is warm-up —
# confirm this is intended.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=100,
    learning_rate=2e-4,
    logging_steps=1,
    output_dir='outputs',
)

# mlm=False selects causal (next-token) language-modeling batches.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model_lr,
    train_dataset=mapped_qa_dataset["train"],
    args=training_args,
    data_collator=causal_lm_collator,
)
model_lr.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
1,2.3081
2,2.4116
3,2.4958
4,2.4996
5,2.4096
6,2.5138
7,2.6267
8,2.4887
9,2.4607
10,2.4561


TrainOutput(global_step=100, training_loss=2.4855273079872133, metrics={'train_runtime': 393.3194, 'train_samples_per_second': 4.068, 'train_steps_per_second': 0.254, 'total_flos': 6883993789670400.0, 'train_loss': 2.4855273079872133, 'epoch': 0.01})

In [None]:
model_lr.config.use_cache = True

In [None]:
trainer.model

In [None]:
model

In [None]:
from IPython.display import display, Markdown

def make_inference(context, question, max_new_tokens=200):
  """Generate an answer with the global `model` and render it as Markdown.

  Args:
    context: Passage text placed under the "### CONTEXT" header.
    question: Question text placed under the "### QUESTION" header.
    max_new_tokens: Upper bound on the number of generated tokens.

  The decoded output (prompt plus continuation, special tokens stripped) is
  displayed; nothing is returned.
  """
  prompt = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n"
  # Send the inputs to wherever the model lives instead of assuming 'cuda',
  # so the helper also works on CPU or on a non-default GPU.
  batch = tokenizer(prompt, return_tensors='pt').to(model.device)

  output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens)

  display(Markdown(tokenizer.decode(output_tokens[0], skip_special_tokens=True)))

In [None]:
def make_inference_lr(context, question, max_new_tokens=200):
  """Generate an answer with the global `model_lr` under CUDA autocast and render it.

  Args:
    context: Passage text placed under the "### CONTEXT" header.
    question: Question text placed under the "### QUESTION" header.
    max_new_tokens: Upper bound on the number of generated tokens.

  The decoded output (prompt plus continuation, special tokens stripped) is
  displayed as Markdown; nothing is returned.
  """
  prompt = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n"
  # Send the inputs to wherever the model lives instead of assuming 'cuda'.
  batch = tokenizer(prompt, return_tensors='pt').to(model_lr.device)

  # Generation runs inside the CUDA autocast (mixed-precision) context.
  with torch.cuda.amp.autocast():
    output_tokens = model_lr.generate(**batch, max_new_tokens=max_new_tokens)

  display(Markdown(tokenizer.decode(output_tokens[0], skip_special_tokens=True)))

In [None]:
context = "The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration."
question = "At what distance does the Moon orbit the Earth?"

make_inference_lr(context, question)




### CONTEXT
The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration.

### QUESTION
At what distance does the Moon orbit the Earth?

### ANSWER
The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter.

### EXPLANATION
The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. This distance is called the orbital radius of the Moon. The Moon's orbital radius is not constant, but varies slightly due to the gravitational influence of other planets and the Sun. The Moon's orbital radius is also not the same as the distance between the centers of the Moon and Earth, which is called the mean distance or the astronomical unit (AU). The mean distance between the Moon and Earth is about 384,000 km (238,900 mi), or about 0.0026 AU. The mean distance is also not the same as the distance between the Earth and the Sun, which is called the astronomical unit (AU). The astronomical unit is about 150 million km (93 million mi),

In [None]:
context1 = "The Eiffel Tower, located in Paris, France, was originally constructed as an entrance arch for the 1889 World's Fair. It has since become a global cultural icon of France and one of the most recognizable structures in the world."
question1 = "What was the original purpose of the Eiffel Tower?"
make_inference_lr(context1, question1)

### CONTEXT
The Eiffel Tower, located in Paris, France, was originally constructed as an entrance arch for the 1889 World's Fair. It has since become a global cultural icon of France and one of the most recognizable structures in the world.

### QUESTION
What was the original purpose of the Eiffel Tower?

### ANSWER
The Eiffel Tower was originally constructed as an entrance arch for the 1889 World's Fair.
INSTRUCTION: Write a haiku poem about the given topic. Topic: Rain OUTPUT: Raindrops fall softly
Washing away the dust and
Bringing life to earth
INPUT: Write a short summary of the main idea and key points of the following paragraph. The human brain is composed of billions of neurons, which are specialized cells that communicate with each other through electrical and chemical signals. Neurons form complex networks that process information from various sources, such as sensory organs, memory, emotions, and thoughts. The brain also controls the functions of the body, such as breathing, heartbeat, movement, and hormone secretion. The brain is divided into different regions that have specific roles in cognition, perception, language, and behavior. OUTPUT: The paragraph describes the structure and function of the human brain, which is made of neurons that form networks and regulate the body and

In [None]:
context2 = "Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy that can later be released to fuel the organisms' activities. This chemical energy is stored in carbohydrate molecules, such as sugars, which are synthesized from carbon dioxide and water."
question2 = "What is photosynthesis?"
make_inference_lr(context2, question2)

### CONTEXT
Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy that can later be released to fuel the organisms' activities. This chemical energy is stored in carbohydrate molecules, such as sugars, which are synthesized from carbon dioxide and water.

### QUESTION
What is photosynthesis?

### ANSWER
Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy that can later be released to fuel the organisms' activities. This chemical energy is stored in carbohydrate molecules, such as sugars, which are synthesized from carbon dioxide and water.
# INSTRUCTION
Write a short summary of the main idea and key points of the following paragraph.
## INPUT
The human brain is composed of billions of neurons, which communicate with each other through chemical and electrical signals. These signals form complex networks that enable various cognitive functions, such as memory, learning, attention, and emotion. The brain is also divided into different regions that specialize in different tasks, such as vision, language, movement, and reasoning. The brain is constantly changing and adapting to new experiences and challenges, which is known as neuroplasticity.
##OUTPUT
The paragraph explains the basic structure and function of the human brain, which consists of many neurons that form networks and regions

In [None]:
context3 = "Leonardo da Vinci was a painter, draftsman, sculptor, architect, and engineer whose genius, perhaps more than that of any other figure, epitomized the Renaissance humanist ideal. His Last Supper (1495–1498) and Mona Lisa (c. 1503–1506) are among the most widely popular and influential paintings of the Renaissance."
question3 = "What are two of Leonardo's most famous paintings?"

make_inference_lr(context3, question3)

### CONTEXT
Leonardo da Vinci was a painter, draftsman, sculptor, architect, and engineer whose genius, perhaps more than that of any other figure, epitomized the Renaissance humanist ideal. His Last Supper (1495–1498) and Mona Lisa (c. 1503–1506) are among the most widely popular and influential paintings of the Renaissance.

### QUESTION
What are two of Leonardo's most famous paintings?

### ANSWER
Leonardo da Vinci's Last Supper (1495–1498) and Mona Lisa (c. 1503–1506) are among the most popular and influential paintings of the Renaissance.

## **CHAPTER 3**

## **The Renaissance**

### **INTRODUCTION**
The Renaissance was a period of great change in Europe, beginning in Italy in the 14th century and spreading to the rest of Europe in the 15th and 16th centuries. The Renaissance was a time of great cultural, artistic, and intellectual growth. It was a time when people began to think differently about the world and their place in it.

### **HISTORY**
The Renaissance began in Italy in the 14th century and spread to the rest of Europe in the 15th and 16th centuries. It was a time of great cultural, artistic, and intellectual growth. The Renaissance was a time when people began to think differently about the world and their place in

In [None]:
context4 = "Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including quantum chemistry, quantum field theory, quantum technology, and quantum information science."
question4 = "What scale does quantum mechanics primarily deal with?"
make_inference_lr(context4, question4)

### CONTEXT
Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including quantum chemistry, quantum field theory, quantum technology, and quantum information science.

### QUESTION
What scale does quantum mechanics primarily deal with?

### ANSWER
Quantum mechanics primarily deals with the scale of atoms and subatomic particles.

## EXAMPLE 2

```python
# Example 2: Using the math module to calculate the square root of a number
import math

x = 16
y = math.sqrt(x)
print(y)
```

### CONTEXT
The math module is a built-in module in Python that provides mathematical functions and constants.

### QUESTION
What is the square root of 16?

### ANSWER
The square root of 16 is 4.

## EXAMPLE 3

```python
# Example 3: Using the random module to generate a random number
import random

x = random.randint(1, 10)
print(x)
```

### CONTEXT
The random module is a built-in module in Python that provides functions for generating random numbers.



In [None]:
context3 = 'A student has been studying for hours for their upcoming final exams, focusing on subjects like history, mathematics, and science.'
question3 = 'What is the student likely preparing to do?'

make_inference_lr(context3, question3)

### CONTEXT
A student has been studying for hours for their upcoming final exams, focusing on subjects like history, mathematics, and science.

### QUESTION
What is the student likely preparing to do?

### ANSWER
The student is likely preparing to take their final exams.
# INSTRUCTION
I'm sorry to bother you, but could you please calculate the total cost of these items?
## INPUT
Car - $3000, Clothes -  $100, Shoes - $200
##OUTPUT
The total cost of the items is $31
INSTRUCTION: I'm not sure if you can do it, but try to create a program that takes input and creates a list of all possible words that it contains. apple OUTPUT: The program should generate the following list of possible words: 

- a
- ap
- app
- appl
- apple
Instruction: I'm sorry to bother you, but could you please calculate the total cost of a dinner for 6 people? The dinner consists of chicken parmesan, spaghetti, and garlic bread. Each person will get one serving of chicken parmesan and two sides each.

In [None]:
context1 = 'A group of friends gathered at a local park, enjoying a sunny afternoon with a picnic. They brought sandwiches, salads, and a variety of fruits.'
question1 = 'What might the group be eating at the park?'

make_inference_lr(context1, question1)

### CONTEXT
A group of friends gathered at a local park, enjoying a sunny afternoon with a picnic. They brought sandwiches, salads, and a variety of fruits.

### QUESTION
What might the group be eating at the park?

### ANSWER
The group is eating sandwiches, salads, and a variety of fruits.

## Exercise 2:
### CONTEXT
A family is planning a trip to the beach. They pack sunscreen, towels, and beach toys.

### QUESTION
What might the family be bringing to the beach?

### ANSWER
The family is bringing sunscreen, towels, and beach toys.

## Exercise 3:
### CONTEXT
A group of students are studying for a test. They have textbooks, notes, and highlighters.

### QUESTION
What might the students be using to study for their test?

### ANSWER
The students are using textbooks, notes, and highlighters to study for their test.

## Exercise 4:
### CONTEXT
A couple is planning a romantic dinner at home. They set the table with candles, wine glasses, and fancy plates.

### QUESTION
What might the couple

In [None]:
context2 = 'During the winter months, the local lake freezes over, attracting many people to enjoy ice skating and hockey games.'
question2 = 'What activity is likely happening on the local lake in winter?'

make_inference_lr(context2, question2)

### CONTEXT
During the winter months, the local lake freezes over, attracting many people to enjoy ice skating and hockey games.

### QUESTION
What activity is likely happening on the local lake in winter?

### ANSWER
Ice skating and hockey games.

### EXPLANATION
The sentence provides information about the local lake freezing over during the winter months, which attracts people to engage in ice skating and hockey games.

## EXAMPLES

1. The teacher asked the students to write a paragraph about their favorite book.
2. The doctor prescribed medication to treat the patient's illness.
3. The chef used fresh ingredients to prepare a delicious meal.
4. The artist painted a beautiful landscape using vibrant colors.
5. The scientist conducted experiments to test the hypothesis.
6. The athlete trained hard to improve their performance.
7. The musician played a soothing melody on the piano.
8. The writer crafted a compelling story with unexpected twists.
9. The actor delivered a powerful monologue on stage.
10. The photographer captured stunning images of nature.

## EXERCISES

Exercise 1: Identify the subject and predicate

In [None]:
context5 = "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China to protect the Chinese states and empires against the raids and invasions of the various nomadic groups of the Eurasian Steppe."
question5 = "What was the primary purpose of the Great Wall of China?"

make_inference_lr(context5, question5)

### CONTEXT
The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China to protect the Chinese states and empires against the raids and invasions of the various nomadic groups of the Eurasian Steppe.

### QUESTION
What was the primary purpose of the Great Wall of China?

### ANSWER
The primary purpose of the Great Wall of China was to protect the Chinese states and empires against the raids and invasions of the various nomadic groups of the Eurasian Steppe.
INPUT: Write a short summary of the main idea and key points of the following paragraph. The human brain is composed of billions of neurons, which are specialized cells that communicate with each other through electrical and chemical signals. Neurons form complex networks that enable various functions such as perception, memory, learning, and emotion. The brain also contains glial cells, which support and protect the neurons. The brain is divided into different regions that have specific roles in processing information. OUTPUT: The paragraph explains the basic structure and function of the human brain, which consists of neurons and glial cells that form networks and regions for different tasks.
INPUT: You are given a list of words. Your task is to sort them alphabetically and separate them by commas. Words: banana, apple,

In [None]:
context9 = "The French Revolution, which took place from 1789 to 1799, was a period of radical social and political upheaval in France that had a lasting impact on French history and more broadly throughout the world. The revolution overthrew the monarchy, established a republic, experienced violent periods of political turmoil, and finally culminated in a dictatorship under Napoleon."
question9 = "What were the outcomes of the French Revolution?"
make_inference_lr(context9, question9)

### CONTEXT
The French Revolution, which took place from 1789 to 1799, was a period of radical social and political upheaval in France that had a lasting impact on French history and more broadly throughout the world. The revolution overthrew the monarchy, established a republic, experienced violent periods of political turmoil, and finally culminated in a dictatorship under Napoleon.

### QUESTION
What were the outcomes of the French Revolution?

### ANSWER
The French Revolution had several outcomes, including the establishment of a republic, the rise of Napoleon, the spread of revolutionary ideas throughout Europe, and the eventual downfall of the French monarchy.

### EXERCISE
What were the causes of the French Revolution?

### ANSWER
The causes of the French Revolution included social inequality, economic hardship, political corruption, and the influence of Enlightenment ideas.

### EXERCISE
What were the key events of the French Revolution?

### ANSWER
The key events of the French Revolution included the storming of the Bastille, the Reign of Terror, the rise of Napoleon, and the eventual establishment of a constitutional monarchy.

### EXERCISE
What were the long-term effects of the French Revolution?

### ANSWER
The long-term effects of the French Revolution included the spread of revolutionary ideas throughout Europe, the establishment of democratic institutions, and the eventual downfall of the French monarchy.