In [None]:
!pip install -q -U sentencepiece

In [None]:
# Import the required modules
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the model and tokenizer
model_name_or_path = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# GPT-2 ships without a padding token; reuse EOS so that variable-length
# examples can be padded into a batch by the data collator below.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define the LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM", # wrap the model as a causal language model
    r=8, # rank of the low-rank matrix
    lora_alpha=32, # scaling factor of the low-rank matrix
    lora_dropout=0.1, # dropout rate of the low-rank matrix
    fan_in_fan_out=True, # GPT-2 uses Conv1D layers that store weights as (fan_in, fan_out)
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Maximum number of tokens kept per example (GPT-2 supports up to 1024)
MAX_SEQ_LENGTH = 512


def tokenize_batch(batch):
    """Convert a batch of raw text lines into token ids for the model."""
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# The Trainer needs token ids, not raw strings: tokenize every split and drop
# the original text column so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_batch, batched=True, remove_columns=["text"])
# WikiText contains many blank lines, which tokenize to empty sequences and
# carry no training signal; remove them.
tokenized_dataset = tokenized_dataset.filter(lambda example: len(example["input_ids"]) > 0)

# Pads each batch and copies input_ids into labels (mlm=False -> causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="lora-gpt2", # output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=16, # batch size per device during training
    per_device_eval_batch_size=16, # batch size for evaluation
    logging_steps=500, # number of steps between logging
    save_steps=500, # number of steps between model saves
    # NOTE: newer transformers releases rename this argument to `eval_strategy`;
    # switch the keyword if the installed version rejects `evaluation_strategy`.
    evaluation_strategy="steps", # evaluate every eval_steps
    eval_steps=500, # number of steps between evaluations
    learning_rate=5e-5, # learning rate
    weight_decay=0.01, # weight decay
)

# Define the trainer
trainer = Trainer(
    model=model, # the model to train
    args=training_args, # the training arguments
    train_dataset=tokenized_dataset["train"], # the tokenized training dataset
    eval_dataset=tokenized_dataset["validation"], # the tokenized evaluation dataset
    data_collator=data_collator, # builds padded batches with labels
)

# Start the training
trainer.train()


In [None]:
from pathlib import Path
from sentencepiece import SentencePieceProcessor
from typing import List


class Tokenizer:
    """Small convenience wrapper around a SentencePiece model.

    The wrapped processor is stored on ``self._model``; every method and
    property simply forwards to it.
    """

    def __init__(self, model_path: str):
        """Load the SentencePiece model file located at ``model_path``.

        Raises:
            AssertionError: if ``model_path`` does not exist (the path is used
                as the assertion message), or if the loaded model reports a
                vocabulary size different from its piece count.
        """
        assert Path(model_path).exists(), model_path
        processor = SentencePieceProcessor(model_file=model_path)
        self._model = processor
        assert processor.vocab_size() == processor.get_piece_size()

    @property
    def n_words(self) -> int:
        """Vocabulary size reported by the underlying model."""
        return self._model.vocab_size()

    @property
    def bos_id(self) -> int:
        """Id of the beginning-of-sequence token."""
        return self._model.bos_id()

    @property
    def eos_id(self) -> int:
        """Id of the end-of-sequence token."""
        return self._model.eos_id()

    @property
    def pad_id(self) -> int:
        """Id of the padding token as reported by the underlying model."""
        return self._model.pad_id()

    def encode(self, s: str, bos: bool = True) -> List[int]:
        """Encode ``s`` into token ids.

        Args:
            s: text to encode; must be a ``str``.
            bos: when true, the BOS id is placed in front of the encoded ids.

        Returns:
            The token ids, optionally prefixed with ``bos_id``.
        """
        assert isinstance(s, str)
        token_ids = self._model.encode(s)
        return [self.bos_id, *token_ids] if bos else token_ids

    def decode(self, t: List[int]) -> str:
        """Decode the token ids ``t`` back into text."""
        return self._model.decode(t)

In [None]:
import torch
from typing import Tuple

# The code you provided
def precompute_freqs_cis(dim: int, end: int, theta: float) -> torch.Tensor:
    """Build the table of unit complex numbers used for rotary embeddings.

    Args:
        dim: feature dimension being rotated; one frequency is produced per
            pair of features (``dim // 2`` frequencies).
        end: number of positions to precompute (positions ``0 .. end - 1``).
        theta: base of the geometric frequency progression.

    Returns:
        Complex tensor of shape ``(end, dim // 2)`` whose entry ``[p, j]`` is
        ``exp(i * p * theta ** (-2j / dim))``.
    """
    # Exponents 0, 2/dim, 4/dim, ... for each feature pair.
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    # Rotation angle for every (position, frequency) combination.
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    unit_magnitude = torch.ones_like(angles)
    return torch.polar(unit_magnitude, angles)  # complex64


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate query and key features by the given complex rotation table.

    Consecutive pairs along the last dimension of ``xq`` / ``xk`` are treated
    as (real, imaginary) parts of complex numbers and multiplied by
    ``freqs_cis``. A singleton axis is inserted at position 1 of ``freqs_cis``
    so the same rotations broadcast across axis 1 of the inputs.

    Args:
        xq: query tensor with an even-sized last dimension.
        xk: key tensor with an even-sized last dimension.
        freqs_cis: 2-D complex tensor of rotations.

    Returns:
        ``(xq_out, xk_out)``, each cast back to the dtype of its input.
    """
    rotation = freqs_cis[:, None, :]

    def _rotate(x: torch.Tensor) -> torch.Tensor:
        # Group the last dimension into (real, imag) pairs and view as complex.
        pairs = x.float().reshape(*x.shape[:-1], -1, 2)
        rotated = torch.view_as_complex(pairs) * rotation
        # Back to real pairs, then merge everything from axis 2 onwards.
        return torch.view_as_real(rotated).flatten(2).type_as(x)

    return _rotate(xq), _rotate(xk)

# Dummy query/key tensors laid out the way apply_rotary_emb expects them:
# (seq_len, n_heads, head_dim). Axis 0 must be the token position because
# freqs_cis[:, None, :] broadcasts one row of rotations per position across
# the heads on axis 1.
torch.manual_seed(0)  # make the random demo reproducible
seq_len, n_heads, head_dim = 4, 2, 8
xq = torch.randn(seq_len, n_heads, head_dim)
xk = torch.randn(seq_len, n_heads, head_dim)

# Values for dim, end, and theta
# dim must equal head_dim (one complex rotation per pair of features) and
# end must cover every position in the sequence.
dim = head_dim
end = seq_len
# Conventional RoPE base. With theta = 1.0 every frequency collapses to 1,
# so all feature pairs would rotate by the same angle.
theta = 10000.0

# Compute the freqs_cis tensor: shape (end, dim // 2), complex64
freqs_cis = precompute_freqs_cis(dim, end, theta)
print(freqs_cis)
# Apply the rotary embedding to xq and xk
xq_out, xk_out = apply_rotary_emb(xq, xk, freqs_cis)

# Print the output tensors (same shapes as xq and xk)
print(xq_out)
print(xk_out)


tensor([[1.0000+0.0000j],
        [0.5403+0.8415j]])
tensor([[[ 0.3074,  0.9158,  0.2200, -1.6933,  0.7677,  0.8516,  0.8772,
           1.0275],
         [ 0.0847,  1.1582, -0.5125, -0.3631,  0.3798, -0.0918, -2.0022,
           0.3902],
         [ 0.0321, -0.2443, -1.1355, -0.5867,  0.4276,  0.5811,  0.3322,
           0.9117],
         [ 1.6778, -1.9925, -0.2339,  0.2268,  0.0569,  0.3529,  0.2872,
           0.0841]],

        [[ 0.1932, -0.5233,  0.8993,  1.0861, -1.2082,  0.3150, -0.1533,
          -0.0888],
         [-0.1965,  1.2208,  0.8005,  0.3558,  0.8528, -0.5554,  0.6259,
           0.2706],
         [ 0.8407, -1.0696, -2.5355,  0.3427,  0.7675, -0.2888,  2.1063,
          -1.0799],
         [ 0.4715,  0.2453,  2.0120, -0.7407, -0.9866, -0.2389, -0.8688,
          -0.3356]]])
tensor([[[-0.4202, -0.3106,  1.1324,  1.6704, -0.2345, -1.6656,  0.9083,
           1.3251],
         [ 0.9377,  0.5710,  0.0924, -0.4687, -0.1797, -0.9308, -1.5040,
          -0.3121],
         [ 0.

In [None]:
!pip install -q -U simple_parsing

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m61.4/113.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.4/113.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import dataclasses
from typing import List

import torch
import torch.nn.functional as F
from simple_parsing.helpers import Serializable
from torch import nn

# The code you provided
@dataclasses.dataclass
class MoeArgs(Serializable):
    """Configuration for a mixture-of-experts layer."""

    # Presumably the total number of experts in the layer; not read by the
    # MoeLayer code in this cell (it iterates over the experts it is given).
    num_experts: int
    # Number of experts selected per row of gate logits; used as `k` in the
    # torch.topk call inside MoeLayer.forward.
    num_experts_per_tok: int


class MoeLayer(nn.Module):
    """Sparse mixture-of-experts layer.

    For every token the gate scores all experts, the top
    ``num_experts_per_tok`` experts are selected, their scores are normalized
    with a softmax, and the token's output is the weighted sum of the selected
    experts' outputs.
    """

    def __init__(self, experts: List[nn.Module], gate: nn.Module, moe_args: "MoeArgs"):
        """Store the experts, the gate, and the routing configuration.

        Args:
            experts: non-empty list of expert modules; expert ``i`` handles
                the tokens whose selected indices contain ``i``.
            gate: module producing one logit per expert for each token.
            moe_args: configuration; only ``num_experts_per_tok`` is read here.

        Raises:
            AssertionError: if ``experts`` is empty.
        """
        super().__init__()
        assert len(experts) > 0
        self.experts = nn.ModuleList(experts)
        self.gate = gate
        self.args = moe_args

    def forward(self, inputs: torch.Tensor):
        """Route each token through its top-k experts and mix the results.

        Args:
            inputs: tensor whose last dimension is the feature dimension. Any
                number of leading dimensions is accepted; they are flattened
                into a single token axis for routing and restored on return.

        Returns:
            Tensor with the same shape and dtype as ``inputs``.
        """
        # Routing indexes tokens along one axis, so collapse all leading
        # dimensions (e.g. batch and sequence) into it. 2-D inputs are unchanged.
        flat_inputs = inputs.reshape(-1, inputs.shape[-1])
        gate_logits = self.gate(flat_inputs)
        weights, selected_experts = torch.topk(gate_logits, self.args.num_experts_per_tok)
        # Normalize over the selected experts only; the softmax is computed in
        # float32 and cast back to the input dtype.
        weights = F.softmax(weights, dim=-1, dtype=torch.float).to(inputs.dtype)
        results = torch.zeros_like(flat_inputs)
        for i, expert in enumerate(self.experts):
            # token_idx: tokens routed to expert i; nth_expert: which of the
            # token's top-k slots picked it (used to look up its weight).
            token_idx, nth_expert = torch.where(selected_experts == i)
            results[token_idx] += weights[token_idx, nth_expert, None] * expert(
                flat_inputs[token_idx]
            )
        return results.reshape(inputs.shape)

# # Some dummy input tensors for inputs
# # Assume the batch size is 2, the sequence length is 4, and the hidden size is 8
# inputs = torch.randn(2, 4, 8)

# # Some values for num_experts and num_experts_per_tok
# # Assume the num_experts is 4 and the num_experts_per_tok is 2
# num_experts = 4
# num_experts_per_tok = 2

# # Define the experts and the gate modules
# # Assume the experts are simple linear layers and the gate is a linear layer followed by a softmax
# experts = [nn.Linear(8, 8) for _ in range(num_experts)]
# gate = nn.Sequential(nn.Linear(8, num_experts), nn.Softmax(dim=-1))

# # Create an instance of the MoE layer
# moe_layer = MoeLayer(experts, gate, MoeArgs(num_experts, num_experts_per_tok))

# # The original code
# outputs1 = moe_layer(inputs)

# # The modified code
# outputs2 = moe_layer(inputs)

# # Compare the outputs
# print(torch.equal(outputs1, outputs2)) # This should print True


The projections in the architecture are part of the Mistral 7B model, which is a large language model that uses grouped-query attention and sliding window attention to handle long sequences efficiently. The projections are linear transformations that map the input features to different dimensions for different purposes. Here is a brief explanation of each projection:

- `(q_proj)`: This projection maps the input features to the query features, which are compared against the key features to compute the attention scores. The output dimension equals the input dimension (4096) and is split across 32 query heads of size 128.
- `(k_proj)`: This projection maps the input features to the key features, which are compared against the query features to compute the attention scores. The output dimension (1024) is smaller than the input dimension (4096) because grouped-query attention uses only 8 key/value heads of size 128, each shared by 4 query heads; this reduces the size of the key/value projections and of the key/value cache.
- `(v_proj)`: This projection maps the input features to the value features, which are combined using the attention weights to produce the attention output. The output dimension (1024) is the same as the key dimension because the values use the same 8 key/value heads as the keys.
- `(o_proj)`: This projection maps the concatenated per-head attention outputs (4096) to the output features, which are the final result of the attention layer. The output dimension is the same as the input dimension (4096) so that the result can be added to the residual stream and matches the next layer's input size.
- `(rotary_emb)`: This is not a learned projection: it rotates pairs of query and key features by angles that depend on the token position, before the attention scores are computed. This lets the attention scores capture the relative positions of the tokens without adding learned positional embeddings to the input.
- `(mlp)`: This is a gated multilayer perceptron that consists of three linear projections and a SiLU activation function. It is used to apply a non-linear transformation to the output features of the attention layer. The projections are:
    - `(gate_proj)`: This projection maps the features to a larger dimension (14336); the SiLU function (not a plain sigmoid) is applied to its output to create a gate vector.
    - `(up_proj)`: This projection maps the same features to the same larger dimension (14336) to create an activation vector. It is purely linear: no activation function is applied to it.
    - `(down_proj)`: This projection maps the element-wise product of the gate vector and the activation vector back to the original dimension (4096). The residual connection is added afterwards by the surrounding decoder layer, not by this projection.

The mathematical equations for the projections are:

$$
\begin{aligned}
Q &= \text{rotary\_emb}(q\_proj(X)) \\
K &= \text{rotary\_emb}(k\_proj(X)) \\
V &= v\_proj(X) \\
A &= \text{softmax}(\frac{QK^T}{\sqrt{d_k}}) \\
O &= o\_proj(AV) \\
R &= X + O \\
G &= \text{SiLU}(gate\_proj(R)) \\
U &= up\_proj(R) \\
D &= down\_proj(G \odot U) \\
Y &= D + R
\end{aligned}
$$

where $X$ is the input features, $Q$ is the (rotated) query features, $K$ is the (rotated) key features, $V$ is the value features, $A$ is the attention matrix, $O$ is the output features of the attention layer, $R$ is the residual features after the attention layer, $G$ is the gate vector, $U$ is the up-projected (activation) vector, $D$ is the down-projected features, $Y$ is the final output features, $d_k$ is the per-head key dimension (128), and $\odot$ is the element-wise product. For brevity, the equations are written for a single attention head and omit the causal/sliding-window attention mask and the normalization layers applied before the attention and MLP blocks.