From d2fbcb23a2592e01f3c3ac3f1f4b5ffa92067e51 Mon Sep 17 00:00:00 2001 From: Philpax Date: Mon, 26 Jun 2023 00:30:44 +0200 Subject: [PATCH 01/27] docs: gguf spec first pass --- docs/gguf.md | 372 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 docs/gguf.md diff --git a/docs/gguf.md b/docs/gguf.md new file mode 100644 index 000000000..6e0590f93 --- /dev/null +++ b/docs/gguf.md @@ -0,0 +1,372 @@ +# GGUF + +GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML. + +It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new features can be added to GGML without breaking compatibility with older models. + +For more information about the motivation behind GGUF, see [Current State of Affairs](#current-state-of-affairs). + +## Specification + +GGUF is a format based on the existing GGJT, but makes a few changes to the format to make it more extensible and easier to use. The following features are desired: + +- Single-file deployment: they can be easily distributed and loaded, and do not require any external files for additional information. +- Extensible: new features can be added to GGML without breaking compatibility with existing models. +- `mmap` compatibility: models can be loaded using `mmap` for fast loading and saving. +- Easy to use: models can be easily loaded and saved using a small amount of code, with no need for external libraries, regardless of the language used. +- Full information: all information needed to load a model is contained in the model file, and no additional information needs to be provided by the user. + +The key difference between GGJT and GGUF is the use of a key-value structure for the hyperparameters (now referred to as metadata), rather than a list of untyped values. This allows for new metadata to be added without breaking compatibility with existing models, and to annotate the model with additional information that may be useful for inference or for identifying the model. + +### File Structure + +GGUF files are structured as follows. They assume the use of a global `ALIGNMENT` constant, which is the alignment of the model data. This is currently 64 bytes, but may change in the future. [^1] To achieve this, where relevant, the file is padded with `0x00` bytes to the next multiple of `ALIGNMENT`. + +[^1]: This may be moved to a per-model key-value pair in the future. + +```c +enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 (5) support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + // k-quantizations + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, +}; + +enum gguf_metadata_value_type: uint32_t { + /// The value is a 8-bit unsigned integer. + GGUF_METADATA_VALUE_TYPE_UINT8 = 0, + /// The value is a 8-bit signed integer. 
+ GGUF_METADATA_VALUE_TYPE_INT8 = 1, + /// The value is a 16-bit unsigned little-endian integer. + GGUF_METADATA_VALUE_TYPE_UINT16 = 2, + /// The value is a 16-bit signed little-endian integer. + GGUF_METADATA_VALUE_TYPE_INT16 = 3, + /// The value is a 32-bit unsigned little-endian integer. + GGUF_METADATA_VALUE_TYPE_UINT32 = 4, + /// The value is a 32-bit signed little-endian integer. + GGUF_METADATA_VALUE_TYPE_INT32 = 5, + /// The value is a 32-bit IEEE754 floating point number. + GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, + /// The value is a boolean. + /// 1-byte value where 0 is false and 1 is true. + /// Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy. + GGUF_METADATA_VALUE_TYPE_BOOL = 7, + /// The value is a UTF-8 non-null-terminated string, with length prepended. + GGUF_METADATA_VALUE_TYPE_STRING = 8, + /// The value is an array of other values, with the length and type prepended. + GGUF_METADATA_VALUE_TYPE_ARRAY = 9, +} + +/// A string in GGUF. +struct gguf_string_t { + /// The length of the string, in bytes. + uint32_t len; + /// The string as a UTF-8 non-null-terminated string. + char string[len]; +} + +union gguf_metadata_value_t { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + bool bool_; + gguf_string_t string; + struct { + uint32_t len; + gguf_metadata_value_type type; + gguf_metadata_value_t array[len]; + } array; +}; + +struct gguf_metadata_kv_t { + /// A standard GGUF string, with the following caveats: + /// - It must be a valid ASCII string. + /// - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`. + /// - It must be at most 2^16-1 bytes long. + /// Any keys that do not follow these rules are invalid. + gguf_string_t key; + + /// The length of the value, in bytes + uint32_t value_len; + /// The type of the value. + /// Must be one of the `gguf_metadata_value_type` values. + gguf_metadata_value_type value_type; + /// The value. + gguf_metadata_value_t value; +}; + +struct gguf_header_t { + // Magic number to announce that this is a GGUF file. + // Must be `'GGUF'`/`0x47475546`. + uint32_t magic; + // The version of the format implemented. + // Must be `1` for version described in this spec. + // + // This version should only be increased for structural changes to the format. + // Changes that do not affect the structure of the file should instead update the metadata + // to signify the change. + uint32_t version; + // The number of tensors in the file. + // This is explicit, instead of being included in the metadata, to ensure it is always present + // for loading the tensors. + uint32_t tensor_count; + // The number of metadata key-value pairs. + uint32_t metadata_kv_count; + // The metadata key-value pairs. + gguf_metadata_kv_t metadata_kv[metadata_kv_count]; +}; + +struct gguf_tensor_info_t { + /// The name of the tensor. + gguf_string_t name; + /// The number of dimensions in the tensor. + /// Currently at most two, but this may change in the future. + uint32_t n_dimensions; + /// The dimensions of the tensor. + uint32_t dimensions[n_dimensions]; + /// The number of elements in the tensor. + uint32_t n_elements; + /// The type of the tensor. + ggml_type type; + /// The offset of the tensor's data in this file in bytes. + /// Must be a multiple of `ALIGNMENT`. + uint64_t offset; +}; + +struct gguf_file_t { + // The header of the file. 
+ gguf_header_t header; + + // Padding to the nearest multiple of `ALIGNMENT`. + uint8_t _padding[ALIGNMENT - (sizeof(header) % ALIGNMENT)]; + + // Tensor infos, which can be used to locate the tensor data. + gguf_tensor_info_t tensor_infos[header.tensor_count]; + + // Tensor data. + // + // This is arbitrary binary data corresponding to the weights of the model. This data should be close + // or identical to the data in the original model file, but may be different due to quantization or + // other optimizations for inference. Any such deviations should be recorded in the metadata or as + // part of the architecture definition. + // + // Each tensor's data must be stored within this array, and located through its `tensor_infos` entry. + // The offset of each tensor's data must be a multiple of `ALIGNMENT`, and the space between tensors + // should be padded to `ALIGNMENT` bytes. + uint8_t tensor_data[]; +}; +``` + +## Standardized key-value pairs + +The following key-value pairs are standardized. This list may grow in the future as more use cases are discovered. Where possible, names are shared with the original model definitions to make it easier to map between the two. + +Not all of these are required, but they are all recommended. Keys that are required are bolded. For omitted pairs, the reader should assume that the value is unknown and either default or error as appropriate. + +### General + +- **`general.architecture: string`**: describes what architecture this model implements. All lowercase ASCII, with only `[a-z0-9]+` characters allowed. Known values include: + - `llama` + - `mpt` + - `gptneox` + - `gptj` + - `gpt2` + - `bloom` + - `falcon` + - `rwkv` +- **`general.quantization_version: u32`**: version of quantization scheme +- `general.file_type: string`: type of the majority of the tensors in the file. This shouldn't have any semantic meaning and should be purely informational, hence the use of `string`. +- `general.license: string`: SPDX license of the model +- `general.description: string`: information about the model, including provenance +- `general.url.original_source: string`: path to the original model that this GGML file was created from + +### LLM + +In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. + +- `[llm].context_length: u32`: size of the maximum supported context +- `[llm].hidden_size: u32`: embedding layer size +- `[llm].num_layers: u32`: number of layers +- `[llm].num_rotary: u32`: `int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))` +- `[llm].use_parallel_residual: bool`: whether or not the parallel residual logic should be used +- `[llm].max_seq_len: u32`: Maximum sequence length +- `[llm].attention.num_heads: u32`: number of attention heads +- `[llm].attention.alibi_bias_max: f32`: The maximum bias to use for ALiBI +- `[llm].attention.clip_kqv: f32`: **TODO**: what is this? +- `[llm].num_mult: u32`: **TODO**: what is this? +- `[llm].rot: u32`: **TODO**: what is this? +- `[llm].num_rot: u32`: **TODO**: what is this? + +#### Models + +The following sections describe the metadata for each model architecture. Each key specified _must_ be present. 
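+
+As an illustration, a reader might check that the declared architecture's required keys are all present before attempting to load the model. The sketch below assumes the metadata has already been parsed into a flat dictionary; the `REQUIRED_KEYS` table and function name are illustrative, not part of the specification.
+
+```python
+# Hypothetical sketch: validate required per-architecture metadata keys.
+# The key names for `llama` are taken from the LLaMA section below.
+REQUIRED_KEYS = {
+    "llama": [
+        "llama.context_length",
+        "llama.hidden_size",
+        "llama.num_layers",
+        "llama.num_mult",
+        "llama.rot",
+        "llama.attention.num_heads",
+    ],
+}
+
+def validate_metadata(metadata: dict) -> None:
+    arch = metadata["general.architecture"]
+    missing = [key for key in REQUIRED_KEYS.get(arch, []) if key not in metadata]
+    if missing:
+        raise ValueError(f"missing required keys for {arch}: {missing}")
+```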
+ +##### LLaMA + +- `llama.context_length` +- `llama.hidden_size` +- `llama.num_layers` +- `llama.num_mult` +- `llama.rot` +- `llama.attention.num_heads` + +##### MPT + +- `mpt.max_seq_len` +- `mpt.hidden_size` +- `mpt.num_layers` +- `mpt.attention.num_heads` +- `mpt.attention.alibi_bias_max` +- `mpt.attention.clip_kqv` + +##### GPT-NeoX + +- `gptneox.context_length` +- `gptneox.hidden_size` +- `gptneox.num_layers` +- `gptneox.num_rot` +- `gptneox.use_parallel_residual` +- `gptneox.attention.num_heads` + +##### GPT-J + +- `gptj.context_length` +- `gptj.hidden_size` +- `gptj.num_layers` +- `gptj.num_rot` +- `gptj.attention.num_heads` + +##### GPT-2 + +- `gpt2.context_length` +- `gpt2.hidden_size` +- `gpt2.num_layers` +- `gpt2.attention.num_heads` + +##### BLOOM + +- `bloom.context_length` +- `bloom.hidden_size` +- `bloom.num_layers` +- `bloom.num_mult` +- `bloom.attention.num_heads` + +##### Falcon + +**TODO**. + +##### RWKV + +**TODO**. + +#### Prompting + +**TODO**: Include prompt format, and/or metadata about how it should be used (instruction, conversation, autocomplete, etc). + +### Tokenizer + +The following keys are used to describe the tokenizer of the model. It is recommended that model authors support as many of these as possible, as it will allow for better tokenization quality with supported executors. + +#### Embedded + +GGML supports an embedded vocabulary that may be lossily compressed from a more complete tokenizer. This should enable inferencing of the model, but it may not fully capture the nuances of tokenization. When a more accurate tokenizer is available and supported, it should be used instead. + +**TODO**: Add more details about how this works, and what kind of tokenizer it's expecting. Should this be called something more specific instead? + +- `tokenizer.embedded.tokens: array[string]`: A list of tokens. +- `tokenizer.embedded.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. + +#### Hugging Face + +Hugging Face maintains their own `tokenizers` library that supports a wide variety of tokenizers. If your executor uses this library, it may be able to use the model's tokenizer directly. + +- `tokenizer.huggingface.json: string`: the entirety of the HF `tokenizer.json` for a given model (e.g. ). Included for compatibility with executors that support HF tokenizers directly. + +#### Other + +Other tokenizers may be used, but are not necessarily standardized. They may be executor-specific. They will be documented here as they are discovered/further developed. + +- `tokenizer.rwkv.world: string`: a RWKV World tokenizer, like [this](https://github.com/BlinkDL/ChatRWKV/blob/main/tokenizer/rwkv_vocab_v20230424.txt). This text file should be included verbatim. + +### Computation graph + +This is a future extension and still needs to be discussed, and may necessitate a new GGUF version. At the time of writing, the primary blocker is the stabilization of the computation graph format. + +A sample computation graph of GGML nodes could be included in the model itself, allowing an executor to run the model without providing its own implementation of the architecture. This would allow for a more consistent experience across executors, and would allow for more complex architectures to be supported without requiring the executor to implement them. + +## Migration + +All existing Python conversion scripts will be consolidated to use one `gguf` library. 
They will take models from Hugging Face or elsewhere and produce compliant GGUF files with all of the recommended metadata. + +Existing models do not have enough information to be directly converted to GGUF. Instead, a migration tool may be built that takes an existing GGML/GGMF/GGJT file and prompts the user for the missing information. This tool will be executor-agnostic, and will be able to produce a GGUF file that can be used by any executor. This tool may hardcode settings for models with known hashes to ease the migration process, such that a user can run `./migrate nous-hermes-13b.ggmlv3.q5_1.bin` and obtain a `nous-hermes-13b.ggmlv3.q5_1.gguf` file that is ready to use and consistent with uploaded models. + +--- + +## Current State of Affairs + +The following information is provided for context, but is not necessary to understand the rest of this document. + +### Overview + +At present, there are three GGML file formats floating around for LLMs: + +- **GGML** (unversioned): baseline format, with no versioning or alignment. +- **GGMF** (versioned): the same as GGML, but with versioning. Only one version exists. +- **GGJT**: Aligns the tensors to allow for use with `mmap`, which requires alignment. v1, v2 and v3 are identical, but the latter versions use a different quantization scheme that is incompatible with previous versions. + +GGML is primarily used by the examples in `ggml`, while GGJT is used by `llama.cpp` models. Other executors may use any of the three formats, but this is not 'officially' supported. + +These formats share the same fundamental structure: + +- a magic number with an optional version number +- model-specific hyperparameters, including + - metadata about the model, such as the number of layers, the number of heads, etc. + - a `ftype` that describes the type of the majority of the tensors, + - for GGML files, the quantization version is encoded in the `ftype` divided by 1000 +- an embedded vocabulary, which is a list of strings with length prepended. The GGMF/GGJT formats embed a f32 score next to the strings. +- finally, a list of tensors with their length-prepended name, type, and (aligned, in the case of GGJT) tensor data + +Notably, this structure does not identify what model architecture the model belongs to, nor does it offer any flexibility for changing the structure of the hyperparameters. This means that the only way to add new hyperparameters is to add them to the end of the list, which is a breaking change for existing models. + +### Drawbacks + +Unfortunately, over the last few months, there are a few issues that have become apparent with the existing models: + +- There's no way to identify which model architecture a given model is for, because that information isn't present + - Similarly, existing programs cannot intelligently fail upon encountering new architectures +- Adding or removing any new hyperparameters is a breaking change, which is impossible for a reader to detect without using heuristics +- Each model architecture requires its own conversion script to their architecture's variant of GGML +- Maintaining backwards compatibility without breaking the structure of the format requires clever tricks, like packing the quantization version into the ftype, which are not guaranteed to be picked up by readers/writers, and are not consistent between the two formats + +### Why not other formats? 
+ +There are a few other formats that could be used, but issues include: + +- requiring additional dependencies to load or save the model, which is complicated in a C environment +- limited or no support for 4-bit quantization +- existing cultural expectations (e.g. whether or not the model is a directory or a file) +- lack of support for embedded vocabularies +- lack of control over direction of future development + +Ultimately, it is likely that GGUF will remain necessary for the foreseeable future, and it is better to have a single format that is well-documented and supported by all executors than to contort an existing format to fit the needs of GGML. From 23eda2e5c491714daa60d015fb3de8d8f446b88a Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 27 Jun 2023 01:09:39 +0200 Subject: [PATCH 02/27] docs(gguf): update with review comments --- docs/gguf.md | 108 ++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 49 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 6e0590f93..9747a30fb 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -22,6 +22,8 @@ The key difference between GGJT and GGUF is the use of a key-value structure for GGUF files are structured as follows. They assume the use of a global `ALIGNMENT` constant, which is the alignment of the model data. This is currently 64 bytes, but may change in the future. [^1] To achieve this, where relevant, the file is padded with `0x00` bytes to the next multiple of `ALIGNMENT`. +Fields, including arrays, are written sequentially without alignment unless otherwise specified. + [^1]: This may be moved to a per-model key-value pair in the future. ```c @@ -50,35 +52,37 @@ enum ggml_type { }; enum gguf_metadata_value_type: uint32_t { - /// The value is a 8-bit unsigned integer. + // The value is a 8-bit unsigned integer. GGUF_METADATA_VALUE_TYPE_UINT8 = 0, - /// The value is a 8-bit signed integer. + // The value is a 8-bit signed integer. GGUF_METADATA_VALUE_TYPE_INT8 = 1, - /// The value is a 16-bit unsigned little-endian integer. + // The value is a 16-bit unsigned little-endian integer. GGUF_METADATA_VALUE_TYPE_UINT16 = 2, - /// The value is a 16-bit signed little-endian integer. + // The value is a 16-bit signed little-endian integer. GGUF_METADATA_VALUE_TYPE_INT16 = 3, - /// The value is a 32-bit unsigned little-endian integer. + // The value is a 32-bit unsigned little-endian integer. GGUF_METADATA_VALUE_TYPE_UINT32 = 4, - /// The value is a 32-bit signed little-endian integer. + // The value is a 32-bit signed little-endian integer. GGUF_METADATA_VALUE_TYPE_INT32 = 5, - /// The value is a 32-bit IEEE754 floating point number. + // The value is a 32-bit IEEE754 floating point number. GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, - /// The value is a boolean. - /// 1-byte value where 0 is false and 1 is true. - /// Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy. + // The value is a boolean. + // 1-byte value where 0 is false and 1 is true. + // Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy. GGUF_METADATA_VALUE_TYPE_BOOL = 7, - /// The value is a UTF-8 non-null-terminated string, with length prepended. + // The value is a UTF-8 non-null-terminated string, with length prepended. GGUF_METADATA_VALUE_TYPE_STRING = 8, - /// The value is an array of other values, with the length and type prepended. + // The value is an array of other values, with the length and type prepended. 
+ /// + // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes. GGUF_METADATA_VALUE_TYPE_ARRAY = 9, } -/// A string in GGUF. +// A string in GGUF. struct gguf_string_t { - /// The length of the string, in bytes. + // The length of the string, in bytes. uint32_t len; - /// The string as a UTF-8 non-null-terminated string. + // The string as a UTF-8 non-null-terminated string. char string[len]; } @@ -93,26 +97,29 @@ union gguf_metadata_value_t { bool bool_; gguf_string_t string; struct { + // Number of elements, not bytes uint32_t len; + // Any value type is valid, including arrays. gguf_metadata_value_type type; + // The array of values. gguf_metadata_value_t array[len]; } array; }; struct gguf_metadata_kv_t { - /// A standard GGUF string, with the following caveats: - /// - It must be a valid ASCII string. - /// - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`. - /// - It must be at most 2^16-1 bytes long. - /// Any keys that do not follow these rules are invalid. + // A standard GGUF string, with the following caveats: + // - It must be a valid ASCII string. + // - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`. + // - It must be at most 2^16-1 bytes long. + // Any keys that do not follow these rules are invalid. gguf_string_t key; - /// The length of the value, in bytes + // The length of the value, in bytes uint32_t value_len; - /// The type of the value. - /// Must be one of the `gguf_metadata_value_type` values. + // The type of the value. + // Must be one of the `gguf_metadata_value_type` values. gguf_metadata_value_type value_type; - /// The value. + // The value. gguf_metadata_value_t value; }; @@ -138,19 +145,19 @@ struct gguf_header_t { }; struct gguf_tensor_info_t { - /// The name of the tensor. + // The name of the tensor. gguf_string_t name; - /// The number of dimensions in the tensor. - /// Currently at most two, but this may change in the future. + // The number of dimensions in the tensor. + // Currently at most 4, but this may change in the future. uint32_t n_dimensions; - /// The dimensions of the tensor. + // The dimensions of the tensor. uint32_t dimensions[n_dimensions]; - /// The number of elements in the tensor. + // The number of elements in the tensor. uint32_t n_elements; - /// The type of the tensor. + // The type of the tensor. ggml_type type; - /// The offset of the tensor's data in this file in bytes. - /// Must be a multiple of `ALIGNMENT`. + // The offset of the tensor's data in this file in bytes. + // Must be a multiple of `ALIGNMENT`. uint64_t offset; }; @@ -184,6 +191,10 @@ The following key-value pairs are standardized. This list may grow in the future Not all of these are required, but they are all recommended. Keys that are required are bolded. For omitted pairs, the reader should assume that the value is unknown and either default or error as appropriate. +The community can develop their own key-value pairs to carry additional data. However, these should be namespaced with the relevant community name to avoid collisions. For example, the `rustformers` community might use `rustformers.` as a prefix for all of their keys. + +If a particular community key is widely used, it may be promoted to a standardized key. + ### General - **`general.architecture: string`**: describes what architecture this model implements. All lowercase ASCII, with only `[a-z0-9]+` characters allowed. 
Known values include: @@ -198,8 +209,9 @@ Not all of these are required, but they are all recommended. Keys that are requi - **`general.quantization_version: u32`**: version of quantization scheme - `general.file_type: string`: type of the majority of the tensors in the file. This shouldn't have any semantic meaning and should be purely informational, hence the use of `string`. - `general.license: string`: SPDX license of the model -- `general.description: string`: information about the model, including provenance -- `general.url.original_source: string`: path to the original model that this GGML file was created from +- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields +- `general.source.url: string`: URL to the source of the model. Can be a GitHub repo, a paper, etc. +- `general.source.huggingface.repository: string`: Hugging Face model repository that this model is either hosted on or based on ### LLM @@ -208,15 +220,13 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch - `[llm].context_length: u32`: size of the maximum supported context - `[llm].hidden_size: u32`: embedding layer size - `[llm].num_layers: u32`: number of layers -- `[llm].num_rotary: u32`: `int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))` +- `[llm].num_ff: u32`: The length of the feedforward layer. - `[llm].use_parallel_residual: bool`: whether or not the parallel residual logic should be used - `[llm].max_seq_len: u32`: Maximum sequence length - `[llm].attention.num_heads: u32`: number of attention heads - `[llm].attention.alibi_bias_max: f32`: The maximum bias to use for ALiBI -- `[llm].attention.clip_kqv: f32`: **TODO**: what is this? -- `[llm].num_mult: u32`: **TODO**: what is this? -- `[llm].rot: u32`: **TODO**: what is this? -- `[llm].num_rot: u32`: **TODO**: what is this? +- `[llm].attention.clip_kqv: f32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). +- `[llm].rope.num_dims: u32`: The number of rotary dimensions for RoPE. #### Models @@ -227,8 +237,8 @@ The following sections describe the metadata for each model architecture. Each k - `llama.context_length` - `llama.hidden_size` - `llama.num_layers` -- `llama.num_mult` -- `llama.rot` +- `llama.num_ff` +- `llama.rope.num_dims` - `llama.attention.num_heads` ##### MPT @@ -245,8 +255,8 @@ The following sections describe the metadata for each model architecture. Each k - `gptneox.context_length` - `gptneox.hidden_size` - `gptneox.num_layers` -- `gptneox.num_rot` - `gptneox.use_parallel_residual` +- `gptneox.rope.num_dims` - `gptneox.attention.num_heads` ##### GPT-J @@ -254,7 +264,7 @@ The following sections describe the metadata for each model architecture. Each k - `gptj.context_length` - `gptj.hidden_size` - `gptj.num_layers` -- `gptj.num_rot` +- `gptj.rope.num_dims` - `gptj.attention.num_heads` ##### GPT-2 @@ -269,7 +279,7 @@ The following sections describe the metadata for each model architecture. Each k - `bloom.context_length` - `bloom.hidden_size` - `bloom.num_layers` -- `bloom.num_mult` +- `bloom.num_ff` - `bloom.attention.num_heads` ##### Falcon @@ -288,14 +298,14 @@ The following sections describe the metadata for each model architecture. Each k The following keys are used to describe the tokenizer of the model. It is recommended that model authors support as many of these as possible, as it will allow for better tokenization quality with supported executors. 
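+
+As a sketch of how an executor might choose between the options described below (assuming the metadata has already been parsed into a dictionary; the function and preference order are only an example, not mandated by this spec):
+
+```python
+# Hypothetical sketch: prefer a full Hugging Face tokenizer when the executor
+# supports it, and fall back to the embedded GGML vocabulary otherwise.
+def select_tokenizer(metadata: dict):
+    if "tokenizer.huggingface.json" in metadata:
+        return ("huggingface", metadata["tokenizer.huggingface.json"])
+    if "tokenizer.ggml.tokens" in metadata:
+        return (
+            "ggml",
+            metadata["tokenizer.ggml.tokens"],
+            metadata.get("tokenizer.ggml.scores"),
+        )
+    raise ValueError("no supported tokenizer metadata found")
+```
+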
-#### Embedded +#### GGML -GGML supports an embedded vocabulary that may be lossily compressed from a more complete tokenizer. This should enable inferencing of the model, but it may not fully capture the nuances of tokenization. When a more accurate tokenizer is available and supported, it should be used instead. +GGML supports an embedded vocabulary that may be lossily compressed from a more complete tokenizer. It is simplistic and specific to GGML. This should enable inferencing of the model, but it may not fully capture the nuances of tokenization. When a more accurate tokenizer is available and supported, it should be used instead. -**TODO**: Add more details about how this works, and what kind of tokenizer it's expecting. Should this be called something more specific instead? +It is not guaranteed to be standardized across models, and may change in the future. It is recommended that model authors use a more standardized tokenizer if possible. -- `tokenizer.embedded.tokens: array[string]`: A list of tokens. -- `tokenizer.embedded.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. +- `tokenizer.ggml.tokens: array[string]`: A list of tokens. +- `tokenizer.ggml.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. #### Hugging Face From b30329381b902b13f6ff6f20f75e807a33ee8241 Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 27 Jun 2023 23:57:14 +0200 Subject: [PATCH 03/27] docs(gguf): update with review comments --- docs/gguf.md | 100 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 9747a30fb..52b9e916c 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -197,6 +197,8 @@ If a particular community key is widely used, it may be promoted to a standardiz ### General +#### Required + - **`general.architecture: string`**: describes what architecture this model implements. All lowercase ASCII, with only `[a-z0-9]+` characters allowed. Known values include: - `llama` - `mpt` @@ -207,9 +209,20 @@ If a particular community key is widely used, it may be promoted to a standardiz - `falcon` - `rwkv` - **`general.quantization_version: u32`**: version of quantization scheme + +#### General metadata + +- `general.name`: The name of the model. This should be a human-readable name that can be used to identify the model. It should be unique within the community that the model is defined in. +- `general.author`: The author of the model. +- `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc. +- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields - `general.file_type: string`: type of the majority of the tensors in the file. This shouldn't have any semantic meaning and should be purely informational, hence the use of `string`. - `general.license: string`: SPDX license of the model -- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields + +#### Source metadata + +Information about where this model came from. This is useful for tracking the provenance of the model, and for finding the original source if the model is modified. 
For a model that was converted from GGML, for example, these keys would point to the model that was converted from. + - `general.source.url: string`: URL to the source of the model. Can be a GitHub repo, a paper, etc. - `general.source.huggingface.repository: string`: Hugging Face model repository that this model is either hosted on or based on @@ -217,16 +230,16 @@ If a particular community key is widely used, it may be promoted to a standardiz In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. -- `[llm].context_length: u32`: size of the maximum supported context +- `[llm].context_length: u32`: length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. - `[llm].hidden_size: u32`: embedding layer size -- `[llm].num_layers: u32`: number of layers -- `[llm].num_ff: u32`: The length of the feedforward layer. +- `[llm].n_layers: u32`: the number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].n_ff: u32`: the length of the feedforward layer - `[llm].use_parallel_residual: bool`: whether or not the parallel residual logic should be used -- `[llm].max_seq_len: u32`: Maximum sequence length -- `[llm].attention.num_heads: u32`: number of attention heads -- `[llm].attention.alibi_bias_max: f32`: The maximum bias to use for ALiBI -- `[llm].attention.clip_kqv: f32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). -- `[llm].rope.num_dims: u32`: The number of rotary dimensions for RoPE. +- `[llm].attention.n_heads: u32`: number of attention heads +- `[llm].attention.max_alibi_bias: f32`: The maximum bias to use for ALiBI +- `[llm].attention.clamp_kqv: f32`: value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`) +- `[llm].rope.n_dims: u32`: the number of rotary dimensions for RoPE +- `[llm].rope.scale: f32`: a scale factor for RoPE to adjust the context length #### Models @@ -236,17 +249,21 @@ The following sections describe the metadata for each model architecture. Each k - `llama.context_length` - `llama.hidden_size` -- `llama.num_layers` -- `llama.num_ff` -- `llama.rope.num_dims` -- `llama.attention.num_heads` +- `llama.n_layers` +- `llama.n_ff` +- `llama.rope.n_dims` +- `llama.attention.n_heads` + +###### Optional + +- `llama.rope.scale` ##### MPT -- `mpt.max_seq_len` +- `mpt.context_length` - `mpt.hidden_size` -- `mpt.num_layers` -- `mpt.attention.num_heads` +- `mpt.n_layers` +- `mpt.attention.n_heads` - `mpt.attention.alibi_bias_max` - `mpt.attention.clip_kqv` @@ -254,37 +271,50 @@ The following sections describe the metadata for each model architecture. 
Each k - `gptneox.context_length` - `gptneox.hidden_size` -- `gptneox.num_layers` +- `gptneox.n_layers` - `gptneox.use_parallel_residual` -- `gptneox.rope.num_dims` -- `gptneox.attention.num_heads` +- `gptneox.rope.n_dims` +- `gptneox.attention.n_heads` + +###### Optional + +- `gptneox.rope.scale` ##### GPT-J - `gptj.context_length` - `gptj.hidden_size` -- `gptj.num_layers` -- `gptj.rope.num_dims` -- `gptj.attention.num_heads` +- `gptj.n_layers` +- `gptj.rope.n_dims` +- `gptj.attention.n_heads` + +###### Optional + +- `gptj.rope.scale` ##### GPT-2 - `gpt2.context_length` - `gpt2.hidden_size` -- `gpt2.num_layers` -- `gpt2.attention.num_heads` +- `gpt2.n_layers` +- `gpt2.attention.n_heads` ##### BLOOM - `bloom.context_length` - `bloom.hidden_size` -- `bloom.num_layers` -- `bloom.num_ff` -- `bloom.attention.num_heads` +- `bloom.n_layers` +- `bloom.n_ff` +- `bloom.attention.n_heads` ##### Falcon -**TODO**. +- `falcon.context_length` +- `falcon.hidden_size` +- `falcon.n_layers` +- `falcon.attention.num_heads` +- `falcon.attention.num_heads_kv` +- `falcon.attention.use_norm` ##### RWKV @@ -294,6 +324,15 @@ The following sections describe the metadata for each model architecture. Each k **TODO**: Include prompt format, and/or metadata about how it should be used (instruction, conversation, autocomplete, etc). +### LoRA + +**TODO**: Figure out what metadata is needed for LoRA. Probably desired features: + +- match an existing model exactly, so that it can't be misapplied +- be marked as a LoRA so executors won't try to run it by itself + +Should this be an architecture, or should it share the details of the original model with additional fields to mark it as a LoRA? + ### Tokenizer The following keys are used to describe the tokenizer of the model. It is recommended that model authors support as many of these as possible, as it will allow for better tokenization quality with supported executors. @@ -306,6 +345,11 @@ It is not guaranteed to be standardized across models, and may change in the fut - `tokenizer.ggml.tokens: array[string]`: A list of tokens. - `tokenizer.ggml.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. +- `tokenizer.ggml.bos_token_id: u32`: Beginning of sequence marker +- `tokenizer.ggml.eos_token_id: u32`: End of sequence marker +- `tokenizer.ggml.unk_token_id: u32`: Unknown token +- `tokenizer.ggml.sep_token_id: u32`: Separator token +- `tokenizer.ggml.pad_token_id: u32`: Padding token #### Hugging Face From 2bcd348f88ce7d6db68f7c64a9dba45646fb3b1a Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 28 Jun 2023 13:40:20 +0200 Subject: [PATCH 04/27] docs(gguf): quant version optional for unquant --- docs/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gguf.md b/docs/gguf.md index 52b9e916c..b2183d52f 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -208,7 +208,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `bloom` - `falcon` - `rwkv` -- **`general.quantization_version: u32`**: version of quantization scheme +- **`general.quantization_version: u32`**: version of quantization scheme. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. 
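+
+For example, a converter might decide whether to emit this key based on whether any tensor in the file actually uses a quantized type (a minimal sketch; the type names and function shown are illustrative):
+
+```python
+# Hypothetical sketch: only write general.quantization_version when at least
+# one tensor is stored in a quantized format.
+UNQUANTIZED_TYPES = {"F32", "F16"}  # illustrative names for unquantized ggml_type values
+
+def maybe_set_quantization_version(metadata: dict, tensor_types: list, version: int) -> None:
+    if any(t not in UNQUANTIZED_TYPES for t in tensor_types):
+        metadata["general.quantization_version"] = version
+```
+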
#### General metadata From 576e3069e71243ced9282b581ddb0841d34e1c3a Mon Sep 17 00:00:00 2001 From: Philpax Date: Mon, 10 Jul 2023 00:13:15 +0200 Subject: [PATCH 05/27] docs(gguf): normalize naming, add whisper --- docs/gguf.md | 97 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index b2183d52f..221bc9353 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -230,16 +230,16 @@ Information about where this model came from. This is useful for tracking the pr In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. -- `[llm].context_length: u32`: length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. -- `[llm].hidden_size: u32`: embedding layer size -- `[llm].n_layers: u32`: the number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. -- `[llm].n_ff: u32`: the length of the feedforward layer -- `[llm].use_parallel_residual: bool`: whether or not the parallel residual logic should be used -- `[llm].attention.n_heads: u32`: number of attention heads -- `[llm].attention.max_alibi_bias: f32`: The maximum bias to use for ALiBI -- `[llm].attention.clamp_kqv: f32`: value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`) -- `[llm].rope.n_dims: u32`: the number of rotary dimensions for RoPE -- `[llm].rope.scale: f32`: a scale factor for RoPE to adjust the context length +- `[llm].context_length: u32`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. +- `[llm].embedding_length: u32`: Also known as `n_embd`. Embedding layer size. +- `[llm].layer_count: u32`: Also known as `n_layers`. The number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].feedforward_length: u32`: Also known as `n_ff`. The length of the feedforward layer. +- `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. +- `[llm].attention.head_count: u32`: Also known as `n_head`. Number of attention heads. +- `[llm].attention.max_alibi_bias: f32`: The maximum bias to use for ALiBI. +- `[llm].attention.clamp_kqv: f32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). +- `[llm].rope.dimension_count: u32`: The number of rotary dimensions for RoPE. +- `[llm].rope.scale: f32`: A scale factor for RoPE to adjust the context length. #### Models @@ -248,11 +248,11 @@ The following sections describe the metadata for each model architecture. Each k ##### LLaMA - `llama.context_length` -- `llama.hidden_size` -- `llama.n_layers` -- `llama.n_ff` -- `llama.rope.n_dims` -- `llama.attention.n_heads` +- `llama.embedding_length` +- `llama.layer_count` +- `llama.feedforward_length` +- `llama.rope.dimension_count` +- `llama.attention.head_count` ###### Optional @@ -261,20 +261,20 @@ The following sections describe the metadata for each model architecture. 
Each k ##### MPT - `mpt.context_length` -- `mpt.hidden_size` -- `mpt.n_layers` -- `mpt.attention.n_heads` +- `mpt.embedding_length` +- `mpt.layer_count` +- `mpt.attention.head_count` - `mpt.attention.alibi_bias_max` - `mpt.attention.clip_kqv` ##### GPT-NeoX - `gptneox.context_length` -- `gptneox.hidden_size` -- `gptneox.n_layers` +- `gptneox.embedding_length` +- `gptneox.layer_count` - `gptneox.use_parallel_residual` -- `gptneox.rope.n_dims` -- `gptneox.attention.n_heads` +- `gptneox.rope.dimension_count` +- `gptneox.attention.head_count` ###### Optional @@ -283,10 +283,10 @@ The following sections describe the metadata for each model architecture. Each k ##### GPT-J - `gptj.context_length` -- `gptj.hidden_size` -- `gptj.n_layers` -- `gptj.rope.n_dims` -- `gptj.attention.n_heads` +- `gptj.embedding_length` +- `gptj.layer_count` +- `gptj.rope.dimension_count` +- `gptj.attention.head_count` ###### Optional @@ -295,31 +295,48 @@ The following sections describe the metadata for each model architecture. Each k ##### GPT-2 - `gpt2.context_length` -- `gpt2.hidden_size` -- `gpt2.n_layers` -- `gpt2.attention.n_heads` +- `gpt2.embedding_length` +- `gpt2.layer_count` +- `gpt2.attention.head_count` ##### BLOOM - `bloom.context_length` -- `bloom.hidden_size` -- `bloom.n_layers` -- `bloom.n_ff` -- `bloom.attention.n_heads` +- `bloom.embedding_length` +- `bloom.layer_count` +- `bloom.feedforward_length` +- `bloom.attention.head_count` ##### Falcon - `falcon.context_length` -- `falcon.hidden_size` -- `falcon.n_layers` -- `falcon.attention.num_heads` -- `falcon.attention.num_heads_kv` +- `falcon.embedding_length` +- `falcon.layer_count` +- `falcon.attention.head_count` +- `falcon.attention.head_count_kv` - `falcon.attention.use_norm` ##### RWKV **TODO**. +##### Whisper + +Keys that do not have types defined should be assumed to share definitions with `llm.` keys. +(For example, `whisper.context_length` is equivalent to `llm.context_length`.) +This is because they are both transformer models. + +- `whisper.encoder.context_length` +- `whisper.encoder.embedding_length` +- `whisper.encoder.layer_count` +- `whisper.encoder.mels_count: u32` +- `whisper.encoder.attention.head_count` + +- `whisper.decoder.context_length` +- `whisper.decoder.embedding_length` +- `whisper.decoder.layer_count` +- `whisper.decoder.attention.head_count` + #### Prompting **TODO**: Include prompt format, and/or metadata about how it should be used (instruction, conversation, autocomplete, etc). @@ -347,9 +364,9 @@ It is not guaranteed to be standardized across models, and may change in the fut - `tokenizer.ggml.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. 
- `tokenizer.ggml.bos_token_id: u32`: Beginning of sequence marker - `tokenizer.ggml.eos_token_id: u32`: End of sequence marker -- `tokenizer.ggml.unk_token_id: u32`: Unknown token -- `tokenizer.ggml.sep_token_id: u32`: Separator token -- `tokenizer.ggml.pad_token_id: u32`: Padding token +- `tokenizer.ggml.unknown_token_id: u32`: Unknown token +- `tokenizer.ggml.separator_token_id: u32`: Separator token +- `tokenizer.ggml.padding_token_id: u32`: Padding token #### Hugging Face From 24260bf1c723644ffeeee0c309b686d46cc6e21f Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 23 Jul 2023 17:13:36 +0200 Subject: [PATCH 06/27] docs(gguf): more review updates --- docs/gguf.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 221bc9353..30b56e771 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -235,9 +235,19 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch - `[llm].layer_count: u32`: Also known as `n_layers`. The number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. - `[llm].feedforward_length: u32`: Also known as `n_ff`. The length of the feedforward layer. - `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. +- `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`. + - `reference`: tensors are laid out in the same order as the original model + - further options can be found for each architecture in their respective sections + +#### Attention + - `[llm].attention.head_count: u32`: Also known as `n_head`. Number of attention heads. +- `[llm].attention.head_count_kv: u32`: The number of heads per group used in Grouped-Query-Attention. If not present, the model does not use GQA. - `[llm].attention.max_alibi_bias: f32`: The maximum bias to use for ALiBI. - `[llm].attention.clamp_kqv: f32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). + +#### RoPE + - `[llm].rope.dimension_count: u32`: The number of rotary dimensions for RoPE. - `[llm].rope.scale: f32`: A scale factor for RoPE to adjust the context length. @@ -257,6 +267,15 @@ The following sections describe the metadata for each model architecture. Each k ###### Optional - `llama.rope.scale` +- `llama.attention.head_count_kv` +- `llama.tensor_data_layout`: + - `llama.cpp`: + ```python + def permute(weights: NDArray, n_head: int) -> NDArray: + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + ``` ##### MPT @@ -316,9 +335,41 @@ The following sections describe the metadata for each model architecture. Each k - `falcon.attention.head_count_kv` - `falcon.attention.use_norm` +###### Optional + +- `falcon.tensor_data_layout`: + + - `llama.cpp` (this name may be inaccurate depending on where the Falcon implementation ends up): + + ```python + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. 
+ # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + + if "query_key_value" in src: + qkv = model[src].view( + n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + + q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + + model[src] = torch.cat((q,k,v)).reshape_as(model[src]) + ``` + ##### RWKV -**TODO**. +The vocabulary size is the same as the number of rows in the `head` matrix. + +- `rwkv.architecture_version: u32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future. +- `rwkv.context_length: u32`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer. +- `rwkv.layer_count: u32` +- `rwkv.embedding_length: u32` +- `rwkv.feedforward_length: u32` ##### Whisper @@ -360,8 +411,14 @@ GGML supports an embedded vocabulary that may be lossily compressed from a more It is not guaranteed to be standardized across models, and may change in the future. It is recommended that model authors use a more standardized tokenizer if possible. -- `tokenizer.ggml.tokens: array[string]`: A list of tokens. +- `tokenizer.ggml.model: string`: The name of the tokenizer model. + - `llama`: Llama style SentencePiece (tokens and scores extracted from HF `tokenizer.model`) + - `replit`: Replit style SentencePiece (tokens and scores extracted from HF `spiece.model`) + - `gpt2`: GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`) + - `rwkv`: RWKV tokenizer +- `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model. - `tokenizer.ggml.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. +- `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic. - `tokenizer.ggml.bos_token_id: u32`: Beginning of sequence marker - `tokenizer.ggml.eos_token_id: u32`: End of sequence marker - `tokenizer.ggml.unknown_token_id: u32`: Unknown token From 0133f2e5f908b7bfd2454ff3ba17dc00c9f0ffaf Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 25 Jul 2023 22:26:25 +0200 Subject: [PATCH 07/27] docs(gguf): add norm eps and added_tokens --- docs/gguf.md | 63 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 30b56e771..796d0a411 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -28,7 +28,7 @@ Fields, including arrays, are written sequentially without alignment unless othe ```c enum ggml_type { - GGML_TYPE_F32 = 0, + GGML_TYPE_float32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, @@ -208,7 +208,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `bloom` - `falcon` - `rwkv` -- **`general.quantization_version: u32`**: version of quantization scheme. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. +- **`general.quantization_version: uint32`**: version of quantization scheme. Not required if the model is not quantized (i.e. no tensors are quantized). 
If any tensors are quantized, this _must_ be present. #### General metadata @@ -230,10 +230,10 @@ Information about where this model came from. This is useful for tracking the pr In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. -- `[llm].context_length: u32`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. -- `[llm].embedding_length: u32`: Also known as `n_embd`. Embedding layer size. -- `[llm].layer_count: u32`: Also known as `n_layers`. The number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. -- `[llm].feedforward_length: u32`: Also known as `n_ff`. The length of the feedforward layer. +- `[llm].context_length: uint32`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. +- `[llm].embedding_length: uint32`: Also known as `n_embd`. Embedding layer size. +- `[llm].layer_count: uint32`: Also known as `n_layers`. The number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].feedforward_length: uint32`: Also known as `n_ff`. The length of the feedforward layer. - `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. - `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`. - `reference`: tensors are laid out in the same order as the original model @@ -241,15 +241,17 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch #### Attention -- `[llm].attention.head_count: u32`: Also known as `n_head`. Number of attention heads. -- `[llm].attention.head_count_kv: u32`: The number of heads per group used in Grouped-Query-Attention. If not present, the model does not use GQA. -- `[llm].attention.max_alibi_bias: f32`: The maximum bias to use for ALiBI. -- `[llm].attention.clamp_kqv: f32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). +- `[llm].attention.head_count: uint32`: Also known as `n_head`. Number of attention heads. +- `[llm].attention.head_count_kv: uint32`: The number of heads per group used in Grouped-Query-Attention. If not present, the model does not use GQA. +- `[llm].attention.max_alibi_bias: float32`: The maximum bias to use for ALiBI. +- `[llm].attention.clamp_kqv: float32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). +- `[llm].attention.layer_norm_epsilon: float32`: Layer normalization epsilon. +- `[llm].attention.layer_norm_rms_epsilon: float32`: Layer RMS normalization epsilon. #### RoPE -- `[llm].rope.dimension_count: u32`: The number of rotary dimensions for RoPE. -- `[llm].rope.scale: f32`: A scale factor for RoPE to adjust the context length. +- `[llm].rope.dimension_count: uint32`: The number of rotary dimensions for RoPE. 
+- `[llm].rope.scale: float32`: A scale factor for RoPE to adjust the context length. #### Models @@ -263,6 +265,7 @@ The following sections describe the metadata for each model architecture. Each k - `llama.feedforward_length` - `llama.rope.dimension_count` - `llama.attention.head_count` +- `llama.attention.layer_norm_rms_epsilon` ###### Optional @@ -285,6 +288,7 @@ The following sections describe the metadata for each model architecture. Each k - `mpt.attention.head_count` - `mpt.attention.alibi_bias_max` - `mpt.attention.clip_kqv` +- `mpt.attention.layer_norm_epsilon` ##### GPT-NeoX @@ -294,6 +298,7 @@ The following sections describe the metadata for each model architecture. Each k - `gptneox.use_parallel_residual` - `gptneox.rope.dimension_count` - `gptneox.attention.head_count` +- `gptneox.attention.layer_norm_epsilon` ###### Optional @@ -306,6 +311,7 @@ The following sections describe the metadata for each model architecture. Each k - `gptj.layer_count` - `gptj.rope.dimension_count` - `gptj.attention.head_count` +- `gptj.attention.layer_norm_epsilon` ###### Optional @@ -317,6 +323,7 @@ The following sections describe the metadata for each model architecture. Each k - `gpt2.embedding_length` - `gpt2.layer_count` - `gpt2.attention.head_count` +- `gpt2.attention.layer_norm_epsilon` ##### BLOOM @@ -325,6 +332,7 @@ The following sections describe the metadata for each model architecture. Each k - `bloom.layer_count` - `bloom.feedforward_length` - `bloom.attention.head_count` +- `bloom.attention.layer_norm_epsilon` ##### Falcon @@ -334,6 +342,7 @@ The following sections describe the metadata for each model architecture. Each k - `falcon.attention.head_count` - `falcon.attention.head_count_kv` - `falcon.attention.use_norm` +- `falcon.attention.layer_norm_epsilon` ###### Optional @@ -365,11 +374,11 @@ The following sections describe the metadata for each model architecture. Each k The vocabulary size is the same as the number of rows in the `head` matrix. -- `rwkv.architecture_version: u32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future. -- `rwkv.context_length: u32`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer. -- `rwkv.layer_count: u32` -- `rwkv.embedding_length: u32` -- `rwkv.feedforward_length: u32` +- `rwkv.architecture_version: uint32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future. +- `rwkv.context_length: uint32`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer. +- `rwkv.layer_count: uint32` +- `rwkv.embedding_length: uint32` +- `rwkv.feedforward_length: uint32` ##### Whisper @@ -380,7 +389,7 @@ This is because they are both transformer models. - `whisper.encoder.context_length` - `whisper.encoder.embedding_length` - `whisper.encoder.layer_count` -- `whisper.encoder.mels_count: u32` +- `whisper.encoder.mels_count: uint32` - `whisper.encoder.attention.head_count` - `whisper.decoder.context_length` @@ -417,13 +426,17 @@ It is not guaranteed to be standardized across models, and may change in the fut - `gpt2`: GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`) - `rwkv`: RWKV tokenizer - `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model. 
-- `tokenizer.ggml.scores: array[f32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. +- `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. - `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic. -- `tokenizer.ggml.bos_token_id: u32`: Beginning of sequence marker -- `tokenizer.ggml.eos_token_id: u32`: End of sequence marker -- `tokenizer.ggml.unknown_token_id: u32`: Unknown token -- `tokenizer.ggml.separator_token_id: u32`: Separator token -- `tokenizer.ggml.padding_token_id: u32`: Padding token +- `tokenizer.ggml.added_tokens: array[string]`: If present, tokens that were added after training. + +##### Special tokens + +- `tokenizer.ggml.bos_token_id: uint32`: Beginning of sequence marker +- `tokenizer.ggml.eos_token_id: uint32`: End of sequence marker +- `tokenizer.ggml.unknown_token_id: uint32`: Unknown token +- `tokenizer.ggml.separator_token_id: uint32`: Separator token +- `tokenizer.ggml.padding_token_id: uint32`: Padding token #### Hugging Face @@ -472,7 +485,7 @@ These formats share the same fundamental structure: - metadata about the model, such as the number of layers, the number of heads, etc. - a `ftype` that describes the type of the majority of the tensors, - for GGML files, the quantization version is encoded in the `ftype` divided by 1000 -- an embedded vocabulary, which is a list of strings with length prepended. The GGMF/GGJT formats embed a f32 score next to the strings. +- an embedded vocabulary, which is a list of strings with length prepended. The GGMF/GGJT formats embed a float32 score next to the strings. - finally, a list of tensors with their length-prepended name, type, and (aligned, in the case of GGJT) tensor data Notably, this structure does not identify what model architecture the model belongs to, nor does it offer any flexibility for changing the structure of the hyperparameters. This means that the only way to add new hyperparameters is to add them to the end of the list, which is a breaking change for existing models. From e9988f76e35340654d621f8caa8f5ca8e6c1d3f0 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 26 Jul 2023 14:46:26 +0200 Subject: [PATCH 08/27] docs(gguf): move padding --- docs/gguf.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 796d0a411..8fad730e1 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -165,12 +165,12 @@ struct gguf_file_t { // The header of the file. gguf_header_t header; - // Padding to the nearest multiple of `ALIGNMENT`. - uint8_t _padding[ALIGNMENT - (sizeof(header) % ALIGNMENT)]; - // Tensor infos, which can be used to locate the tensor data. gguf_tensor_info_t tensor_infos[header.tensor_count]; + // Padding to the nearest multiple of `ALIGNMENT`. + uint8_t _padding[ALIGNMENT - (sizeof(header + tensor_infos) % ALIGNMENT)]; + // Tensor data. // // This is arbitrary binary data corresponding to the weights of the model. 
This data should be close From f4c4d6a421a8158293aaeb3b775a9be78599cfab Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 27 Jul 2023 09:10:48 +0200 Subject: [PATCH 09/27] docs(gguf): remove migration tool --- docs/gguf.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 8fad730e1..a6133c5b4 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -456,12 +456,6 @@ This is a future extension and still needs to be discussed, and may necessitate A sample computation graph of GGML nodes could be included in the model itself, allowing an executor to run the model without providing its own implementation of the architecture. This would allow for a more consistent experience across executors, and would allow for more complex architectures to be supported without requiring the executor to implement them. -## Migration - -All existing Python conversion scripts will be consolidated to use one `gguf` library. They will take models from Hugging Face or elsewhere and produce compliant GGUF files with all of the recommended metadata. - -Existing models do not have enough information to be directly converted to GGUF. Instead, a migration tool may be built that takes an existing GGML/GGMF/GGJT file and prompts the user for the missing information. This tool will be executor-agnostic, and will be able to produce a GGUF file that can be used by any executor. This tool may hardcode settings for models with known hashes to ease the migration process, such that a user can run `./migrate nous-hermes-13b.ggmlv3.q5_1.bin` and obtain a `nous-hermes-13b.ggmlv3.q5_1.gguf` file that is ready to use and consistent with uploaded models. - --- ## Current State of Affairs From 39da254bc4ac0490a3c449e3308f338357ef64dd Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 27 Jul 2023 09:19:21 +0200 Subject: [PATCH 10/27] docs(gguf): make offset base explicit --- docs/gguf.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/gguf.md b/docs/gguf.md index a6133c5b4..bc5710acc 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -157,6 +157,10 @@ struct gguf_tensor_info_t { // The type of the tensor. ggml_type type; // The offset of the tensor's data in this file in bytes. + // This offset is relative to `tensor_data`, not to the start + // of the file, to make it easier for writers to write the file. + // Readers should consider exposing this offset relative to the + // file to make it easier to read the data. // Must be a multiple of `ALIGNMENT`. 
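+    // For example (illustrative, not normative): a reader that wants the
+    // absolute position of the tensor data within the file can compute
+    //   absolute_offset = file_offset_of(tensor_data) + offset;
+    // where `file_offset_of(tensor_data)` is the aligned position
+    // immediately after the last `gguf_tensor_info_t` entry.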
uint64_t offset; }; From a6d1cc12eee788ab81fd7174af7c76aea9a8c7de Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 27 Jul 2023 11:20:51 +0200 Subject: [PATCH 11/27] docs(gguf): fix replace oops --- docs/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gguf.md b/docs/gguf.md index bc5710acc..8246700b6 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -28,7 +28,7 @@ Fields, including arrays, are written sequentially without alignment unless othe ```c enum ggml_type { - GGML_TYPE_float32 = 0, + GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, From 1d134ece1a31370af0fca60a09cd6421b24c75ef Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 6 Aug 2023 21:11:10 +0200 Subject: [PATCH 12/27] docs(gguf): alignment metadata+tensor name len max --- docs/gguf.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 8246700b6..84652c8b5 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -20,12 +20,10 @@ The key difference between GGJT and GGUF is the use of a key-value structure for ### File Structure -GGUF files are structured as follows. They assume the use of a global `ALIGNMENT` constant, which is the alignment of the model data. This is currently 64 bytes, but may change in the future. [^1] To achieve this, where relevant, the file is padded with `0x00` bytes to the next multiple of `ALIGNMENT`. +GGUF files are structured as follows. They use a global alignment specified in the `general.alignment` metadata field. Where required, the file is padded with `0x00` bytes to the next multiple of `general.alignment`. Fields, including arrays, are written sequentially without alignment unless otherwise specified. -[^1]: This may be moved to a per-model key-value pair in the future. - ```c enum ggml_type { GGML_TYPE_F32 = 0, @@ -107,10 +105,10 @@ union gguf_metadata_value_t { }; struct gguf_metadata_kv_t { - // A standard GGUF string, with the following caveats: + // The key of the metadata. It is a standard GGUF string, with the following caveats: // - It must be a valid ASCII string. // - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`. - // - It must be at most 2^16-1 bytes long. + // - It must be at most 2^16-1/65535 bytes long. // Any keys that do not follow these rules are invalid. gguf_string_t key; @@ -145,7 +143,8 @@ struct gguf_header_t { }; struct gguf_tensor_info_t { - // The name of the tensor. + // The name of the tensor. It is a standard GGUF string, with the caveat that + // it must be at most 64 bytes long. gguf_string_t name; // The number of dimensions in the tensor. // Currently at most 4, but this may change in the future. @@ -213,6 +212,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `falcon` - `rwkv` - **`general.quantization_version: uint32`**: version of quantization scheme. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. +- **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. 
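As a non-normative illustration of the key rules described above, a reader could validate metadata keys with a small helper along these lines; the helper name and the exact character checks are assumptions of this sketch, not part of the format:

```c
#include <stdbool.h>
#include <stddef.h>

// Sketch of a metadata key validator: ASCII only, dot-separated
// lower_snake_case segments, at most 65535 bytes in total.
static bool gguf_key_is_valid(const char *key, size_t len) {
    if (len == 0 || len > 65535) {
        return false;
    }
    bool at_segment_start = true;
    for (size_t i = 0; i < len; i++) {
        unsigned char c = (unsigned char) key[i];
        if (c == '.') {
            if (at_segment_start) {
                return false; // empty segment, e.g. "general..name"
            }
            at_segment_start = true;
            continue;
        }
        // lower_snake_case segments: lowercase ASCII letters, digits, '_'
        if (!((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_')) {
            return false;
        }
        at_segment_start = false;
    }
    return !at_segment_start; // the key must not end with '.'
}
```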
#### General metadata From 2a90bbfe180b97d10937aea21ed6f3874dc22716 Mon Sep 17 00:00:00 2001 From: Philpax Date: Mon, 14 Aug 2023 21:05:20 +0200 Subject: [PATCH 13/27] docs(gguf): clarification, fixes, tensor names --- docs/gguf.md | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 84652c8b5..953304fdc 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -4,7 +4,7 @@ GGUF is a file format for storing models for inference with GGML and executors b It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new features can be added to GGML without breaking compatibility with older models. -For more information about the motivation behind GGUF, see [Current State of Affairs](#current-state-of-affairs). +For more information about the motivation behind GGUF, see [Historical State of Affairs](#historical-state-of-affairs). ## Specification @@ -211,7 +211,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `bloom` - `falcon` - `rwkv` -- **`general.quantization_version: uint32`**: version of quantization scheme. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. +- **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. This is separate to the quantization scheme described in `general.file_type` because the quantization version may change without changing the file type (e.g. the quantization scheme is Q5_K, and the quantization version is 4). - **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. #### General metadata @@ -236,7 +236,7 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch - `[llm].context_length: uint32`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. - `[llm].embedding_length: uint32`: Also known as `n_embd`. Embedding layer size. -- `[llm].layer_count: uint32`: Also known as `n_layers`. The number of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].block_count: uint32`: The number of blocks of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. - `[llm].feedforward_length: uint32`: Also known as `n_ff`. The length of the feedforward layer. - `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. - `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`. 
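To make the `[llm]` substitution concrete, here is a minimal, non-normative sketch of how an executor might derive the real key names from `general.architecture`; the code is purely illustrative:

```c
#include <stdio.h>

// Sketch only: per-architecture keys are formed by replacing `[llm]` with
// the value of `general.architecture` (e.g. "llama", "mpt", "falcon").
int main(void) {
    const char *architecture = "llama"; // as read from general.architecture
    char key[128];
    snprintf(key, sizeof(key), "%s.context_length", architecture);
    printf("%s\n", key); // prints "llama.context_length"
    return 0;
}
```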
@@ -430,7 +430,8 @@ It is not guaranteed to be standardized across models, and may change in the fut - `gpt2`: GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`) - `rwkv`: RWKV tokenizer - `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model. -- `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. Must be the same length as `tokens`. +- `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. If present, it must have the same length and index as `tokens`. +- `tokenizer.ggml.token_type: array[uint32]`: The token type (1=normal, 2=unknown, 3=control, 4=user defined, 5=unused, 6=byte). If present, it must have the same length and index as `tokens`. - `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic. - `tokenizer.ggml.added_tokens: array[string]`: If present, tokens that were added after training. @@ -460,9 +461,43 @@ This is a future extension and still needs to be discussed, and may necessitate A sample computation graph of GGML nodes could be included in the model itself, allowing an executor to run the model without providing its own implementation of the architecture. This would allow for a more consistent experience across executors, and would allow for more complex architectures to be supported without requiring the executor to implement them. +## Standardized tensor names + +To minimize complexity and maximize compatibility, it is recommended that models using the transformer architecture use the following naming convention for their tensors: + +### Base layers + +`AA.weight` `AA.bias` + +where `AA` can be: + +- `token_embd`: Token embedding layer +- `pos_embd`: Position embedding layer +- `output_norm`: Output normalization layer +- `output`: Output layer + +### Attention and feed-forward layer blocks + +`blk.N.BB.weight` `blk.N.BB.bias` + +where N signifies the block number a layer belongs to, and where `BB` could be: + +- `attn_norm`: Attention normalization layer +- `attn_norm_2`: Attention normalization layer +- `attn_qkv`: Attention query-key-value layer +- `attn_q`: Attention query layer +- `attn_k`: Attention key layer +- `attn_v`: Attention value layer +- `attn_output`: Attention output layer + +- `ffn_norm`: Feed-forward network normalization layer +- `ffn_up`: Feed-forward network "up" layer +- `ffn_gate`: Feed-forward network "gate" layer +- `ffn_down`: Feed-forward network "down" layer + --- -## Current State of Affairs +## Historical State of Affairs The following information is provided for context, but is not necessary to understand the rest of this document. From 3d4507e73f5a804c25bbb9f955e3833618afb440 Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 15 Aug 2023 18:51:01 +0200 Subject: [PATCH 14/27] docs(gguf): clarify license --- docs/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gguf.md b/docs/gguf.md index 953304fdc..d47b89e67 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -221,7 +221,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc. 
- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields - `general.file_type: string`: type of the majority of the tensors in the file. This shouldn't have any semantic meaning and should be purely informational, hence the use of `string`. -- `general.license: string`: SPDX license of the model +- `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0`). Do not include any other information, such as the license text or the URL to the license. #### Source metadata From 39d63776c6b76ece15a64538041504c2ae738867 Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 15 Aug 2023 22:46:47 +0200 Subject: [PATCH 15/27] docs(gguf): minor tweaks --- docs/gguf.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index d47b89e67..08714af02 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -352,7 +352,7 @@ The following sections describe the metadata for each model architecture. Each k - `falcon.tensor_data_layout`: - - `llama.cpp` (this name may be inaccurate depending on where the Falcon implementation ends up): + - `jploski` (author of the original GGML implementation of Falcon; this may change in the future): ```python # The original query_key_value tensor contains n_head_kv "kv groups", @@ -420,7 +420,7 @@ The following keys are used to describe the tokenizer of the model. It is recomm #### GGML -GGML supports an embedded vocabulary that may be lossily compressed from a more complete tokenizer. It is simplistic and specific to GGML. This should enable inferencing of the model, but it may not fully capture the nuances of tokenization. When a more accurate tokenizer is available and supported, it should be used instead. +GGML supports an embedded vocabulary that enables inference of the model, but implementations of tokenization using this vocabulary (i.e. `llama.cpp`'s tokenizer) may have lower accuracy than the original tokenizer used for the model. When a more accurate tokenizer is available and supported, it should be used instead. It is not guaranteed to be standardized across models, and may change in the future. It is recommended that model authors use a more standardized tokenizer if possible. From e36b4ca4cd71a6958efafae6f2b96e475d426a47 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 20 Aug 2023 16:32:39 +0200 Subject: [PATCH 16/27] docs(gguf): data layout, GQA eq, no ft, LE GGUF --- docs/gguf.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 08714af02..802fae745 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -123,7 +123,8 @@ struct gguf_metadata_kv_t { struct gguf_header_t { // Magic number to announce that this is a GGUF file. - // Must be `'GGUF'`/`0x47475546`. + // Must be `GGUF` as a 32-bit little-endian integer: + // `0x46` `0x55` `0x47` `0x47` (i.e. 'F' 'U' 'G' 'G' at the byte level). uint32_t magic; // The version of the format implemented. // Must be `1` for version described in this spec. @@ -211,7 +212,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `bloom` - `falcon` - `rwkv` -- **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. 
This is separate to the quantization scheme described in `general.file_type` because the quantization version may change without changing the file type (e.g. the quantization scheme is Q5_K, and the quantization version is 4). +- **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. This is separate to the quantization scheme of the tensors itself; the quantization version may change without changing the scheme's name (e.g. the quantization scheme is Q5_K, and the quantization version is 4). - **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. #### General metadata @@ -220,7 +221,6 @@ If a particular community key is widely used, it may be promoted to a standardiz - `general.author`: The author of the model. - `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc. - `general.description: string`: free-form description of the model including anything that isn't covered by the other fields -- `general.file_type: string`: type of the majority of the tensors in the file. This shouldn't have any semantic meaning and should be purely informational, hence the use of `string`. - `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0`). Do not include any other information, such as the license text or the URL to the license. #### Source metadata @@ -246,7 +246,7 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch #### Attention - `[llm].attention.head_count: uint32`: Also known as `n_head`. Number of attention heads. -- `[llm].attention.head_count_kv: uint32`: The number of heads per group used in Grouped-Query-Attention. If not present, the model does not use GQA. +- `[llm].attention.head_count_kv: uint32`: The number of heads per group used in Grouped-Query-Attention. If not present or if present and equal to `[llm].attention.head_count`, the model does not use GQA. - `[llm].attention.max_alibi_bias: float32`: The maximum bias to use for ALiBI. - `[llm].attention.clamp_kqv: float32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). - `[llm].attention.layer_norm_epsilon: float32`: Layer normalization epsilon. @@ -276,7 +276,7 @@ The following sections describe the metadata for each model architecture. Each k - `llama.rope.scale` - `llama.attention.head_count_kv` - `llama.tensor_data_layout`: - - `llama.cpp`: + - `Meta AI original pth`: ```python def permute(weights: NDArray, n_head: int) -> NDArray: return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) From d5cfb553dbb1ebd5cf46f4c59934c1f48b469c8e Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 20 Aug 2023 20:55:39 +0200 Subject: [PATCH 17/27] docs(gguf): fix magic order --- docs/gguf.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 802fae745..08de76b4d 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -123,8 +123,10 @@ struct gguf_metadata_kv_t { struct gguf_header_t { // Magic number to announce that this is a GGUF file. - // Must be `GGUF` as a 32-bit little-endian integer: - // `0x46` `0x55` `0x47` `0x47` (i.e. 'F' 'U' 'G' 'G' at the byte level). 
+ // Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`. + // Your executor might do little-endian byte order, so it might be + // check for 0x46554747 and letting the endianness cancel out. + // Consider being *very* explicit about the byte order here. uint32_t magic; // The version of the format implemented. // Must be `1` for version described in this spec. From aa8d0ba00b7aeb0570d5afba994d709064c20001 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 20 Aug 2023 21:54:31 +0200 Subject: [PATCH 18/27] docs(gguf): match impl --- docs/gguf.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 08de76b4d..aac7a40e5 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -95,10 +95,10 @@ union gguf_metadata_value_t { bool bool_; gguf_string_t string; struct { - // Number of elements, not bytes - uint32_t len; // Any value type is valid, including arrays. gguf_metadata_value_type type; + // Number of elements, not bytes + uint32_t len; // The array of values. gguf_metadata_value_t array[len]; } array; @@ -112,8 +112,6 @@ struct gguf_metadata_kv_t { // Any keys that do not follow these rules are invalid. gguf_string_t key; - // The length of the value, in bytes - uint32_t value_len; // The type of the value. // Must be one of the `gguf_metadata_value_type` values. gguf_metadata_value_type value_type; From f3e7632402584ee547015af3cf1ef4297ef3de0e Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 20 Aug 2023 22:01:45 +0200 Subject: [PATCH 19/27] docs(gguf): specify fallback alignment --- docs/gguf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gguf.md b/docs/gguf.md index aac7a40e5..2895bda44 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -213,7 +213,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `falcon` - `rwkv` - **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. This is separate to the quantization scheme of the tensors itself; the quantization version may change without changing the scheme's name (e.g. the quantization scheme is Q5_K, and the quantization version is 4). -- **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. +- **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. Some writers may not write the alignment. If the alignment is **not** specified, assume it is `32`. #### General metadata From 2fe03e5f1213ff1a2ddc7652f9b5783e8e836188 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 20 Aug 2023 22:20:01 +0200 Subject: [PATCH 20/27] docs(gguf): remove TensorInfo::n_elements --- docs/gguf.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 2895bda44..a39782522 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -152,8 +152,6 @@ struct gguf_tensor_info_t { uint32_t n_dimensions; // The dimensions of the tensor. uint32_t dimensions[n_dimensions]; - // The number of elements in the tensor. - uint32_t n_elements; // The type of the tensor. ggml_type type; // The offset of the tensor's data in this file in bytes. 
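To make the byte-order advice above concrete, here is a minimal, non-normative sketch of a magic check that compares the four bytes directly, so the result does not depend on the host's endianness; the function name and I/O style are assumptions of the sketch:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Sketch only: read the first four bytes and compare them byte-for-byte
// against 'G' 'G' 'U' 'F' (0x47 0x47 0x55 0x46), independent of host
// endianness.
static int gguf_has_magic(FILE *f) {
    uint8_t magic[4];
    if (fread(magic, 1, sizeof magic, f) != sizeof magic) {
        return 0;
    }
    return memcmp(magic, "GGUF", 4) == 0;
}
```

A writer should likewise emit these four bytes in this fixed order rather than writing a native-endian 32-bit integer.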
From 2b65fba00c83b9fa041df2ac55ccd8c2f10c5281 Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 24 Aug 2023 21:23:33 +0200 Subject: [PATCH 21/27] docs(gguf): filetype, rope base/linear scale --- docs/gguf.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/gguf.md b/docs/gguf.md index a39782522..8291b9180 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -220,6 +220,26 @@ If a particular community key is widely used, it may be promoted to a standardiz - `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc. - `general.description: string`: free-form description of the model including anything that isn't covered by the other fields - `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0`). Do not include any other information, such as the license text or the URL to the license. +- `general.file_type: u32`: An enumerated value describing the type of the majority of the tensors in the file. Optional; can be inferred from the tensor types. + - `ALL_F32 = 0` + - `MOSTLY_F16 = 1` + - `MOSTLY_Q4_0 = 2` + - `MOSTLY_Q4_1 = 3` + - `MOSTLY_Q4_1_SOME_F16 = 4` + - `MOSTLY_Q4_2 = 5` (support removed) + - `MOSTLY_Q4_3 = 6` (support removed) + - `MOSTLY_Q8_0 = 7` + - `MOSTLY_Q5_0 = 8` + - `MOSTLY_Q5_1 = 9` + - `MOSTLY_Q2_K = 10` + - `MOSTLY_Q3_K_S = 11` + - `MOSTLY_Q3_K_M = 12` + - `MOSTLY_Q3_K_L = 13` + - `MOSTLY_Q4_K_S = 14` + - `MOSTLY_Q4_K_M = 15` + - `MOSTLY_Q5_K_S = 16` + - `MOSTLY_Q5_K_M = 17` + - `MOSTLY_Q6_K = 18` #### Source metadata @@ -253,7 +273,8 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch #### RoPE - `[llm].rope.dimension_count: uint32`: The number of rotary dimensions for RoPE. -- `[llm].rope.scale: float32`: A scale factor for RoPE to adjust the context length. +- `[llm].rope.freq_base: float32`: The base frequency for RoPE. +- `[llm].rope.scale_linear: float32`: A linear scale factor for RoPE to adjust the context length. #### Models From b021b2577d4294800ece200c9f26c9c65b0f6f51 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sat, 26 Aug 2023 21:14:29 +0200 Subject: [PATCH 22/27] docs(gguf): v2 - uint64 all the things --- docs/gguf.md | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 8291b9180..36bea4637 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -74,12 +74,18 @@ enum gguf_metadata_value_type: uint32_t { /// // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes. GGUF_METADATA_VALUE_TYPE_ARRAY = 9, + // The value is a 64-bit unsigned little-endian integer. + GGUF_METADATA_VALUE_TYPE_UINT64 = 10, + // The value is a 64-bit signed little-endian integer. + GGUF_METADATA_VALUE_TYPE_INT64 = 11, + // The value is a 64-bit IEEE754 floating point number. + GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12, } // A string in GGUF. struct gguf_string_t { // The length of the string, in bytes. - uint32_t len; + uint64_t len; // The string as a UTF-8 non-null-terminated string. char string[len]; } @@ -92,13 +98,16 @@ union gguf_metadata_value_t { uint32_t uint32; int32_t int32; float float32; + uint64_t uint64; + int64_t int64; + double float64; bool bool_; gguf_string_t string; struct { // Any value type is valid, including arrays. 
gguf_metadata_value_type type; // Number of elements, not bytes - uint32_t len; + uint64_t len; // The array of values. gguf_metadata_value_t array[len]; } array; @@ -127,7 +136,7 @@ struct gguf_header_t { // Consider being *very* explicit about the byte order here. uint32_t magic; // The version of the format implemented. - // Must be `1` for version described in this spec. + // Must be `2` for version described in this spec. // // This version should only be increased for structural changes to the format. // Changes that do not affect the structure of the file should instead update the metadata @@ -136,9 +145,9 @@ struct gguf_header_t { // The number of tensors in the file. // This is explicit, instead of being included in the metadata, to ensure it is always present // for loading the tensors. - uint32_t tensor_count; + uint64_t tensor_count; // The number of metadata key-value pairs. - uint32_t metadata_kv_count; + uint64_t metadata_kv_count; // The metadata key-value pairs. gguf_metadata_kv_t metadata_kv[metadata_kv_count]; }; @@ -151,7 +160,7 @@ struct gguf_tensor_info_t { // Currently at most 4, but this may change in the future. uint32_t n_dimensions; // The dimensions of the tensor. - uint32_t dimensions[n_dimensions]; + uint64_t dimensions[n_dimensions]; // The type of the tensor. ggml_type type; // The offset of the tensor's data in this file in bytes. @@ -197,6 +206,8 @@ The community can develop their own key-value pairs to carry additional data. Ho If a particular community key is widely used, it may be promoted to a standardized key. +By convention, most counts/lengths/etc are `uint64` unless otherwise specified. This is to allow for larger models to be supported in the future. Some models may use `uint32` for their values; it is recommended that readers support both. + ### General #### Required @@ -220,7 +231,7 @@ If a particular community key is widely used, it may be promoted to a standardiz - `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc. - `general.description: string`: free-form description of the model including anything that isn't covered by the other fields - `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0`). Do not include any other information, such as the license text or the URL to the license. -- `general.file_type: u32`: An enumerated value describing the type of the majority of the tensors in the file. Optional; can be inferred from the tensor types. +- `general.file_type: uint32`: An enumerated value describing the type of the majority of the tensors in the file. Optional; can be inferred from the tensor types. - `ALL_F32 = 0` - `MOSTLY_F16 = 1` - `MOSTLY_Q4_0 = 2` @@ -252,10 +263,10 @@ Information about where this model came from. This is useful for tracking the pr In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. -- `[llm].context_length: uint32`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. -- `[llm].embedding_length: uint32`: Also known as `n_embd`. Embedding layer size. 
-- `[llm].block_count: uint32`: The number of blocks of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. -- `[llm].feedforward_length: uint32`: Also known as `n_ff`. The length of the feedforward layer. +- `[llm].context_length: uint64`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. +- `[llm].embedding_length: uint64`: Also known as `n_embd`. Embedding layer size. +- `[llm].block_count: uint64`: The number of blocks of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].feedforward_length: uint64`: Also known as `n_ff`. The length of the feedforward layer. - `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. - `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`. - `reference`: tensors are laid out in the same order as the original model @@ -263,8 +274,8 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch #### Attention -- `[llm].attention.head_count: uint32`: Also known as `n_head`. Number of attention heads. -- `[llm].attention.head_count_kv: uint32`: The number of heads per group used in Grouped-Query-Attention. If not present or if present and equal to `[llm].attention.head_count`, the model does not use GQA. +- `[llm].attention.head_count: uint64`: Also known as `n_head`. Number of attention heads. +- `[llm].attention.head_count_kv: uint64`: The number of heads per group used in Grouped-Query-Attention. If not present or if present and equal to `[llm].attention.head_count`, the model does not use GQA. - `[llm].attention.max_alibi_bias: float32`: The maximum bias to use for ALiBI. - `[llm].attention.clamp_kqv: float32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`). - `[llm].attention.layer_norm_epsilon: float32`: Layer normalization epsilon. @@ -272,7 +283,7 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch #### RoPE -- `[llm].rope.dimension_count: uint32`: The number of rotary dimensions for RoPE. +- `[llm].rope.dimension_count: uint64`: The number of rotary dimensions for RoPE. - `[llm].rope.freq_base: float32`: The base frequency for RoPE. - `[llm].rope.scale_linear: float32`: A linear scale factor for RoPE to adjust the context length. @@ -398,10 +409,10 @@ The following sections describe the metadata for each model architecture. Each k The vocabulary size is the same as the number of rows in the `head` matrix. - `rwkv.architecture_version: uint32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future. -- `rwkv.context_length: uint32`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer. -- `rwkv.layer_count: uint32` -- `rwkv.embedding_length: uint32` -- `rwkv.feedforward_length: uint32` +- `rwkv.context_length: uint64`: Length of the context used during training or fine-tuning. 
RWKV is able to handle larger context than this limit, but the output quality may suffer. +- `rwkv.layer_count: uint64` +- `rwkv.embedding_length: uint64` +- `rwkv.feedforward_length: uint64` ##### Whisper @@ -412,7 +423,7 @@ This is because they are both transformer models. - `whisper.encoder.context_length` - `whisper.encoder.embedding_length` - `whisper.encoder.layer_count` -- `whisper.encoder.mels_count: uint32` +- `whisper.encoder.mels_count: uint64` - `whisper.encoder.attention.head_count` - `whisper.decoder.context_length` From 2da80c137a11059a023800dfc4751e11a6d7bfb4 Mon Sep 17 00:00:00 2001 From: Philpax Date: Tue, 29 Aug 2023 01:43:25 +0200 Subject: [PATCH 23/27] docs(gguf): tweak extensibility wording --- docs/gguf.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 36bea4637..468c1af28 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -2,7 +2,7 @@ GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML. -It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new features can be added to GGML without breaking compatibility with older models. +It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new information can be added to models without breaking compatibility. For more information about the motivation behind GGUF, see [Historical State of Affairs](#historical-state-of-affairs). @@ -11,7 +11,7 @@ For more information about the motivation behind GGUF, see [Historical State of GGUF is a format based on the existing GGJT, but makes a few changes to the format to make it more extensible and easier to use. The following features are desired: - Single-file deployment: they can be easily distributed and loaded, and do not require any external files for additional information. -- Extensible: new features can be added to GGML without breaking compatibility with existing models. +- Extensible: new features can be added to GGML-based executors/new information can be added to GGUF models without breaking compatibility with existing models. - `mmap` compatibility: models can be loaded using `mmap` for fast loading and saving. - Easy to use: models can be easily loaded and saved using a small amount of code, with no need for external libraries, regardless of the language used. - Full information: all information needed to load a model is contained in the model file, and no additional information needs to be provided by the user. From 574b408f472923071fbc7a265c974c00ce01f959 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sat, 9 Sep 2023 12:26:52 +0200 Subject: [PATCH 24/27] docs(gguf): fix spec discrepancies --- docs/gguf.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 468c1af28..34a9d12ae 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -261,12 +261,12 @@ Information about where this model came from. 
This is useful for tracking the pr ### LLM -In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. They will be used in each architecture's section. +In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. For example, `llama` for LLaMA, `mpt` for MPT, etc. If mentioned in an architecture's section, it is required for that architecture, but not all keys are required for all architectures. Consult the relevant section for more information. - `[llm].context_length: uint64`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed. - `[llm].embedding_length: uint64`: Also known as `n_embd`. Embedding layer size. -- `[llm].block_count: uint64`: The number of blocks of attention+feedforward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. -- `[llm].feedforward_length: uint64`: Also known as `n_ff`. The length of the feedforward layer. +- `[llm].block_count: uint64`: The number of blocks of attention+feed-forward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers. +- `[llm].feed_forward_length: uint64`: Also known as `n_ff`. The length of the feed-forward layer. - `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used. - `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`. - `reference`: tensors are laid out in the same order as the original model @@ -295,8 +295,8 @@ The following sections describe the metadata for each model architecture. Each k - `llama.context_length` - `llama.embedding_length` -- `llama.layer_count` -- `llama.feedforward_length` +- `llama.block_count` +- `llama.feed_forward_length` - `llama.rope.dimension_count` - `llama.attention.head_count` - `llama.attention.layer_norm_rms_epsilon` @@ -318,7 +318,7 @@ The following sections describe the metadata for each model architecture. Each k - `mpt.context_length` - `mpt.embedding_length` -- `mpt.layer_count` +- `mpt.block_count` - `mpt.attention.head_count` - `mpt.attention.alibi_bias_max` - `mpt.attention.clip_kqv` @@ -328,7 +328,7 @@ The following sections describe the metadata for each model architecture. Each k - `gptneox.context_length` - `gptneox.embedding_length` -- `gptneox.layer_count` +- `gptneox.block_count` - `gptneox.use_parallel_residual` - `gptneox.rope.dimension_count` - `gptneox.attention.head_count` @@ -342,7 +342,7 @@ The following sections describe the metadata for each model architecture. Each k - `gptj.context_length` - `gptj.embedding_length` -- `gptj.layer_count` +- `gptj.block_count` - `gptj.rope.dimension_count` - `gptj.attention.head_count` - `gptj.attention.layer_norm_epsilon` @@ -355,7 +355,7 @@ The following sections describe the metadata for each model architecture. Each k - `gpt2.context_length` - `gpt2.embedding_length` -- `gpt2.layer_count` +- `gpt2.block_count` - `gpt2.attention.head_count` - `gpt2.attention.layer_norm_epsilon` @@ -363,8 +363,8 @@ The following sections describe the metadata for each model architecture. 
Each k - `bloom.context_length` - `bloom.embedding_length` -- `bloom.layer_count` -- `bloom.feedforward_length` +- `bloom.block_count` +- `bloom.feed_forward_length` - `bloom.attention.head_count` - `bloom.attention.layer_norm_epsilon` @@ -372,7 +372,7 @@ The following sections describe the metadata for each model architecture. Each k - `falcon.context_length` - `falcon.embedding_length` -- `falcon.layer_count` +- `falcon.block_count` - `falcon.attention.head_count` - `falcon.attention.head_count_kv` - `falcon.attention.use_norm` @@ -410,9 +410,9 @@ The vocabulary size is the same as the number of rows in the `head` matrix. - `rwkv.architecture_version: uint32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future. - `rwkv.context_length: uint64`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer. -- `rwkv.layer_count: uint64` +- `rwkv.block_count: uint64` - `rwkv.embedding_length: uint64` -- `rwkv.feedforward_length: uint64` +- `rwkv.feed_forward_length: uint64` ##### Whisper @@ -422,13 +422,13 @@ This is because they are both transformer models. - `whisper.encoder.context_length` - `whisper.encoder.embedding_length` -- `whisper.encoder.layer_count` +- `whisper.encoder.block_count` - `whisper.encoder.mels_count: uint64` - `whisper.encoder.attention.head_count` - `whisper.decoder.context_length` - `whisper.decoder.embedding_length` -- `whisper.decoder.layer_count` +- `whisper.decoder.block_count` - `whisper.decoder.attention.head_count` #### Prompting From 78faa7b1c25b540bd90d3a5cb1348954d28e0a16 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 1 Nov 2023 00:13:10 +0100 Subject: [PATCH 25/27] docs(gguf): v3 + other fixes --- docs/gguf.md | 58 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/docs/gguf.md b/docs/gguf.md index 34a9d12ae..9140cf35f 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -20,12 +20,12 @@ The key difference between GGJT and GGUF is the use of a key-value structure for ### File Structure -GGUF files are structured as follows. They use a global alignment specified in the `general.alignment` metadata field. Where required, the file is padded with `0x00` bytes to the next multiple of `general.alignment`. +GGUF files are structured as follows. They use a global alignment specified in the `general.alignment` metadata field, referred to as `ALIGNMENT` below. Where required, the file is padded with `0x00` bytes to the next multiple of `general.alignment`. Fields, including arrays, are written sequentially without alignment unless otherwise specified. ```c -enum ggml_type { +enum ggml_type: uint32_t { GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, @@ -136,7 +136,7 @@ struct gguf_header_t { // Consider being *very* explicit about the byte order here. uint32_t magic; // The version of the format implemented. - // Must be `2` for version described in this spec. + // Must be `3` for version described in this spec, which introduces big-endian support. // // This version should only be increased for structural changes to the format. 
// Changes that do not affect the structure of the file should instead update the metadata @@ -152,6 +152,10 @@ struct gguf_header_t { gguf_metadata_kv_t metadata_kv[metadata_kv_count]; }; +uint64_t align_offset(uint64_t offset) { + return offset + (ALIGNMENT - (offset % ALIGNMENT)) % ALIGNMENT; +} + struct gguf_tensor_info_t { // The name of the tensor. It is a standard GGUF string, with the caveat that // it must be at most 64 bytes long. @@ -164,11 +168,13 @@ struct gguf_tensor_info_t { // The type of the tensor. ggml_type type; // The offset of the tensor's data in this file in bytes. + // // This offset is relative to `tensor_data`, not to the start // of the file, to make it easier for writers to write the file. // Readers should consider exposing this offset relative to the // file to make it easier to read the data. - // Must be a multiple of `ALIGNMENT`. + // + // Must be a multiple of `ALIGNMENT`. That is, `align_offset(offset) == offset`. uint64_t offset; }; @@ -180,7 +186,13 @@ struct gguf_file_t { gguf_tensor_info_t tensor_infos[header.tensor_count]; // Padding to the nearest multiple of `ALIGNMENT`. - uint8_t _padding[ALIGNMENT - (sizeof(header + tensor_infos) % ALIGNMENT)]; + // + // That is, if `sizeof(header) + sizeof(tensor_infos)` is not a multiple of `ALIGNMENT`, + // this padding is added to make it so. + // + // This can be calculated as `align_offset(position) - position`, where `position` is + // the position of the end of `tensor_infos` (i.e. `sizeof(header) + sizeof(tensor_infos)`). + uint8_t _padding[]; // Tensor data. // @@ -285,8 +297,22 @@ In the following, `[llm]` is used to fill in for the name of a specific LLM arch - `[llm].rope.dimension_count: uint64`: The number of rotary dimensions for RoPE. - `[llm].rope.freq_base: float32`: The base frequency for RoPE. + +##### Scaling + +The following keys describe RoPE scaling parameters: + +- `[llm].rope.scaling.type: string`: Can be `none`, `linear`, or `yarn`. +- `[llm].rope.scaling.factor: float32`: A scale factor for RoPE to adjust the context length. +- `[llm].rope.scaling.original_context_length: uint32_t`: The original context length of the base model. +- `[llm].rope.scaling.finetuned: bool`: True if model has been finetuned with RoPE scaling. + +Note that older models may not have these keys, and may instead use the following key: + - `[llm].rope.scale_linear: float32`: A linear scale factor for RoPE to adjust the context length. +It is recommended that models use the newer keys if possible, as they are more flexible and allow for more complex scaling schemes. Executors will need to support both indefinitely. + #### Models The following sections describe the metadata for each model architecture. Each key specified _must_ be present. @@ -382,7 +408,7 @@ The following sections describe the metadata for each model architecture. Each k - `falcon.tensor_data_layout`: - - `jploski` (author of the original GGML implementation of Falcon; this may change in the future): + - `jploski` (author of the original GGML implementation of Falcon): ```python # The original query_key_value tensor contains n_head_kv "kv groups", @@ -461,7 +487,7 @@ It is not guaranteed to be standardized across models, and may change in the fut - `rwkv`: RWKV tokenizer - `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model. - `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. 
If present, it must have the same length and index as `tokens`. -- `tokenizer.ggml.token_type: array[uint32]`: The token type (1=normal, 2=unknown, 3=control, 4=user defined, 5=unused, 6=byte). If present, it must have the same length and index as `tokens`. +- `tokenizer.ggml.token_type: array[int32]`: The token type (1=normal, 2=unknown, 3=control, 4=user defined, 5=unused, 6=byte). If present, it must have the same length and index as `tokens`. - `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic. - `tokenizer.ggml.added_tokens: array[string]`: If present, tokens that were added after training. @@ -525,7 +551,23 @@ where N signifies the block number a layer belongs to, and where `BB` could be: - `ffn_gate`: Feed-forward network "gate" layer - `ffn_down`: Feed-forward network "down" layer ---- +## Version History + +This document is actively updated to describe the current state of the metadata, and these changes are not tracked outside of the commits. + +However, the format _itself_ has changed. The following sections describe the changes to the format itself. + +### v3 + +Adds big-endian support. + +### v2 + +Most countable values (lengths, etc) were changed from `uint32` to `uint64` to allow for larger models to be supported in the future. + +### v1 + +Initial version. ## Historical State of Affairs From 0da010db3d472a66ef42fd09904e98184e95cdc8 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 1 Nov 2023 00:14:40 +0100 Subject: [PATCH 26/27] fix(editorconfig): use 2-space tabs for markdown --- .editorconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.editorconfig b/.editorconfig index 135a7e4bc..6f987c04b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -12,6 +12,9 @@ charset = utf-8 indent_style = space indent_size = 4 +[*.md] +indent_size = 2 + [Makefile] indent_style = tab From ad9598866fca248d14674b8e92ee32d69723aa59 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 1 Nov 2023 00:24:48 +0100 Subject: [PATCH 27/27] docs(gguf): clarify big-endian --- docs/gguf.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/gguf.md b/docs/gguf.md index 9140cf35f..75610a45f 100644 --- a/docs/gguf.md +++ b/docs/gguf.md @@ -24,6 +24,8 @@ GGUF files are structured as follows. They use a global alignment specified in t Fields, including arrays, are written sequentially without alignment unless otherwise specified. +Models are little-endian by default. They can also come in big-endian for use with big-endian computers; in this case, all values (including metadata values and tensors) will also be big-endian. At the time of writing, there is no way to determine if a model is big-endian; this may be rectified in future versions. If no additional information is provided, assume the model is little-endian. + ```c enum ggml_type: uint32_t { GGML_TYPE_F32 = 0,