From 3394b827ba7733b041f543e98fa46700a67ebaf5 Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 18:47:36 -0400 Subject: [PATCH 1/9] add torch_dtype and attn_implementation for model loading step --- llmtune/finetune/lora.py | 2 ++ llmtune/pydantic_models/config_model.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/llmtune/finetune/lora.py b/llmtune/finetune/lora.py index e74667c..bd49e9e 100644 --- a/llmtune/finetune/lora.py +++ b/llmtune/finetune/lora.py @@ -74,6 +74,8 @@ def _get_model(self): ), use_cache=False, device_map=self.device_map, + torch_dtype=self._model_config.torch_dtype, + attn_implementation=self._model_config.attn_implementation, ) model.config.pretraining_tp = 1 diff --git a/llmtune/pydantic_models/config_model.py b/llmtune/pydantic_models/config_model.py index 1b86a59..21bfd6f 100644 --- a/llmtune/pydantic_models/config_model.py +++ b/llmtune/pydantic_models/config_model.py @@ -77,7 +77,13 @@ class ModelConfig(BaseModel): description="Path to the model (huggingface repo or local path)", ) device_map: Optional[str] = Field("auto", description="device onto which to load the model") + torch_dtype: Optional[str] = Field("auto", description="torch dtype to use for model weights") + attn_implementation: Optional[str] = Field( + None, + description="set desired attention implementation; leave None for default. E.g. `flash_attention_2` (please ensure `torch_dtype` is either float16 or bfloat16).", + ) + # Quantization Config quantize: Optional[bool] = Field(False, description="Flag to enable quantization") bitsandbytes: BitsAndBytesConfig = Field(None, description="Bits and Bytes configuration") @@ -99,6 +105,16 @@ def set_device_map_to_none(cls, v, values, **kwargs): return None return v + @validator("torch_dtype", pre=True, allow_reuse=True) + def convert_str_to_torch_dtype(cls, v): + try: + # Attempt to retrieve the corresponding PyTorch data type + torch_dtype = getattr(torch, v) + except AttributeError: + # Handle the case where the string does not match any PyTorch data type + raise ValueError(f"{v} is not a valid torch data type") + return torch_dtype + class LoraConfig(BaseModel): r: Optional[int] = Field(8, description="Lora rank") From 557719f86869b200c49f69d233915d8d41d84587 Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 18:54:17 -0400 Subject: [PATCH 2/9] added README instruction --- README.md | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3cd36e1..0c3b7b9 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,21 @@ LLM Finetuning toolkit is a config-based CLI tool for launching a series of LLM

## Installation
+
### pipx (recommended)
+
pipx installs the package and dependencies in a separate virtual environment.
+
```shell
pipx install llm-toolkit
```

### pip
+
```shell
pip install llm-toolkit
```

-
## Quick Start

This guide contains 3 stages that will enable you to get the most out of this toolkit!

@@ -45,6 +48,30 @@ This command initiates the fine-tuning process using the settings specified in t

The configuration file is the central piece that defines the behavior of the toolkit. It is written in YAML format and consists of several sections that control different aspects of the process, such as data ingestion, model definition, training, inference, and quality assurance. We highlight some of the critical sections.

+#### Flash Attention 2
+
+To enable Flash Attention 2 for [supported models](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2), first install `flash-attn`:
+
+**pipx**
+
+```shell
+pipx inject llm-toolkit flash-attn --pip-args=--no-build-isolation
+```
+
+**pip**
+
+```shell
+pip install flash-attn --no-build-isolation
+```
+
+Then, add the following to your config file:
+
+```yaml
+model:
+  torch_dtype: "bfloat16" # or "float16" if using an older GPU
+  attn_implementation: "flash_attention_2"
+```
+
#### Data Ingestion

An example of what the data ingestion may look like:

@@ -247,6 +274,7 @@ NOTE: Be sure to merge the latest from "upstream" before making a pull request!
# GPU
docker run -it --gpus all llm-toolkit
```
+

@@ -257,6 +285,7 @@ See poetry documentation page for poetry [installation instructions](https://pyt ```shell poetry install ``` +
pip @@ -265,11 +294,10 @@ We recommend using a virtual environment like `venv` or `conda` for installation ```shell pip install -e . ``` +
- - ### Checklist Before Pull Request (Optional) 1. Use `ruff check --fix` to check and fix lint errors @@ -277,15 +305,12 @@ We recommend using a virtual environment like `venv` or `conda` for installation NOTE: Ruff linting and formatting checks are done when PR is raised via Git Action. Before raising a PR, it is a good practice to check and fix lint errors, as well as apply formatting. - ### Releasing - -To manually release a PyPI package, please run: +To manually release a PyPI package, please run: ```shell make build-release ``` Note: Make sure you have pypi token for this [PyPI repo](https://pypi.org/project/llm-toolkit/). - From bc830653daf8171dae12d0d69f45fbb841d1c38b Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 19:09:09 -0400 Subject: [PATCH 3/9] rename infer_test_set for greater clarity --- llmtune/cli/toolkit.py | 2 +- llmtune/inference/generics.py | 2 +- llmtune/inference/lora.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmtune/cli/toolkit.py b/llmtune/cli/toolkit.py index 3be899a..cef55db 100644 --- a/llmtune/cli/toolkit.py +++ b/llmtune/cli/toolkit.py @@ -68,7 +68,7 @@ def run_one_experiment(config: Config, config_path: str) -> None: results_file_path = join(dir_helper.save_paths.results, "results.csv") if not exists(results_path) or exists(results_file_path): inference_runner = LoRAInference(test, test_column, config, dir_helper) - inference_runner.infer_all() + inference_runner.infer_test_set() RichUI.after_inference(results_path) else: RichUI.inference_found(results_path) diff --git a/llmtune/inference/generics.py b/llmtune/inference/generics.py index 24a2bfb..b42db50 100644 --- a/llmtune/inference/generics.py +++ b/llmtune/inference/generics.py @@ -7,5 +7,5 @@ def infer_one(self, prompt: str): pass @abstractmethod - def infer_all(self): + def infer_test_set(self): pass diff --git a/llmtune/inference/lora.py b/llmtune/inference/lora.py index 720822c..68d812c 100644 --- a/llmtune/inference/lora.py +++ b/llmtune/inference/lora.py @@ -64,7 +64,7 @@ def _get_merged_model(self, weights_path: str): return model, tokenizer - def infer_all(self): + def infer_test_set(self): results = [] prompts = self.test_dataset["formatted_prompt"] labels = self.test_dataset[self.label_column] From 075c1459a756f9c48462755bf8f1c3c24e13f514 Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 19:10:10 -0400 Subject: [PATCH 4/9] enrich inference config parameters --- llmtune/pydantic_models/config_model.py | 170 +++++++++++++++++++----- 1 file changed, 140 insertions(+), 30 deletions(-) diff --git a/llmtune/pydantic_models/config_model.py b/llmtune/pydantic_models/config_model.py index 21bfd6f..da31359 100644 --- a/llmtune/pydantic_models/config_model.py +++ b/llmtune/pydantic_models/config_model.py @@ -9,13 +9,21 @@ class QaConfig(BaseModel): - llm_tests: Optional[List[str]] = Field([], description="list of tests that needs to be connected") + llm_tests: Optional[List[str]] = Field( + [], description="list of tests that needs to be connected" + ) class DataConfig(BaseModel): - file_type: Literal["json", "jsonl", "csv", "huggingface"] = Field(None, description="File type") - path: Union[FilePath, HfModelPath] = Field(None, description="Path to the file or HuggingFace model") - prompt: str = Field(None, description="Prompt for the model. 
Use {} brackets for column name") + file_type: Literal["json", "jsonl", "csv", "huggingface"] = Field( + None, description="File type" + ) + path: Union[FilePath, HfModelPath] = Field( + None, description="Path to the file or HuggingFace model" + ) + prompt: str = Field( + None, description="Prompt for the model. Use {} brackets for column name" + ) prompt_stub: str = Field( None, description="Stub for the prompt; this is injected during training. Use {} brackets for column name", @@ -42,7 +50,9 @@ class DataConfig(BaseModel): class BitsAndBytesConfig(BaseModel): - load_in_8bit: Optional[bool] = Field(False, description="Enable 8-bit quantization with LLM.int8()") + load_in_8bit: Optional[bool] = Field( + False, description="Enable 8-bit quantization with LLM.int8()" + ) llm_int8_threshold: Optional[float] = Field( 6.0, description="Outlier threshold for outlier detection in 8-bit quantization" ) @@ -53,7 +63,9 @@ class BitsAndBytesConfig(BaseModel): False, description="Enable splitting model parts between int8 on GPU and fp32 on CPU", ) - llm_int8_has_fp16_weight: Optional[bool] = Field(False, description="Run LLM.int8() with 16-bit main weights") + llm_int8_has_fp16_weight: Optional[bool] = Field( + False, description="Run LLM.int8() with 16-bit main weights" + ) load_in_4bit: Optional[bool] = Field( True, @@ -76,8 +88,12 @@ class ModelConfig(BaseModel): "NousResearch/Llama-2-7b-hf", description="Path to the model (huggingface repo or local path)", ) - device_map: Optional[str] = Field("auto", description="device onto which to load the model") - torch_dtype: Optional[str] = Field("auto", description="torch dtype to use for model weights") + device_map: Optional[str] = Field( + "auto", description="device onto which to load the model" + ) + torch_dtype: Optional[str] = Field( + "auto", description="torch dtype to use for model weights" + ) attn_implementation: Optional[str] = Field( None, description="set desired attention implementation; leave None for default. E.g. `flash_attention_2` (please ensure `torch_dtype` is either float16 or bfloat16).", @@ -85,7 +101,9 @@ class ModelConfig(BaseModel): # Quantization Config quantize: Optional[bool] = Field(False, description="Flag to enable quantization") - bitsandbytes: BitsAndBytesConfig = Field(None, description="Bits and Bytes configuration") + bitsandbytes: BitsAndBytesConfig = Field( + None, description="Bits and Bytes configuration" + ) # @validator("hf_model_ckpt") # def validate_model(cls, v, **kwargs): @@ -118,12 +136,22 @@ def convert_str_to_torch_dtype(cls, v): class LoraConfig(BaseModel): r: Optional[int] = Field(8, description="Lora rank") - task_type: Optional[str] = Field("CAUSAL_LM", description="Base Model task type during training") + task_type: Optional[str] = Field( + "CAUSAL_LM", description="Base Model task type during training" + ) - lora_alpha: Optional[int] = Field(16, description="The alpha parameter for Lora scaling") - bias: Optional[str] = Field("none", description="Bias type for Lora. Can be 'none', 'all' or 'lora_only'") - lora_dropout: Optional[float] = Field(0.1, description="The dropout probability for Lora layers") - target_modules: Optional[List[str]] = Field(None, description="The names of the modules to apply Lora to") + lora_alpha: Optional[int] = Field( + 16, description="The alpha parameter for Lora scaling" + ) + bias: Optional[str] = Field( + "none", description="Bias type for Lora. 
Can be 'none', 'all' or 'lora_only'" + ) + lora_dropout: Optional[float] = Field( + 0.1, description="The dropout probability for Lora layers" + ) + target_modules: Optional[List[str]] = Field( + None, description="The names of the modules to apply Lora to" + ) fan_in_fan_out: Optional[bool] = Field( False, description="Flag to indicate if the layer to replace stores weight like (fan_in, fan_out)", @@ -132,7 +160,9 @@ class LoraConfig(BaseModel): None, description="List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint", ) - layers_to_transform: Optional[Union[List[int], int]] = Field(None, description="The layer indexes to transform") + layers_to_transform: Optional[Union[List[int], int]] = Field( + None, description="The layer indexes to transform" + ) layers_pattern: Optional[str] = Field(None, description="The layer pattern name") # rank_pattern: Optional[Dict[str, int]] = Field( # {}, description="The mapping from layer names or regexp expression to ranks" @@ -142,12 +172,17 @@ class LoraConfig(BaseModel): # ) -# TODO: Get comprehensive Args! class TrainingArgs(BaseModel): num_train_epochs: Optional[int] = Field(1, description="Number of training epochs") - per_device_train_batch_size: Optional[int] = Field(1, description="Batch size per training device") - gradient_accumulation_steps: Optional[int] = Field(1, description="Number of steps for gradient accumulation") - gradient_checkpointing: Optional[bool] = Field(True, description="Flag to enable gradient checkpointing") + per_device_train_batch_size: Optional[int] = Field( + 1, description="Batch size per training device" + ) + gradient_accumulation_steps: Optional[int] = Field( + 1, description="Number of steps for gradient accumulation" + ) + gradient_checkpointing: Optional[bool] = Field( + True, description="Flag to enable gradient checkpointing" + ) optim: Optional[str] = Field("paged_adamw_32bit", description="Optimizer") logging_steps: Optional[int] = Field(100, description="Number of logging steps") learning_rate: Optional[float] = Field(2.0e-4, description="Learning rate") @@ -156,10 +191,15 @@ class TrainingArgs(BaseModel): fp16: Optional[bool] = Field(False, description="Flag to enable fp16") max_grad_norm: Optional[float] = Field(0.3, description="Maximum gradient norm") warmup_ratio: Optional[float] = Field(0.03, description="Warmup ratio") - lr_scheduler_type: Optional[str] = Field("constant", description="Learning rate scheduler type") + lr_scheduler_type: Optional[str] = Field( + "constant", description="Learning rate scheduler type" + ) + save_steps: Optional[Union[int, float]] = Field( + 500, + description="Number of updates steps before checkpoint saves. Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps.", + ) -# TODO: Get comprehensive Args! class SftArgs(BaseModel): max_seq_length: Optional[int] = Field(None, description="Maximum sequence length") neftune_noise_alpha: Optional[float] = Field( @@ -173,16 +213,86 @@ class TrainingConfig(BaseModel): sft_args: SftArgs -# TODO: Get comprehensive Args! 
class InferenceConfig(BaseModel): - max_new_tokens: Optional[int] = Field(None, description="Maximum new tokens") - use_cache: Optional[bool] = Field(True, description="Flag to enable cache usage") - do_sample: Optional[bool] = Field(True, description="Flag to enable sampling") - top_p: Optional[float] = Field(1.0, description="Top p value") - temperature: Optional[float] = Field(0.1, description="Temperature value") - epsilon_cutoff: Optional[float] = Field(0.0, description="epsilon cutoff value") - eta_cutoff: Optional[float] = Field(0.0, description="eta cutoff value") - top_k: Optional[int] = Field(50, description="top-k sampling") + # Length + max_length: Optional[int] = Field( + None, description="The maximum length the generated tokens can have." + ) + max_new_tokens: Optional[int] = Field( + None, description="The maximum numbers of tokens to generate." + ) + min_length: Optional[int] = Field( + 0, description="The minimum length of the sequence to be generated." + ) + min_new_tokens: Optional[int] = Field( + None, description="The minimum numbers of tokens to generate." + ) + early_stopping: Optional[Union[bool, str]] = Field( + False, description="Controls the stopping condition for beam search." + ) + max_time: Optional[float] = Field( + None, description="The maximum amount of time for the computation in seconds." + ) + + # Generation Strategy + do_sample: Optional[bool] = Field( + False, description="Whether or not to use sampling." + ) + num_beams: Optional[int] = Field(1, description="Number of beams for beam search.") + num_beam_groups: Optional[int] = Field( + 1, description="Number of groups for diversity among beams." + ) + penalty_alpha: Optional[float] = Field( + None, description="Balances model confidence and degeneration penalty." + ) + use_cache: Optional[bool] = Field( + True, + description="Whether to use past key/values attentions to speed up decoding.", + ) + + # Manipulation of Model Output Logits + temperature: Optional[float] = Field( + 1.0, description="Modulates the next token probabilities." + ) + top_k: Optional[int] = Field( + 50, + description="Number of highest probability tokens to keep for top-k-filtering.", + ) + top_p: Optional[float] = Field( + 1.0, + description="Keeps the smallest set of most probable tokens summing up to top_p.", + ) + typical_p: Optional[float] = Field(1.0, description="Local typicality measure.") + epsilon_cutoff: Optional[float] = Field( + 0.0, description="Minimum conditional probability for token sampling." + ) + eta_cutoff: Optional[float] = Field( + 0.0, description="Hybrid of locally typical sampling and epsilon sampling." + ) + diversity_penalty: Optional[float] = Field( + 0.0, description="Penalty for token repetition across different beam groups." + ) + repetition_penalty: Optional[float] = Field( + 1.0, description="Penalty for token repetition." + ) + encoder_repetition_penalty: Optional[float] = Field( + 1.0, description="Penalty on sequences not in the original input." + ) + length_penalty: Optional[float] = Field( + 1.0, description="Exponential penalty to the length for beam search." + ) + no_repeat_ngram_size: Optional[int] = Field( + 0, description="Size of ngrams that cannot occur more than once." + ) + bad_words_ids: Optional[List[List[int]]] = Field( + None, description="Tokens that are not allowed to be generated." + ) + force_words_ids: Optional[List[Union[List[int], List[List[int]]]]] = Field( + None, description="Tokens that must be generated." 
+ ) + renormalize_logits: Optional[bool] = Field( + False, description="Whether to renormalize logits after all processors." + ) class AblationConfig(BaseModel): From 050e9688383267ea46832e1f9d565ba15691f89a Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 19:12:41 -0400 Subject: [PATCH 5/9] ruff formatting --- llmtune/pydantic_models/config_model.py | 136 ++++++------------------ 1 file changed, 34 insertions(+), 102 deletions(-) diff --git a/llmtune/pydantic_models/config_model.py b/llmtune/pydantic_models/config_model.py index da31359..3f848aa 100644 --- a/llmtune/pydantic_models/config_model.py +++ b/llmtune/pydantic_models/config_model.py @@ -9,21 +9,13 @@ class QaConfig(BaseModel): - llm_tests: Optional[List[str]] = Field( - [], description="list of tests that needs to be connected" - ) + llm_tests: Optional[List[str]] = Field([], description="list of tests that needs to be connected") class DataConfig(BaseModel): - file_type: Literal["json", "jsonl", "csv", "huggingface"] = Field( - None, description="File type" - ) - path: Union[FilePath, HfModelPath] = Field( - None, description="Path to the file or HuggingFace model" - ) - prompt: str = Field( - None, description="Prompt for the model. Use {} brackets for column name" - ) + file_type: Literal["json", "jsonl", "csv", "huggingface"] = Field(None, description="File type") + path: Union[FilePath, HfModelPath] = Field(None, description="Path to the file or HuggingFace model") + prompt: str = Field(None, description="Prompt for the model. Use {} brackets for column name") prompt_stub: str = Field( None, description="Stub for the prompt; this is injected during training. Use {} brackets for column name", @@ -50,9 +42,7 @@ class DataConfig(BaseModel): class BitsAndBytesConfig(BaseModel): - load_in_8bit: Optional[bool] = Field( - False, description="Enable 8-bit quantization with LLM.int8()" - ) + load_in_8bit: Optional[bool] = Field(False, description="Enable 8-bit quantization with LLM.int8()") llm_int8_threshold: Optional[float] = Field( 6.0, description="Outlier threshold for outlier detection in 8-bit quantization" ) @@ -63,9 +53,7 @@ class BitsAndBytesConfig(BaseModel): False, description="Enable splitting model parts between int8 on GPU and fp32 on CPU", ) - llm_int8_has_fp16_weight: Optional[bool] = Field( - False, description="Run LLM.int8() with 16-bit main weights" - ) + llm_int8_has_fp16_weight: Optional[bool] = Field(False, description="Run LLM.int8() with 16-bit main weights") load_in_4bit: Optional[bool] = Field( True, @@ -88,12 +76,8 @@ class ModelConfig(BaseModel): "NousResearch/Llama-2-7b-hf", description="Path to the model (huggingface repo or local path)", ) - device_map: Optional[str] = Field( - "auto", description="device onto which to load the model" - ) - torch_dtype: Optional[str] = Field( - "auto", description="torch dtype to use for model weights" - ) + device_map: Optional[str] = Field("auto", description="device onto which to load the model") + torch_dtype: Optional[str] = Field("auto", description="torch dtype to use for model weights") attn_implementation: Optional[str] = Field( None, description="set desired attention implementation; leave None for default. E.g. 
`flash_attention_2` (please ensure `torch_dtype` is either float16 or bfloat16).", @@ -101,9 +85,7 @@ class ModelConfig(BaseModel): # Quantization Config quantize: Optional[bool] = Field(False, description="Flag to enable quantization") - bitsandbytes: BitsAndBytesConfig = Field( - None, description="Bits and Bytes configuration" - ) + bitsandbytes: BitsAndBytesConfig = Field(None, description="Bits and Bytes configuration") # @validator("hf_model_ckpt") # def validate_model(cls, v, **kwargs): @@ -136,22 +118,12 @@ def convert_str_to_torch_dtype(cls, v): class LoraConfig(BaseModel): r: Optional[int] = Field(8, description="Lora rank") - task_type: Optional[str] = Field( - "CAUSAL_LM", description="Base Model task type during training" - ) + task_type: Optional[str] = Field("CAUSAL_LM", description="Base Model task type during training") - lora_alpha: Optional[int] = Field( - 16, description="The alpha parameter for Lora scaling" - ) - bias: Optional[str] = Field( - "none", description="Bias type for Lora. Can be 'none', 'all' or 'lora_only'" - ) - lora_dropout: Optional[float] = Field( - 0.1, description="The dropout probability for Lora layers" - ) - target_modules: Optional[List[str]] = Field( - None, description="The names of the modules to apply Lora to" - ) + lora_alpha: Optional[int] = Field(16, description="The alpha parameter for Lora scaling") + bias: Optional[str] = Field("none", description="Bias type for Lora. Can be 'none', 'all' or 'lora_only'") + lora_dropout: Optional[float] = Field(0.1, description="The dropout probability for Lora layers") + target_modules: Optional[List[str]] = Field(None, description="The names of the modules to apply Lora to") fan_in_fan_out: Optional[bool] = Field( False, description="Flag to indicate if the layer to replace stores weight like (fan_in, fan_out)", @@ -160,9 +132,7 @@ class LoraConfig(BaseModel): None, description="List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint", ) - layers_to_transform: Optional[Union[List[int], int]] = Field( - None, description="The layer indexes to transform" - ) + layers_to_transform: Optional[Union[List[int], int]] = Field(None, description="The layer indexes to transform") layers_pattern: Optional[str] = Field(None, description="The layer pattern name") # rank_pattern: Optional[Dict[str, int]] = Field( # {}, description="The mapping from layer names or regexp expression to ranks" @@ -174,15 +144,9 @@ class LoraConfig(BaseModel): class TrainingArgs(BaseModel): num_train_epochs: Optional[int] = Field(1, description="Number of training epochs") - per_device_train_batch_size: Optional[int] = Field( - 1, description="Batch size per training device" - ) - gradient_accumulation_steps: Optional[int] = Field( - 1, description="Number of steps for gradient accumulation" - ) - gradient_checkpointing: Optional[bool] = Field( - True, description="Flag to enable gradient checkpointing" - ) + per_device_train_batch_size: Optional[int] = Field(1, description="Batch size per training device") + gradient_accumulation_steps: Optional[int] = Field(1, description="Number of steps for gradient accumulation") + gradient_checkpointing: Optional[bool] = Field(True, description="Flag to enable gradient checkpointing") optim: Optional[str] = Field("paged_adamw_32bit", description="Optimizer") logging_steps: Optional[int] = Field(100, description="Number of logging steps") learning_rate: Optional[float] = Field(2.0e-4, description="Learning rate") @@ -191,9 +155,7 @@ class 
TrainingArgs(BaseModel): fp16: Optional[bool] = Field(False, description="Flag to enable fp16") max_grad_norm: Optional[float] = Field(0.3, description="Maximum gradient norm") warmup_ratio: Optional[float] = Field(0.03, description="Warmup ratio") - lr_scheduler_type: Optional[str] = Field( - "constant", description="Learning rate scheduler type" - ) + lr_scheduler_type: Optional[str] = Field("constant", description="Learning rate scheduler type") save_steps: Optional[Union[int, float]] = Field( 500, description="Number of updates steps before checkpoint saves. Should be an integer or a float in range [0,1). If smaller than 1, will be interpreted as ratio of total training steps.", @@ -215,45 +177,27 @@ class TrainingConfig(BaseModel): class InferenceConfig(BaseModel): # Length - max_length: Optional[int] = Field( - None, description="The maximum length the generated tokens can have." - ) - max_new_tokens: Optional[int] = Field( - None, description="The maximum numbers of tokens to generate." - ) - min_length: Optional[int] = Field( - 0, description="The minimum length of the sequence to be generated." - ) - min_new_tokens: Optional[int] = Field( - None, description="The minimum numbers of tokens to generate." - ) + max_length: Optional[int] = Field(None, description="The maximum length the generated tokens can have.") + max_new_tokens: Optional[int] = Field(None, description="The maximum numbers of tokens to generate.") + min_length: Optional[int] = Field(0, description="The minimum length of the sequence to be generated.") + min_new_tokens: Optional[int] = Field(None, description="The minimum numbers of tokens to generate.") early_stopping: Optional[Union[bool, str]] = Field( False, description="Controls the stopping condition for beam search." ) - max_time: Optional[float] = Field( - None, description="The maximum amount of time for the computation in seconds." - ) + max_time: Optional[float] = Field(None, description="The maximum amount of time for the computation in seconds.") # Generation Strategy - do_sample: Optional[bool] = Field( - False, description="Whether or not to use sampling." - ) + do_sample: Optional[bool] = Field(False, description="Whether or not to use sampling.") num_beams: Optional[int] = Field(1, description="Number of beams for beam search.") - num_beam_groups: Optional[int] = Field( - 1, description="Number of groups for diversity among beams." - ) - penalty_alpha: Optional[float] = Field( - None, description="Balances model confidence and degeneration penalty." - ) + num_beam_groups: Optional[int] = Field(1, description="Number of groups for diversity among beams.") + penalty_alpha: Optional[float] = Field(None, description="Balances model confidence and degeneration penalty.") use_cache: Optional[bool] = Field( True, description="Whether to use past key/values attentions to speed up decoding.", ) # Manipulation of Model Output Logits - temperature: Optional[float] = Field( - 1.0, description="Modulates the next token probabilities." 
- ) + temperature: Optional[float] = Field(1.0, description="Modulates the next token probabilities.") top_k: Optional[int] = Field( 50, description="Number of highest probability tokens to keep for top-k-filtering.", @@ -263,30 +207,18 @@ class InferenceConfig(BaseModel): description="Keeps the smallest set of most probable tokens summing up to top_p.", ) typical_p: Optional[float] = Field(1.0, description="Local typicality measure.") - epsilon_cutoff: Optional[float] = Field( - 0.0, description="Minimum conditional probability for token sampling." - ) - eta_cutoff: Optional[float] = Field( - 0.0, description="Hybrid of locally typical sampling and epsilon sampling." - ) + epsilon_cutoff: Optional[float] = Field(0.0, description="Minimum conditional probability for token sampling.") + eta_cutoff: Optional[float] = Field(0.0, description="Hybrid of locally typical sampling and epsilon sampling.") diversity_penalty: Optional[float] = Field( 0.0, description="Penalty for token repetition across different beam groups." ) - repetition_penalty: Optional[float] = Field( - 1.0, description="Penalty for token repetition." - ) + repetition_penalty: Optional[float] = Field(1.0, description="Penalty for token repetition.") encoder_repetition_penalty: Optional[float] = Field( 1.0, description="Penalty on sequences not in the original input." ) - length_penalty: Optional[float] = Field( - 1.0, description="Exponential penalty to the length for beam search." - ) - no_repeat_ngram_size: Optional[int] = Field( - 0, description="Size of ngrams that cannot occur more than once." - ) - bad_words_ids: Optional[List[List[int]]] = Field( - None, description="Tokens that are not allowed to be generated." - ) + length_penalty: Optional[float] = Field(1.0, description="Exponential penalty to the length for beam search.") + no_repeat_ngram_size: Optional[int] = Field(0, description="Size of ngrams that cannot occur more than once.") + bad_words_ids: Optional[List[List[int]]] = Field(None, description="Tokens that are not allowed to be generated.") force_words_ids: Optional[List[Union[List[int], List[List[int]]]]] = Field( None, description="Tokens that must be generated." ) From b0e0f56127d86623cf46c243d53199f8d4c212db Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Mon, 8 Apr 2024 22:35:35 -0400 Subject: [PATCH 6/9] Revert "rename infer_test_set for greater clarity" This reverts commit bc830653daf8171dae12d0d69f45fbb841d1c38b. 
--- llmtune/cli/toolkit.py | 2 +- llmtune/inference/generics.py | 2 +- llmtune/inference/lora.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmtune/cli/toolkit.py b/llmtune/cli/toolkit.py index cef55db..3be899a 100644 --- a/llmtune/cli/toolkit.py +++ b/llmtune/cli/toolkit.py @@ -68,7 +68,7 @@ def run_one_experiment(config: Config, config_path: str) -> None: results_file_path = join(dir_helper.save_paths.results, "results.csv") if not exists(results_path) or exists(results_file_path): inference_runner = LoRAInference(test, test_column, config, dir_helper) - inference_runner.infer_test_set() + inference_runner.infer_all() RichUI.after_inference(results_path) else: RichUI.inference_found(results_path) diff --git a/llmtune/inference/generics.py b/llmtune/inference/generics.py index b42db50..24a2bfb 100644 --- a/llmtune/inference/generics.py +++ b/llmtune/inference/generics.py @@ -7,5 +7,5 @@ def infer_one(self, prompt: str): pass @abstractmethod - def infer_test_set(self): + def infer_all(self): pass diff --git a/llmtune/inference/lora.py b/llmtune/inference/lora.py index 68d812c..720822c 100644 --- a/llmtune/inference/lora.py +++ b/llmtune/inference/lora.py @@ -64,7 +64,7 @@ def _get_merged_model(self, weights_path: str): return model, tokenizer - def infer_test_set(self): + def infer_all(self): results = [] prompts = self.test_dataset["formatted_prompt"] labels = self.test_dataset[self.label_column] From 3837366d7176f0c77f1e828170d8cc675fe9ee2a Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Tue, 9 Apr 2024 00:41:52 -0400 Subject: [PATCH 7/9] use getter instead of validator to handle type casting in pydantic model --- llmtune/finetune/lora.py | 2 +- llmtune/inference/lora.py | 10 +++------- llmtune/pydantic_models/config_model.py | 16 +++++++++------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/llmtune/finetune/lora.py b/llmtune/finetune/lora.py index bd49e9e..a305c4d 100644 --- a/llmtune/finetune/lora.py +++ b/llmtune/finetune/lora.py @@ -74,7 +74,7 @@ def _get_model(self): ), use_cache=False, device_map=self.device_map, - torch_dtype=self._model_config.torch_dtype, + torch_dtype=self._model_config.casted_torch_dtype, attn_implementation=self._model_config.attn_implementation, ) diff --git a/llmtune/inference/lora.py b/llmtune/inference/lora.py index 720822c..1c5da78 100644 --- a/llmtune/inference/lora.py +++ b/llmtune/inference/lora.py @@ -40,17 +40,13 @@ def _get_merged_model(self, weights_path: str): torch.cuda.empty_cache() # Load from path - dtype = ( - torch.float16 - if self.config.training.training_args.fp16 - else (torch.bfloat16 if self.config.training.training_args.bf16 else torch.float32) - ) self.model = AutoPeftModelForCausalLM.from_pretrained( weights_path, - torch_dtype=dtype, - device_map=self.device_map, + torch_dtype=self.config.model.casted_torch_dtype, quantization_config=(BitsAndBytesConfig(**self.config.model.bitsandbytes.model_dump())), + device_map=self.device_map, + attn_implementation=self.config.model.attn_implementation, ) """TODO: figure out multi-gpu diff --git a/llmtune/pydantic_models/config_model.py b/llmtune/pydantic_models/config_model.py index 3f848aa..1095656 100644 --- a/llmtune/pydantic_models/config_model.py +++ b/llmtune/pydantic_models/config_model.py @@ -1,7 +1,7 @@ from typing import List, Literal, Optional, Union import torch -from pydantic import BaseModel, Field, FilePath, validator +from pydantic import BaseModel, ConfigDict, Field, FilePath, validator # TODO: Refactor this into multiple files... 
@@ -105,14 +105,16 @@ def set_device_map_to_none(cls, v, values, **kwargs): return None return v - @validator("torch_dtype", pre=True, allow_reuse=True) - def convert_str_to_torch_dtype(cls, v): + @property + def casted_torch_dtype(self) -> Union[str, torch.dtype]: + if self.torch_dtype == "auto": + return self.torch_dtype + try: - # Attempt to retrieve the corresponding PyTorch data type - torch_dtype = getattr(torch, v) + torch_dtype = getattr(torch, self.torch_dtype) except AttributeError: - # Handle the case where the string does not match any PyTorch data type - raise ValueError(f"{v} is not a valid torch data type") + raise ValueError(f"{self.torch_dtype} is not a valid torch data type") + return torch_dtype From a93855011bf2768b9dbe76f72c11bc49e36b1369 Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Tue, 9 Apr 2024 00:44:16 -0400 Subject: [PATCH 8/9] adding flash attention to example config --- config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config.yml b/config.yml index b85cab2..15ea2ce 100644 --- a/config.yml +++ b/config.yml @@ -24,6 +24,8 @@ data: # Model Definition ------------------- model: hf_model_ckpt: "NousResearch/Llama-2-7b-hf" + torch_dtype: "bfloat16" + attn_implementation: "flash_attention_2" quantize: true bitsandbytes: load_in_4bit: true @@ -80,4 +82,4 @@ qa: - verb_percent - adjective_percent - noun_percent - - summary_length \ No newline at end of file + - summary_length From 0f683f639cfb34ce2c3995f904e063e3fbb56cbb Mon Sep 17 00:00:00 2001 From: Benjamin Ye Date: Tue, 9 Apr 2024 00:49:54 -0400 Subject: [PATCH 9/9] fix lint --- llmtune/pydantic_models/config_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmtune/pydantic_models/config_model.py b/llmtune/pydantic_models/config_model.py index 1095656..e0f9976 100644 --- a/llmtune/pydantic_models/config_model.py +++ b/llmtune/pydantic_models/config_model.py @@ -1,7 +1,7 @@ from typing import List, Literal, Optional, Union import torch -from pydantic import BaseModel, ConfigDict, Field, FilePath, validator +from pydantic import BaseModel, Field, FilePath, validator # TODO: Refactor this into multiple files...
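
Taken together, these patches expose the model dtype, the attention implementation, and the full set of generation parameters through the YAML config. The sketch below shows how the new fields might be combined in a single config; the `inference:` section name and the specific values are illustrative assumptions derived from the `ModelConfig` and `InferenceConfig` models above, not a verbatim excerpt from the repository:

```yaml
# Hypothetical config excerpt combining the fields added in this patch series.
model:
  hf_model_ckpt: "NousResearch/Llama-2-7b-hf"
  torch_dtype: "bfloat16"                  # resolved to torch.bfloat16 via ModelConfig.casted_torch_dtype
  attn_implementation: "flash_attention_2" # requires flash-attn and a float16/bfloat16 dtype

inference:                                 # section name assumed to map onto InferenceConfig
  max_new_tokens: 256
  do_sample: true
  temperature: 0.7
  top_p: 0.9
  repetition_penalty: 1.1
```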