[SDK] Add docstring for Train API (kubeflow#2075)
* [SDK] Add docstring for Train API

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Update a few args

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

---------

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Signed-off-by: Weiyu Yen <ckyuto@gmail.com>
andreyvelich authored and ckyuto committed Apr 26, 2024
1 parent ca04ae6 commit f0e0f02
Showing 2 changed files with 78 additions and 17 deletions.
6 changes: 3 additions & 3 deletions sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -46,7 +46,7 @@ def __post_init__(self):
 
 
 @dataclass
-class HuggingFaceTrainParams:
+class HuggingFaceTrainerParams:
     training_parameters: transformers.TrainingArguments = field(
         default_factory=transformers.TrainingArguments
     )
@@ -77,7 +77,7 @@ def download_model_and_tokenizer(self):
 
 
 @dataclass
-class HfDatasetParams:
+class HuggingFaceDatasetParams:
     repo_id: str
     access_token: Optional[str] = None
     # TODO (andreyvelich): Discuss where we should specify dataset preprocess parameters.
@@ -91,7 +91,7 @@ def __post_init__(self):
 
 class HuggingFaceDataset(datasetProvider):
     def load_config(self, serialised_args):
-        self.config = HfDatasetParams(**json.loads(serialised_args))
+        self.config = HuggingFaceDatasetParams(**json.loads(serialised_args))
 
     def download_dataset(self):
         logger.info("Downloading dataset")
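For context, a minimal sketch of constructing the renamed parameter classes is shown below. The `lora_config` field is inferred from its use in `training_client.py` later in this commit, and the concrete argument values are illustrative assumptions, not part of this diff.

```python
# Hypothetical sketch: only `repo_id`/`access_token` and `training_parameters`
# are visible in the hunks above; `lora_config` is assumed from the
# `trainer_parameters.lora_config` usage in training_client.py below.
from peft import LoraConfig
from transformers import TrainingArguments

from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceDatasetParams,
    HuggingFaceTrainerParams,
)

# Dataset parameters for the Storage Initializer (assumed example dataset name).
dataset_params = HuggingFaceDatasetParams(
    repo_id="yelp_review_full",
)

# Trainer parameters: HuggingFace training arguments plus an assumed LoRA config.
trainer_params = HuggingFaceTrainerParams(
    training_parameters=TrainingArguments(
        output_dir="/tmp/output",
        num_train_epochs=1,
    ),
    lora_config=LoraConfig(r=8),
)
```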
89 changes: 75 additions & 14 deletions sdk/python/kubeflow/training/api/training_client.py
@@ -99,21 +99,74 @@ def train(
         namespace: Optional[str] = None,
         num_workers: int = 1,
         num_procs_per_worker: int = 1,
+        resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
+        model_provider_parameters=None,
+        dataset_provider_parameters=None,
+        trainer_parameters=None,
         storage_config: Dict[str, Optional[Union[str, List[str]]]] = {
             "size": constants.PVC_DEFAULT_SIZE,
             "storage_class": None,
             "access_modes": constants.PVC_DEFAULT_ACCESS_MODES,
         },
-        model_provider_parameters=None,
-        dataset_provider_parameters=None,
-        train_parameters=None,
-        resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
     ):
-        """
-        Higher level train api
-        model_provider_parameters: It can be of type HuggingFaceModelParams
-        dataset_provider_parameters: It can be of type HfDatasetParams or S3DatasetParams
-        train_parameters: It can be of type HuggingFaceTrainParams
+        """High level API to fine-tune LLMs with distributed PyTorchJob. Follow this guide
+        for more information about this feature: TODO (andreyvelich): Add link.
+
+        It uses the pre-created Storage Initializer to download pre-trained model and dataset, and
+        Trainer to fine-tune LLM. Your cluster should support PVC with ReadOnlyMany access mode
+        to distribute data across PyTorchJob workers.
+
+        It uses `torchrun` CLI to fine-tune model in distributed mode with multiple PyTorchJob
+        workers. Follow this guide to know more about `torchrun` CLI:
+        https://pytorch.org/docs/stable/elastic/run.html
+
+        This feature is in alpha stage and Kubeflow community is looking for your feedback.
+        Please use #kubeflow-training-operator Slack channel or Kubeflow Training Operator GitHub
+        for your questions or suggestions.
+
+        Args:
+            name: Name of the PyTorchJob.
+            namespace: Namespace for the PyTorchJob. By default namespace is taken from
+                `TrainingClient` object.
+            num_workers: Number of PyTorchJob workers.
+            num_procs_per_worker: Number of processes per PyTorchJob worker for `torchrun` CLI.
+                You can use this parameter if you want to use more than 1 GPU per PyTorchJob worker.
+            resources_per_worker: A parameter that lets you specify how much
+                resources each PyTorchJob worker container should have. You can either specify a
+                kubernetes.client.V1ResourceRequirements object (documented here:
+                https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
+                or a dictionary that includes one or more of the following keys:
+                `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
+                values for these keys are documented here:
+                https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
+                For example:
+                ```
+                {
+                    "cpu": "1",
+                    "memory": "2Gi",
+                    "gpu": "1",
+                }
+                ```
+                Please note, `gpu` specifies a resource request with a key of
+                `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type
+                of GPU, pass in a V1ResourceRequirement instance instead, since it's
+                more flexible. This parameter is optional and defaults to None.
+            model_provider_parameters: Parameters for the model provider in the Storage Initializer.
+                For example, HuggingFace model name and Transformer type for that model, like:
+                AutoModelForSequenceClassification. This argument must be the type of
+                `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`
+            dataset_provider_parameters: Parameters for the dataset provider in the
+                Storage Initializer. For example, name of the HuggingFace dataset or
+                AWS S3 configuration. This argument must be the type of
+                `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or
+                `kubeflow.storage_initializer.s3.S3DatasetParams`
+            trainer_parameters: Parameters for LLM Trainer that will fine-tune pre-trained model
+                with the given dataset. For example, LoRA config for parameter-efficient fine-tuning
+                and HuggingFace training arguments like optimizer or number of training epochs.
+                This argument must be the type of
+                `kubeflow.storage_initializer.HuggingFaceTrainerParams`
+            storage_config: Configuration for Storage Initializer PVC to download pre-trained model
+                and dataset. You can configure PVC size and storage class name in this argument.
+        """
         try:
             import peft
@@ -126,14 +179,20 @@ def train(
         from kubeflow.storage_initializer.s3 import S3DatasetParams
         from kubeflow.storage_initializer.hugging_face import (
             HuggingFaceModelParams,
-            HfDatasetParams,
+            HuggingFaceDatasetParams,
         )
 
+        print(
+            "Thank you for using `train` API for LLMs fine-tuning. This feature is in alpha stage "
+            "Kubeflow community is looking for your feedback. Please share your experience "
+            "via #kubeflow-training-operator Slack channel or Kubeflow Training Operator GitHub."
+        )
+
         if (
             not name
             or not model_provider_parameters
             or not dataset_provider_parameters
-            or not train_parameters
+            or not trainer_parameters
         ):
             raise ValueError("One of the required parameters is None")
 
@@ -172,7 +231,7 @@ def train(
 
         if isinstance(dataset_provider_parameters, S3DatasetParams):
             dp = "s3"
-        elif isinstance(dataset_provider_parameters, HfDatasetParams):
+        elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams):
             dp = "hf"
         else:
             raise ValueError(
@@ -210,9 +269,11 @@ def train(
                 "--dataset_dir",
                 VOLUME_PATH_DATASET,
                 "--lora_config",
-                json.dumps(train_parameters.lora_config.__dict__, cls=utils.SetEncoder),
+                json.dumps(
+                    trainer_parameters.lora_config.__dict__, cls=utils.SetEncoder
+                ),
                 "--training_parameters",
-                json.dumps(train_parameters.training_parameters.to_dict()),
+                json.dumps(trainer_parameters.training_parameters.to_dict()),
             ],
             volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT],
             resources=resources_per_worker,
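For context, a minimal usage sketch of the documented `train` API after this change is shown below. The `TrainingClient` import path and the `HuggingFaceModelParams` fields (`model_uri`, `transformer_type`) are assumptions based on the SDK layout and the docstring above; the resource and dataset values are illustrative only.

```python
# Hedged usage sketch for the `train` API described in the new docstring.
from peft import LoraConfig
from transformers import AutoModelForSequenceClassification, TrainingArguments

from kubeflow.training import TrainingClient  # assumed SDK import path
from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceModelParams,
    HuggingFaceDatasetParams,
    HuggingFaceTrainerParams,
)

client = TrainingClient()
client.train(
    name="fine-tune-llm",
    num_workers=2,
    num_procs_per_worker=1,
    # Dict form of resources_per_worker; `gpu` maps to nvidia.com/gpu per the docstring.
    resources_per_worker={"cpu": "4", "memory": "16Gi", "gpu": "1"},
    model_provider_parameters=HuggingFaceModelParams(
        model_uri="hf://google/bert_uncased_L-2_H-128_A-2",  # assumed field names and model
        transformer_type=AutoModelForSequenceClassification,
    ),
    dataset_provider_parameters=HuggingFaceDatasetParams(
        repo_id="yelp_review_full",  # assumed example dataset
    ),
    trainer_parameters=HuggingFaceTrainerParams(
        training_parameters=TrainingArguments(output_dir="/tmp/output"),
        lora_config=LoraConfig(r=8),
    ),
)
```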
