diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py
index 0d4d344aab..4b5b0794a9 100644
--- a/sdk/python/kubeflow/storage_initializer/hugging_face.py
+++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -46,7 +46,7 @@ def __post_init__(self):
 
 
 @dataclass
-class HuggingFaceTrainParams:
+class HuggingFaceTrainerParams:
     training_parameters: transformers.TrainingArguments = field(
         default_factory=transformers.TrainingArguments
     )
@@ -77,7 +77,7 @@ def download_model_and_tokenizer(self):
 
 
 @dataclass
-class HfDatasetParams:
+class HuggingFaceDatasetParams:
     repo_id: str
     access_token: Optional[str] = None
     # TODO (andreyvelich): Discuss where we should specify dataset preprocess parameters.
@@ -91,7 +91,7 @@ def __post_init__(self):
 
 class HuggingFaceDataset(datasetProvider):
     def load_config(self, serialised_args):
-        self.config = HfDatasetParams(**json.loads(serialised_args))
+        self.config = HuggingFaceDatasetParams(**json.loads(serialised_args))
 
     def download_dataset(self):
         logger.info("Downloading dataset")
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index 0dfe96772c..e6e263ef92 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -99,21 +99,74 @@ def train(
         namespace: Optional[str] = None,
         num_workers: int = 1,
         num_procs_per_worker: int = 1,
+        resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
+        model_provider_parameters=None,
+        dataset_provider_parameters=None,
+        trainer_parameters=None,
         storage_config: Dict[str, Optional[Union[str, List[str]]]] = {
             "size": constants.PVC_DEFAULT_SIZE,
             "storage_class": None,
             "access_modes": constants.PVC_DEFAULT_ACCESS_MODES,
         },
-        model_provider_parameters=None,
-        dataset_provider_parameters=None,
-        train_parameters=None,
-        resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
     ):
-        """
-        Higher level train api
-        model_provider_parameters: It can be of type HuggingFaceModelParams
-        dataset_provider_parameters: It can be of type HfDatasetParams or S3DatasetParams
-        train_parameters: It can be of type HuggingFaceTrainParams
+        """High-level API to fine-tune LLMs with distributed PyTorchJob. Follow this guide
+        for more information about this feature: TODO (andreyvelich): Add link.
+
+        It uses the pre-created Storage Initializer to download the pre-trained model and dataset,
+        and the Trainer to fine-tune the LLM. Your cluster should support PVC with ReadOnlyMany
+        access mode to distribute data across PyTorchJob workers.
+
+        It uses the `torchrun` CLI to fine-tune the model in distributed mode with multiple
+        PyTorchJob workers. Follow this guide to know more about the `torchrun` CLI:
+        https://pytorch.org/docs/stable/elastic/run.html
+
+        This feature is in the alpha stage, and the Kubeflow community is looking for your
+        feedback. Please use the #kubeflow-training-operator Slack channel or Kubeflow Training
+        Operator GitHub for your questions or suggestions.
+
+        Args:
+            name: Name of the PyTorchJob.
+            namespace: Namespace for the PyTorchJob. By default, namespace is taken from
+                `TrainingClient` object.
+            num_workers: Number of PyTorchJob workers.
+            num_procs_per_worker: Number of processes per PyTorchJob worker for `torchrun` CLI.
+                You can use this parameter if you want to use more than 1 GPU per PyTorchJob worker.
+            resources_per_worker: A parameter that lets you specify how much
+                resources each PyTorchJob worker container should have. You can either specify a
+                kubernetes.client.V1ResourceRequirements object (documented here:
+                https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
+                or a dictionary that includes one or more of the following keys:
+                `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
+                values for these keys are documented here:
+                https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
+                For example:
+                ```
+                {
+                    "cpu": "1",
+                    "memory": "2Gi",
+                    "gpu": "1",
+                }
+                ```
+                Please note, `gpu` specifies a resource request with a key of
+                `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type
+                of GPU, pass in a V1ResourceRequirements instance instead, since it's
+                more flexible. This parameter is optional and defaults to None.
+            model_provider_parameters: Parameters for the model provider in the Storage Initializer.
+                For example, HuggingFace model name and Transformer type for that model, like:
+                AutoModelForSequenceClassification. This argument must be of type
+                `kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`.
+            dataset_provider_parameters: Parameters for the dataset provider in the
+                Storage Initializer. For example, name of the HuggingFace dataset or
+                AWS S3 configuration. This argument must be of type
+                `kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or
+                `kubeflow.storage_initializer.s3.S3DatasetParams`.
+            trainer_parameters: Parameters for the LLM Trainer that will fine-tune the pre-trained
+                model with the given dataset. For example, LoRA config for parameter-efficient
+                fine-tuning and HuggingFace training arguments like optimizer or number of training
+                epochs. This argument must be of type
+                `kubeflow.storage_initializer.hugging_face.HuggingFaceTrainerParams`.
+            storage_config: Configuration for Storage Initializer PVC to download pre-trained model
+                and dataset. You can configure PVC size and storage class name in this argument.
         """
         try:
             import peft
@@ -126,14 +179,20 @@ def train(
         from kubeflow.storage_initializer.s3 import S3DatasetParams
         from kubeflow.storage_initializer.hugging_face import (
             HuggingFaceModelParams,
-            HfDatasetParams,
+            HuggingFaceDatasetParams,
+        )
+
+        print(
+            "Thank you for using `train` API for LLMs fine-tuning. This feature is in alpha stage "
+            "and Kubeflow community is looking for your feedback. Please share your experience "
+            "via #kubeflow-training-operator Slack channel or Kubeflow Training Operator GitHub."
         )
 
         if (
             not name
             or not model_provider_parameters
             or not dataset_provider_parameters
-            or not train_parameters
+            or not trainer_parameters
         ):
             raise ValueError("One of the required parameters is None")
 
@@ -172,7 +231,7 @@ def train(
 
         if isinstance(dataset_provider_parameters, S3DatasetParams):
             dp = "s3"
-        elif isinstance(dataset_provider_parameters, HfDatasetParams):
+        elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams):
            dp = "hf"
         else:
             raise ValueError(
@@ -210,9 +269,11 @@ def train(
                 "--dataset_dir",
                 VOLUME_PATH_DATASET,
                 "--lora_config",
-                json.dumps(train_parameters.lora_config.__dict__, cls=utils.SetEncoder),
+                json.dumps(
+                    trainer_parameters.lora_config.__dict__, cls=utils.SetEncoder
+                ),
                 "--training_parameters",
-                json.dumps(train_parameters.training_parameters.to_dict()),
+                json.dumps(trainer_parameters.training_parameters.to_dict()),
             ],
             volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT],
             resources=resources_per_worker,
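For reviewers, a minimal usage sketch of the renamed API. The class and parameter names (`HuggingFaceModelParams`, `HuggingFaceDatasetParams`, `HuggingFaceTrainerParams`, `trainer_parameters`, `resources_per_worker`) come from this diff; the `HuggingFaceModelParams` field names (`model_uri`, `transformer_type`), the model and dataset identifiers, and the LoRA/`TrainingArguments` values are illustrative assumptions, not part of this change:

```python
# Illustrative sketch only: model/dataset names and LoRA/TrainingArguments values are
# placeholders, and the HuggingFaceModelParams fields are assumed (not defined in this diff).
import transformers
from peft import LoraConfig

from kubeflow.training import TrainingClient
from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceModelParams,
    HuggingFaceDatasetParams,
    HuggingFaceTrainerParams,
)

TrainingClient().train(
    name="fine-tune-bert",
    num_workers=2,
    num_procs_per_worker=1,
    # `gpu` maps to an `nvidia.com/gpu` resource request, per the docstring above.
    resources_per_worker={"cpu": "2", "memory": "4Gi", "gpu": "1"},
    model_provider_parameters=HuggingFaceModelParams(
        model_uri="hf://bert-base-cased",  # assumed field names
        transformer_type=transformers.AutoModelForSequenceClassification,
    ),
    dataset_provider_parameters=HuggingFaceDatasetParams(repo_id="yelp_review_full"),
    trainer_parameters=HuggingFaceTrainerParams(
        lora_config=LoraConfig(r=8, lora_alpha=8, lora_dropout=0.1, bias="none"),
        training_parameters=transformers.TrainingArguments(
            output_dir="test_trainer",
            num_train_epochs=1,
        ),
    ),
)
```

The Storage Initializer PVC behaviour is left at its defaults here; it can be tuned via the `storage_config` argument described in the docstring.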