Commit

feat: LLM - Support tuning of new text embedding models by migrating to the new v1.1.3 pipeline.

PiperOrigin-RevId: 631887159
vertex-sdk-bot authored and Copybara-Service committed May 8, 2024
1 parent 3938107 commit 7fea754
Showing 3 changed files with 115 additions and 47 deletions.
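
For context, a minimal sketch of the user-facing flow this change enables. The keyword arguments mirror the parametrized test below; the project ID, location, and GCS paths are placeholders.

    from google.cloud import aiplatform
    from vertexai.language_models import TextEmbeddingModel

    # Placeholder project and location.
    aiplatform.init(project="my-project", location="us-central1")

    # text-embedding-004 and text-multilingual-embedding-002 become tunable
    # once they are mapped to the v1.1.3 pipeline (see the diffs below).
    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
    tuning_job = model.tune_model(
        training_data="gs://bucket/training.tsv",
        corpus_data="gs://bucket/corpus.jsonl",
        queries_data="gs://bucket/queries.jsonl",
        train_steps=30,
        batch_size=256,
    )
    tuned_model = tuning_job.deploy_tuned_model()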
117 changes: 90 additions & 27 deletions tests/unit/aiplatform/test_language_models.py
@@ -563,7 +563,7 @@ def reverse_string_2(s):""",
                 "parameterType": "STRING",
             },
             "base_model_version_id": {
-                "defaultValue": "textembedding-gecko@001",
+                "defaultValue": "text-embedding-004",
                 "description": "which base model to tune. This may be any stable\nnumbered version, for example `textembedding-gecko@001`.",
                 "isOptional": True,
                 "parameterType": "STRING",
@@ -578,17 +578,15 @@ def reverse_string_2(s):""",
                 "description": "the GCS path to the corpus data location.",
                 "parameterType": "STRING",
             },
-            "iterations": {
-                "defaultValue": 1000,
-                "description": "the number of steps to perform fine-tuning.",
+            "encryption_spec_key_name": {
+                "defaultValue": "",
                 "isOptional": True,
-                "parameterType": "NUMBER_INTEGER",
+                "parameterType": "STRING",
             },
-            "location": {
-                "defaultValue": "us-central1",
-                "description": "GCP region to run the pipeline.",
+            "learning_rate_multiplier": {
+                "defaultValue": 1.0,
                 "isOptional": True,
-                "parameterType": "STRING",
+                "parameterType": "NUMBER_DOUBLE",
             },
             "machine_type": {
                 "defaultValue": "n1-standard-16",
@@ -602,9 +600,10 @@ def reverse_string_2(s):""",
                 "isOptional": True,
                 "parameterType": "STRING",
             },
-            "project": {
-                "description": "user's project id.",
-                "parameterType": "STRING",
+            "output_dimensionality": {
+                "defaultValue": -1,
+                "isOptional": True,
+                "parameterType": "NUMBER_INTEGER",
             },
             "queries_path": {
                 "description": "the GCS path to the queries location.",
@@ -626,6 +625,12 @@ def reverse_string_2(s):""",
                 "description": "the GCS path to the train label data location.",
                 "parameterType": "STRING",
             },
+            "train_steps": {
+                "defaultValue": 1000,
+                "description": "the number of steps to perform fine-tuning.",
+                "isOptional": True,
+                "parameterType": "NUMBER_INTEGER",
+            },
             "validation_label_path": {
                 "defaultValue": "",
                 "description": "The GCS path to the validation label data location.",
@@ -2283,6 +2288,61 @@ def test_text_generation_response_repr(self):
         ["https://us-central1-kfp.pkg.dev/proj/repo/pack/latest"],
         indirect=True,
     )
+    @pytest.mark.parametrize(
+        "base_model_version_id,tune_args,expected_pipeline_args",
+        [  # Do not pass any optional parameters.
+            (
+                "textembedding-gecko@003",
+                dict(
+                    training_data="gs://bucket/training.tsv",
+                    corpus_data="gs://bucket/corpus.jsonl",
+                    queries_data="gs://bucket/queries.jsonl",
+                ),
+                dict(
+                    base_model_version_id="textembedding-gecko@003",
+                    train_label_path="gs://bucket/training.tsv",
+                    corpus_path="gs://bucket/corpus.jsonl",
+                    queries_path="gs://bucket/queries.jsonl",
+                    encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
+                ),
+            ),
+            # Pass all optional parameters.
+            (
+                "text-multilingual-embedding-002",
+                dict(
+                    training_data="gs://bucket/training.tsv",
+                    corpus_data="gs://bucket/corpus.jsonl",
+                    queries_data="gs://bucket/queries.jsonl",
+                    test_data="gs://bucket/test.tsv",
+                    validation_data="gs://bucket/validation.tsv",
+                    tuned_model_location="us-central1",
+                    model_display_name="my-tuned-model",
+                    train_steps=30,
+                    batch_size=256,
+                    accelerator="NVIDIA_TESLA_V100",
+                    accelerator_count=1,
+                    machine_type="n1-highmem-16",
+                    task_type="DEFAULT",
+                ),
+                dict(
+                    train_steps=30,
+                    accelerator_type="NVIDIA_TESLA_V100",
+                    accelerator_count=1,
+                    machine_type="n1-highmem-16",
+                    base_model_version_id="text-multilingual-embedding-002",
+                    train_label_path="gs://bucket/training.tsv",
+                    corpus_path="gs://bucket/corpus.jsonl",
+                    queries_path="gs://bucket/queries.jsonl",
+                    test_label_path="gs://bucket/test.tsv",
+                    batch_size=256,
+                    model_display_name="my-tuned-model",
+                    validation_label_path="gs://bucket/validation.tsv",
+                    encryption_spec_key_name=_TEST_ENCRYPTION_KEY_NAME,
+                    task_type="DEFAULT",
+                ),
+            ),
+        ],
+    )
     def test_tune_text_embedding_model(
         self,
         mock_pipeline_service_create,
@@ -2294,6 +2354,9 @@ def test_tune_text_embedding_model(
         mock_gcs_upload,
         mock_request_urlopen_gecko,
         mock_deploy_tuned_embedding_model,
+        tune_args,
+        expected_pipeline_args,
+        base_model_version_id,
     ):
         """Tests tuning the text embedding model."""
         aiplatform.init(
@@ -2309,23 +2372,23 @@ def test_tune_text_embedding_model(
             ),
         ):
             model = language_models.TextEmbeddingModel.from_pretrained(
-                "textembedding-gecko@003"
-            )
-            tuning_job = model.tune_model(
-                training_data="gs://bucket/training.tsv",
-                corpus_data="gs://bucket/corpus.jsonl",
-                queries_data="gs://bucket/queries.jsonl",
-                test_data="gs://bucket/test.tsv",
-                tuned_model_location="us-central1",
-                train_steps=10,
-                accelerator="NVIDIA_TESLA_A100",
+                base_model_version_id
             )
+            tuning_job = model.tune_model(**tune_args)
             call_kwargs = mock_pipeline_service_create.call_args[1]
-            pipeline_arguments = call_kwargs[
-                "pipeline_job"
-            ].runtime_config.parameter_values
-            assert pipeline_arguments["iterations"] == 10
-            assert pipeline_arguments["accelerator_type"] == "NVIDIA_TESLA_A100"
+            pipeline_arguments = dict(
+                call_kwargs["pipeline_job"].runtime_config.parameter_values
+            )
+
+            if (
+                "model_display_name" not in tune_args
+                and "model_display_name" in pipeline_arguments
+            ):
+                # This is automatically generated from some params, so don't
+                # check it.
+                del pipeline_arguments["model_display_name"]
+
+            assert pipeline_arguments == expected_pipeline_args

             # Testing the tuned model
             tuned_model = tuning_job.deploy_tuned_model()
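
A note on the assertion pattern in the rewritten test: runtime_config.parameter_values is a protobuf map, so the test copies it into a plain dict before the exact-equality comparison, and drops the auto-generated model_display_name when the caller did not set one. A condensed sketch of that pattern:

    call_kwargs = mock_pipeline_service_create.call_args[1]
    pipeline_arguments = dict(
        call_kwargs["pipeline_job"].runtime_config.parameter_values
    )
    # model_display_name is derived from other parameters when not passed
    # explicitly, so it is excluded from the comparison.
    pipeline_arguments.pop("model_display_name", None)
    assert pipeline_arguments == expected_pipeline_args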
6 changes: 4 additions & 2 deletions vertexai/_model_garden/_model_garden_models.py
@@ -39,8 +39,10 @@
     "chat-bison-32k": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
     "codechat-bison": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
     "codechat-bison-32k": "https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-chat-model/v3.0.0",
-    "textembedding-gecko": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.2",
-    "textembedding-gecko-multilingual": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.2",
+    "textembedding-gecko": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
+    "textembedding-gecko-multilingual": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
+    "text-embedding-004": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
+    "text-multilingual-embedding-002": "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model/v1.1.3",
 }

 _LOGGER = base.Logger(__name__)
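
The mapping keys above are the un-versioned gecko IDs plus the two fully qualified new model names. A hypothetical check of the invariant this hunk establishes (the dict's name is an assumption; it is not shown in the diff):

    # All text embedding tuning entries now point at the v1.1.3 pipeline.
    for key in (
        "textembedding-gecko",
        "textembedding-gecko-multilingual",
        "text-embedding-004",
        "text-multilingual-embedding-002",
    ):
        assert _TUNING_PIPELINE_URIS[key].endswith("/v1.1.3")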
39 changes: 21 additions & 18 deletions vertexai/language_models/_language_models.py
@@ -414,20 +414,24 @@ def _tune_model(
             model_id=self._model_id,
             schema_to_class_map={self._INSTANCE_SCHEMA_URI: type(self)},
         )
-        if model_info.tuning_pipeline_uri.startswith(
-            "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
-        ):
-            train_steps = tuning_parameters.pop("train_steps", None)
-            if train_steps:
-                tuning_parameters["iterations"] = train_steps
+        if _is_text_embedding_tuning_pipeline(model_info.tuning_pipeline_uri):
             tunable_base_model_id = self._model_id.rpartition("/")[-1]
             tuning_parameters["base_model_version_id"] = tunable_base_model_id
         else:
             tuning_parameters["large_model_reference"] = model_info.tuning_model_id
-        if aiplatform_initializer.global_config.encryption_spec_key_name:
-            tuning_parameters[
-                "encryption_spec_key_name"
-            ] = aiplatform_initializer.global_config.encryption_spec_key_name
+            tuning_parameters.update(
+                {
+                    "project": aiplatform_initializer.global_config.project,
+                    # TODO(b/275444096): Remove the explicit location once tuning
+                    # can happen in all regions.
+                    # "location": aiplatform_initializer.global_config.location,
+                    "location": tuned_model_location,
+                }
+            )
+        if aiplatform_initializer.global_config.encryption_spec_key_name:
+            tuning_parameters[
+                "encryption_spec_key_name"
+            ] = aiplatform_initializer.global_config.encryption_spec_key_name

         if not model_info.tuning_pipeline_uri:
             raise RuntimeError(f"The {self._model_id} model does not support tuning")
@@ -3890,6 +3894,12 @@ def _maybe_upload_training_data(
     )


+def _is_text_embedding_tuning_pipeline(pipeline_uri: str) -> bool:
+    return pipeline_uri.startswith(
+        "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
+    )
+
+
 def _launch_tuning_job(
     training_data: Union[str, "pandas.core.frame.DataFrame"],
     model_id: str,
@@ -3931,16 +3941,9 @@ def _launch_tuning_job(
         model_display_name = name[:max_display_name_length]

     pipeline_arguments = {
-        "project": aiplatform_initializer.global_config.project,
-        # TODO(b/275444096): Remove the explicit location once tuning can happen in all regions
-        # "location": aiplatform_initializer.global_config.location,
-        "location": tuned_model_location,
         "model_display_name": model_display_name,
     }

-    if tuning_pipeline_uri.startswith(
-        "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/tune-text-embedding-model"
-    ):
+    if _is_text_embedding_tuning_pipeline(tuning_pipeline_uri):
         pipeline_arguments["train_label_path"] = training_data_path
     elif training_data_path.startswith("gs://"):
         pipeline_arguments["dataset_uri"] = training_data_path
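
The net effect of the _language_models.py changes, as a standalone sketch (the helper matches the diff; build_pipeline_parameters is a hypothetical condensation with stand-in parameters, since the real code reads model metadata and the global aiplatform config):

    def _is_text_embedding_tuning_pipeline(pipeline_uri: str) -> bool:
        return pipeline_uri.startswith(
            "https://us-kfp.pkg.dev/ml-pipeline/llm-text-embedding/"
            "tune-text-embedding-model"
        )

    def build_pipeline_parameters(pipeline_uri, model_id, tuning_model_id,
                                  tuned_model_location, project):
        params = {}
        if _is_text_embedding_tuning_pipeline(pipeline_uri):
            # v1.1.3 embedding pipeline: pass the bare model version; it no
            # longer takes project/location, and train_steps is forwarded
            # as-is instead of being renamed to "iterations".
            params["base_model_version_id"] = model_id.rpartition("/")[-1]
        else:
            params["large_model_reference"] = tuning_model_id
            params["project"] = project
            # Per the TODO(b/275444096) in the source: explicit location
            # until tuning is available in all regions.
            params["location"] = tuned_model_location
        return params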
