### Register the Model 

1. Create the model schema
2. Register the model code `LLMModel` class to the Model Registry
   - Note the `load_context` function and how the model api loads from the dataset mount (Mutation adds this)
   - And notice the `predict` function on how it interprets the input payload
4. Note the model name and model version and head over to deploy it using the model api endpoint

In [43]:
import mlflow
import os
import accelerate
import json
import mlflow
import transformers
import os
import torch


In [44]:
# We will use the default dataset of the project to download the model. But it can be any dataset you have access to
def get_download_dataset_folder(model_name):
    ds_dir = os.environ['DOMINO_DATASETS_DIR']
    ds_name = os.environ['DOMINO_PROJECT_NAME']
    download_ds_dir = f"{ds_dir}/{ds_name}/{model_name}"
    return download_ds_dir

In [45]:
model_name = "google/gemma-2b"
model_path = get_download_dataset_folder(model_name)

In [46]:
experiment_name = "gemma2b"
registered_model_name = experiment_name

In [47]:

class LLMModel(mlflow.pyfunc.PythonModel):
        from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
        import pandas
        import torch
        import os
        import accelerate
        def load_context(self, context):
            metadata={}
            print("Context Artifacts (Only Metadata)")
            print(context.artifacts)

            
            with open(context.artifacts["model_binaries_path"], "r") as f:
                metadata = json.load(f)
               
            print("Loaded Metadata")
            print(metadata)
            model_path = metadata["model_path"]
            
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            #model_path = save_path  # Change this to your local model directory

            print(f"Now load the model from the path obtained from the metadata {model_path}")
            model = AutoModelForCausalLM.from_pretrained(model_path, 
                                                         torch_dtype=torch.float16, 
                                                         device_map=device)
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            # Create a text-generation pipeline
            self.text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
            
            
            #prompt = "Once upon a time in a distant land,"
            
    
    
        def predict(self, context, model_input, params=None):
            """
            This method generates prediction for the given input.
            """

            prompt = model_input["prompt"]            
            input_string = prompt.iloc[0]
            output = self.text_generator(input_string, max_length=50, do_sample=True)
            return {'text_from_llm': output}
    
   

In [48]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import pandas
import torch
import os
import pandas as pd
import numpy as np
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import DataType, Schema, ColSpec, ParamSchema, ParamSpec

# Define input and output schema
input_schema = Schema(
    [
        ColSpec(DataType.string, "prompt"),
    ]
)
output_schema = Schema([ColSpec(DataType.string, "text_from_llm")])

parameters = ParamSchema(
    [       
    ]
)

signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=parameters)


# Define input example
input_example = pd.DataFrame({"prompt": ["Once upon a time in a distant land,"]})

In [49]:
client = mlflow.MlflowClient()

experiment = client.get_experiment_by_name(experiment_name)
if not experiment:
    experiment_id = client.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

print(f"Experiment Id is {experiment_id}")


registered_model = None
try:
    registered_model = client.create_registered_model(registered_model_name)
except:
     registered_model = client.get_registered_model(registered_model_name)
registered_model = client.get_registered_model(registered_model_name)


Experiment Id is 7


In [50]:
import json
import shutil
def write_dict_to_model_json(data: dict, path: str):
    if os.path.exists(path):
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)

    # Create the directory
    os.makedirs(path, exist_ok=True)  # Create the directory if it doesn't exist
    file_path = os.path.join(path, "model.json")
    with open(file_path, "w") as f:
        json.dump(data, f, indent=2)  # Pretty-print with 2-space indent
    return file_path


In [51]:
## Create a json which contains the path of the model artifacts

model_artifacts={
    "model_path":model_path
}
file_path = write_dict_to_model_json(model_artifacts, "/tmp/mymodel")
#Write to a local location



# Get the current base version of torch that is installed, without specific version modifiers
torch_version = torch.__version__.split("+")[0]
# Start an MLflow run context and log the llama2-7B model wrapper along with the param-included signature to
# allow for overriding parameters at inference time
with mlflow.start_run(experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    model_info = mlflow.pyfunc.log_model(        
        artifact_path="model",
        python_model=LLMModel(),
        artifacts={"model_binaries_path": file_path},
        pip_requirements=[
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            f"accelerate=={accelerate.__version__}"
        ],
        input_example=input_example,
        signature=signature,
        registered_model_name=registered_model_name 
    )
    model_src = f"runs:/{run_id}/model"
    mv = mlflow.register_model(model_uri=model_src, name=registered_model_name)
    print("Name: {}".format(mv.name))
    print("Version: {}".format(mv.version))
    print("Description: {}".format(mv.description))
    print("Status: {}".format(mv.status))
    print("Stage: {}".format(mv.current_stage))

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Context Artifacts (Only Metadata)
{'model_binaries_path': '/tmp/tmplvdg85xs/model/artifacts/model.json'}
Loaded Metadata
{'model_path': '/mnt/data/deploy_llm/google/gemma-2b'}
Now load the model from the path obtained from the metadata /mnt/data/deploy_llm/google/gemma-2b


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Registered model 'gemma2b' already exists. Creating a new version of this model...
2025/03/31 20:36:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gemma2b, version 8
Created version '8' of model 'gemma2b'.
Registered model 'gemma2b' already exists. Creating a new version of this model...
2025/03/31 20:36:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gemma2b, version 9
Created version '9' of model 'gemma2b'.


Name: gemma2b
Version: 9
Description: 
Status: READY
Stage: None
🏃 View run illustrious-lark-851 at: http://127.0.0.1:8765/#/experiments/7/runs/4392bf746089483fa3264e5c7f9c3ecb
🧪 View experiment at: http://127.0.0.1:8765/#/experiments/7


### Test you LLMModel class locally
1. Download it from model registry
2. `load_context` called automatically and it sees the same mount that is shared between wks and model api
3. `predict` call will interpret the input


In [52]:
import mlflow.pyfunc
os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR']="true"
model_version = "latest"
#or
#model_version = "5"
model_uri = f"models:/{registered_model_name}/{mv.version}"  # Example for a registry model
print(model_uri)
# Load the MLflow model
model = mlflow.pyfunc.load_model(model_uri)


models:/gemma2b/9


Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Context Artifacts (Only Metadata)
{'model_binaries_path': '/tmp/tmpeqmph3kq/artifacts/model.json'}
Loaded Metadata
{'model_path': '/mnt/data/deploy_llm/google/gemma-2b'}
Now load the model from the path obtained from the metadata /mnt/data/deploy_llm/google/gemma-2b


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cpu


In [53]:
result = model.predict({"prompt" : "Once upon a time in a distant land,"})

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [54]:
print(result)

{'text_from_llm': [{'generated_text': 'Once upon a time in a distant land, there existed a country whose name was, uh, let’s just say the world knows it well enough. It was a country that had a large empire, made many conquests, and had as much'}]}
