## Register downloaded model to Model Registry

For this demo we use the `TinyLlama` model. We assume that the `local_download.ipynb` notebook
has already been executed.

In [None]:
import os
import json
import shutil

import torch
import transformers
import accelerate
import numpy
import mlflow
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
from utils import recreate_folder

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

## Define the model class

Use this pattern to create a Model class. This class is an extension of `mlflow.pyfunc.PythonModel`.

This class has the following characteristics:

1. The model context has an attribute `run_id`. This is the `run_id` of the run that holds the model binaries (the parent run), not the `run_id` of the run that is registered as the model version itself.
2. The model binaries and configs are stored in the artifacts of this referenced `run_id`
3. The artifacts are stored in the location `/llm-models/{referenced_run_id}/model`
4. The `load_context` method initializes the model
5. The `predict` function generates text for the first prompt in the input and returns it under the key `text_from_llm`



In [None]:
import mlflow
import os
import accelerate
class LLMModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a Hugging Face text-generation pipeline.

    The only artifact this class reads from the MLflow model is
    ``model_context``, a JSON file with a ``run_id`` entry. The weights and
    tokenizer are loaded from ``<LOCAL_ROOT_FOLDER>/llm-models/<run_id>``,
    where ``LOCAL_ROOT_FOLDER`` is an environment variable (default ``/``).
    """

    def load_context(self, context):
        """Load the model and tokenizer and build the generation pipeline.

        Args:
            context: MLflow model context. ``context.artifacts["model_context"]``
                must be the path of a JSON file containing a ``run_id`` key.
        """
        # Imported inside the method so it does not depend on names that
        # happen to exist in the defining notebook's globals (imports placed
        # in the class body would not be visible here as bare names).
        import json
        import os

        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

        root_path = os.environ.get("LOCAL_ROOT_FOLDER", "/")
        model_path = "llm-models"
        with open(context.artifacts["model_context"], "r") as f:
            cfg = json.load(f)
        self.mlflow_run_id = cfg["run_id"]
        self.absolute_model_path = os.path.join(
            root_path, model_path, self.mlflow_run_id
        )
        # Print the path before listing it so the location is visible even
        # when the folder is missing and os.listdir raises.
        print(self.absolute_model_path)
        print(os.listdir(self.absolute_model_path))

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = AutoModelForCausalLM.from_pretrained(
            self.absolute_model_path,
            torch_dtype=torch.float16,
            device_map=device,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.absolute_model_path)
        # Create a text-generation pipeline
        self.text_generator = pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )

    def predict(self, context, model_input, params=None):
        """Generate text for the first prompt in ``model_input``.

        Args:
            context: MLflow model context (not used here).
            model_input: Mapping or DataFrame with a ``"prompt"`` entry. The
                entry may be a single string, a list, or a pandas Series;
                only the first prompt is used.
            params: Optional dict of inference parameters. ``max_length``
                (default 50) and ``do_sample`` (default True) are forwarded
                to the pipeline; other keys are ignored.

        Returns:
            dict: ``{"text_from_llm": <pipeline output>}``.

        Raises:
            ValueError: If the ``"prompt"`` entry is empty.
        """
        prompt = model_input["prompt"]
        if isinstance(prompt, str):
            # A bare string is the prompt itself; indexing it would take only
            # its first character.
            input_string = prompt
        else:
            if len(prompt) == 0:
                raise ValueError("model_input['prompt'] must contain at least one prompt")
            if hasattr(prompt, "iloc"):  # pandas Series
                input_string = prompt.iloc[0]
            else:  # regular list
                input_string = prompt[0]

        overrides = params or {}
        output = self.text_generator(
            input_string,
            max_length=overrides.get("max_length", 50),
            do_sample=overrides.get("do_sample", True),
        )
        return {'text_from_llm': output}
    

In [None]:
def get_or_create_experiment(name: str) -> str:
    """Return the id of the MLflow experiment ``name``, creating it if needed.

    Args:
        name: Experiment name to look up.

    Returns:
        The experiment id of the existing or newly created experiment.
    """
    existing = mlflow.get_experiment_by_name(name)
    if not existing:
        # No experiment with this name yet: create one and hand back its id.
        return mlflow.create_experiment(name)
    return existing.experiment_id

In [None]:
def get_or_create_registered_model(name: str):
    """Return the registered model called ``name``, creating it if absent.

    Args:
        name: Name of the registered model in the MLflow Model Registry.

    Returns:
        The registered model entity returned by ``MlflowClient``.

    Raises:
        MlflowException: Any registry error other than "resource does not
            exist" is re-raised unchanged.
    """
    client = MlflowClient()
    try:
        model = client.get_registered_model(name)
    except MlflowException as e:
        # Compare the structured error code rather than searching the message
        # text: the code name is not guaranteed to appear in str(e).
        if e.error_code != "RESOURCE_DOES_NOT_EXIST":
            raise
        # Model doesn't exist — create it
        model = client.create_registered_model(name)
        print(f"Model '{name}' created.")
    else:
        print(f"Model '{name}' already registered.")
    return model

In [None]:
# Names of the MLflow experiment that holds the registration runs and of the
# registered model that receives the new versions.
experiment_name="TinyLlama-MODEL-REGISTRATION"
registered_model_name="TinyLlama-MODEL"
# Look up the experiment and the registered model, creating them on first use.
experiment_id = get_or_create_experiment(experiment_name)
registered_model = get_or_create_registered_model(registered_model_name)

In [None]:
# Root of the dataset mount; keep the line that matches the project type.
DATASET_ROOT_FOLDER_PREFIX = "/mnt/imported/data" # Use this for Git-backed projects
# DATASET_ROOT_FOLDER_PREFIX = "/domino/datasets" # Use this for DFS-based projects
# Dataset folder under which model files are staged.
MODEL_ROOT_FOLDER = f"{DATASET_ROOT_FOLDER_PREFIX}/domino-models-dev"
# Sub-folder of MODEL_ROOT_FOLDER that gets one directory per run id.
MODEL_SUB_FOLDER = "llm-models"
# Local folder whose contents are copied into the dataset folder
# (presumably populated by local_download.ipynb — confirm).
LOCAL_MODEL_FOLDER = "/home/ubuntu/TinyLlama"


### BUILD AND TEST

During the first test pass in the DEV phase, set
```
ONLY_LOCAL_TESTING = True
```
This avoids publishing the model binaries to MLflow. It works because the model is
served from the dataset; note, however, that in the dev phase you are NOT using Domino as the system of record.

Once you are happy with your model, rerun the cells below after setting 
```
ONLY_LOCAL_TESTING = False
```
This publishes your model binary to MLFLOW. This is the point at which you tell the `prod-deployer` role to deploy this model version.

In [None]:
# When True, the model binaries are not published to MLflow as run artifacts;
# they are only copied into the dataset folder.
ONLY_LOCAL_TESTING = True

In [None]:
# Needed here for saving the model to work (original author's note).
# LLMModel.load_context reads LOCAL_ROOT_FOLDER to locate the model folder.
os.environ["LOCAL_ROOT_FOLDER"]=MODEL_ROOT_FOLDER

In [None]:
# Start an MLflow parent run that stages (and optionally publishes) the model
# binaries, then log the LLMModel wrapper in a nested child run and register
# that child run as a new version of the registered model.

os.environ['MLFLOW_ENABLE_PROXY_MULTIPART_UPLOAD'] = "true"
# Strip any local version label (the part after "+") so the pinned
# requirement below is a plain release number.
torch_version = torch.__version__.split("+")[0]
client = MlflowClient()

# First register the model binaries
with mlflow.start_run(experiment_id=experiment_id) as parent_run:
    parent_run_id = parent_run.info.run_id
    print(f"Parent Run Id {parent_run_id}")

    # Stage the binaries in the dataset under a folder named after the parent
    # run; LLMModel.load_context resolves this folder from the run id.
    target_local_dir = os.path.join(MODEL_ROOT_FOLDER, MODEL_SUB_FOLDER, parent_run_id)
    recreate_folder(target_local_dir)
    shutil.copytree(LOCAL_MODEL_FOLDER, target_local_dir, dirs_exist_ok=True)

    # KEY DESIGN. THESE BINARIES CAN BE LARGE, YOU DO NOT PUBLISH THEM UNTIL
    # YOU ARE CONFIDENT THE MODEL WORKS
    if not ONLY_LOCAL_TESTING:
        mlflow.log_artifacts(target_local_dir, artifact_path="model")

    # Start child run
    with mlflow.start_run(experiment_id=experiment_id, parent_run_id=parent_run_id, nested=True) as child_run:
        child_run_id = child_run.info.run_id
        print(f"Child Run Id {child_run_id}")

        # Save model config: the wrapper only needs the id of the parent run
        # that owns the binaries.
        model_context = {
            "run_id": parent_run_id
        }
        config_path = "/tmp/model_context.json"
        with open(config_path, "w") as f:
            json.dump(model_context, f)

        model_info = mlflow.pyfunc.log_model(
            artifact_path="",
            python_model=LLMModel(),
            artifacts={"model_context": config_path},
            pip_requirements=[
                f"torch=={torch_version}",
                f"transformers=={transformers.__version__}",
                f"accelerate=={accelerate.__version__}",
            ],
        )

    runs_uri = model_info.model_uri
    print("runs_uri:", runs_uri)

    # Register the child run's logged model as a new model version.
    model_src = RunsArtifactRepository.get_underlying_uri(runs_uri)
    mv = client.create_model_version(registered_model_name, model_src, child_run_id, tags={"triton_env": "domino-triton-dev"})

    print("Name:", mv.name)
    print("Version:", mv.version)
    print("Status:", mv.status)

In [None]:
import mlflow.pyfunc
from mlflow.tracking import MlflowClient
from mlflow.artifacts import download_artifacts
from mlflow.pyfunc import load_model

os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR'] = "true"

# Alternative: load straight from the registry with
#   mlflow.pyfunc.load_model(f"models:/{registered_model_name}/latest")

MODEL_NAME = mv.name
MODEL_VERSION = mv.version

# Re-read the model version from the registry so the test uses what was
# actually stored, not the in-memory object from the registration step.
client = MlflowClient()
mv = client.get_model_version(name=MODEL_NAME, version=MODEL_VERSION)
run_id = mv.run_id
print(run_id)

# Download from the source URI recorded on the model version. The model was
# logged with artifact_path="", so there is no "llm-model" sub-folder under
# the run's artifact root to download from.
local_path = download_artifacts(artifact_uri=mv.source)
model = load_model(local_path)

result = model.predict({"prompt": ["Once upon a time in a distant land,"]})
print(result)



In [None]:
# Load the registered model through the Model Registry URI.
import mlflow.pyfunc
os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR']="true"
# LLMModel.load_context reads LOCAL_ROOT_FOLDER to locate the model folder.
os.environ["LOCAL_ROOT_FOLDER"]=MODEL_ROOT_FOLDER
# NOTE(review): MODEL_NAME and MODEL_VERSION are assigned but not used in this
# cell; the URI below loads "latest" rather than this specific version.
MODEL_NAME=mv.name
MODEL_VERSION=mv.version
# Set model URI (update with your MLflow model registry path)
model_uri = f"models:/{registered_model_name}/latest"  # Example for a registry model
print(model_uri)
# model_uri = "runs:/your_run_id/model"  # If stored in a specific run
# Load the MLflow model


model = mlflow.pyfunc.load_model(model_uri)

In [None]:
# Smoke test: run one prompt through the loaded model and print the output.
result = model.predict({"prompt" : ["Once upon a time in a distant land,"]})
print(result)