In [8]:
import mlflow
import transformers

class MyModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a fine-tuned causal LM loaded in 4-bit.

    ``load_context`` reads the merged checkpoint and tokenizer from
    ``/artifacts/mlflow/<project_id>/final_merged_checkpoint``; ``predict``
    wraps each prompt in the ``[INST] ... [/INST]`` template and returns the
    generated text.
    """

    # Generation budget used when the caller does not override it via ``params``.
    DEFAULT_MAX_NEW_TOKENS = 200

    def load_context(self, context):
        """Load the quantized model and its tokenizer onto the available device(s).

        Args:
            context: MLflow ``PythonModelContext``. It is not read here; every
                path is derived from the ``/artifacts/mlflow`` directory.

        Raises:
            FileNotFoundError: if ``/artifacts/mlflow`` has no entries.
        """
        # Imported lazily so the class can be pickled/logged without pulling
        # these modules into the defining process's global state.
        import os
        import torch
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            BitsAndBytesConfig
        )

        artifacts_root = '/artifacts/mlflow'
        # os.listdir() returns entries in arbitrary order; sort so the chosen
        # project directory is deterministic across runs.
        project_dirs = sorted(os.listdir(artifacts_root))
        if not project_dirs:
            raise FileNotFoundError(
                f"No project directory found under {artifacts_root}"
            )
        self.project_id = project_dirs[0]

        # 4-bit NF4 weights, float16 compute, no nested (double) quantization.
        quant_config = BitsAndBytesConfig(load_in_4bit=True,
                                          bnb_4bit_quant_type="nf4",
                                          bnb_4bit_compute_dtype=torch.float16,
                                          bnb_4bit_use_double_quant=False)

        ft_model_name = "final_merged_checkpoint"
        model_tokenizer_path = f"/artifacts/mlflow/{self.project_id}/{ft_model_name}"
        cache_dir = f"/artifacts/mlflow/{self.project_id}/cache/"

        self.model = AutoModelForCausalLM.from_pretrained(model_tokenizer_path,
                                                          cache_dir=cache_dir,
                                                          quantization_config=quant_config,
                                                          device_map="auto")
        self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1

        # Load tokenizer; reuse EOS as the padding token and pad on the right.
        self.tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path,
                                                       cache_dir=cache_dir,
                                                       trust_remote_code=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

    @staticmethod
    def _extract_prompts(model_input):
        """Return the prompt(s) found under ``model_input["prompt"]`` as a list of str.

        Accepts a plain string, a list/tuple, or any array-like exposing
        ``tolist()`` (e.g. a pandas Series, which is what a DataFrame input
        yields for a column). ``None`` entries are dropped; a missing or
        ``None`` ``"prompt"`` yields an empty list.
        """
        try:
            raw = model_input["prompt"]
        except KeyError:
            return []
        if raw is None:
            return []
        if isinstance(raw, str):
            return [raw]
        if hasattr(raw, "tolist"):
            raw = raw.tolist()
        if isinstance(raw, (list, tuple)):
            return [str(item) for item in raw if item is not None]
        return [str(raw)]

    def _generate(self, prompt, generation_config):
        """Run one prompt through the model and return the text after the instruction."""
        # NOTE(review): the template hard-codes "<s>"; if the tokenizer also
        # prepends a BOS token the sequence starts with two - confirm intended.
        text = f"<s>[INST] {prompt} [/INST]"

        # Place the inputs on the device the model was actually dispatched to
        # (device_map="auto") rather than assuming "cuda:0".
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(**inputs, generation_config=generation_config)
        llm_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded sequence echoes the instruction; strip it so only the
        # completion is returned.
        return llm_output.replace(f"[INST] {prompt} [/INST]", '')

    def predict(self, context, model_input, params=None):
        """Generate a completion for the prompt(s) in ``model_input``.

        Args:
            context: MLflow ``PythonModelContext`` (unused).
            model_input: Mapping or DataFrame with a ``"prompt"`` entry holding
                a string or a sequence/column of strings.
            params: Optional dict of inference parameters. ``max_new_tokens``
                overrides the default generation budget of 200.

        Returns:
            ``'Please provide a prompt.'`` when no prompt is supplied; otherwise
            ``{'text_from_llm': <str>}`` for a single prompt, or
            ``{'text_from_llm': [<str>, ...]}`` when several prompts are given.
        """
        prompts = self._extract_prompts(model_input)
        if not prompts:
            return 'Please provide a prompt.'

        max_new_tokens = int(
            (params or {}).get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS)
        )
        generation_config = transformers.GenerationConfig(
            pad_token_id=self.tokenizer.pad_token_id,
            max_new_tokens=max_new_tokens,
        )

        completions = [self._generate(prompt, generation_config) for prompt in prompts]
        # Keep the original scalar shape for the single-prompt case.
        result = completions[0] if len(completions) == 1 else completions
        return {'text_from_llm': result}

In [9]:
import pandas as pd
import numpy as np
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types import DataType, Schema, ColSpec, ParamSchema, ParamSpec

# Model signature: one string column in ("prompt"), one string column out
# ("text_from_llm").
input_schema = Schema([ColSpec(DataType.string, "prompt")])
output_schema = Schema([ColSpec(DataType.string, "text_from_llm")])

# No inference-time parameters are declared, so the parameter schema is empty.
parameters = ParamSchema([])

signature = ModelSignature(
    inputs=input_schema,
    outputs=output_schema,
    params=parameters,
)

# Single-row example payload stored alongside the logged model.
input_example = pd.DataFrame([{"prompt": "What is machine learning?"}])

In [10]:
from mlflow.exceptions import MlflowException

# Get-or-create the registered model that new versions will be attached to.
client = mlflow.MlflowClient()
model_name = "llama2-guanaco-sft"
try:
    registered_model = client.create_registered_model(model_name)
except MlflowException:
    # Creation is rejected when the name is already registered; fall back to
    # fetching it. Only MLflow errors are handled here - if the lookup fails as
    # well, that error propagates instead of being silently swallowed.
    registered_model = client.get_registered_model(model_name)

In [11]:
import peft
import trl
import torch
import transformers


from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository

# Strip any local build suffix (e.g. "+cu118") so the pinned requirement names
# the base torch release.
torch_version = torch.__version__.split("+")[0]

# Start an MLflow run, log the MyModel wrapper with its signature (whose
# parameter schema is currently empty), then register the logged model as a
# new version of `model_name`.
with mlflow.start_run() as run:
    model_info = mlflow.pyfunc.log_model(
        model_name,
        python_model=MyModel(),
        # NOTE(review): MyModel.load_context() reads the checkpoint from
        # /artifacts/mlflow and never looks at context.artifacts, so this
        # mapping only controls what gets copied into the logged model -
        # confirm that snapshotting all of /mnt/ is intended.
        artifacts={"snapshot": '/mnt/'},
        pip_requirements=[
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            f"peft=={peft.__version__}",
            f"trl=={trl.__version__}",
            # load_context() loads the model with load_in_4bit=True and
            # device_map="auto", which need bitsandbytes and accelerate in the
            # serving environment.
            "accelerate",
            "bitsandbytes",
            "einops",
            "sentencepiece",
        ],
        input_example=input_example,
        signature=signature,
    )
    runs_uri = model_info.model_uri
    print(runs_uri)

    # Create a new version of the registered model from this run's artifacts.
    model_src = RunsArtifactRepository.get_underlying_uri(runs_uri)
    mv = client.create_model_version(model_name, model_src, run.info.run_id)
    print(f"Name: {mv.name}")
    print(f"Version: {mv.version}")
    print(f"Description: {mv.description}")
    print(f"Status: {mv.status}")
    print(f"Stage: {mv.current_stage}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Downloading artifacts:   0%|          | 0/493 [00:00<?, ?it/s]2024/02/16 10:01:40 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
Downloading artifacts: 100%|██████████| 493/493 [00:00<00:00, 841.44it/s] 
2024/02/16 10:02:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: llama2-guanaco-sft, version 4


runs:/1f62fb2e365a4228815a6a508fbb858d/llama2-guanaco-sft
Name: llama2-guanaco-sft
Version: 4
Description: 
Status: READY
Stage: None


In [4]:
# Shell escape: recursively and forcibly delete everything matching
# /tmp/mlflow*, clearing the location the local save/load test writes to.
! rm -rf /tmp/mlflow*

In [5]:
# Local round trip: write the wrapper to disk, load it back through the pyfunc
# loader (which runs load_context), and run one prompt through it.
local_model_dir = "/tmp/mlflow/"
mlflow.pyfunc.save_model(path=local_model_dir, python_model=MyModel())
model = mlflow.pyfunc.load_model(local_model_dir)
result = model.predict({"prompt": "Where is Cancun?"})

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.49s/it]


In [6]:
# Display the value returned by model.predict() for the test prompt.
result

{'text_from_llm': ' Cancun is a city located on the Caribbean coast of Mexico. It is situated in the state of Quintana Roo, and is known for its beautiful beaches, lively nightlife, and rich history. Cancun is a popular tourist destination, attracting visitors from around the world with its warm weather, beautiful scenery, and rich cultural heritage. It is also home to a number of important archaeological sites, including the ancient Mayan city of Chichen Itza. Cancun is located about 150 miles (240 kilometers) south of the city of Playa del Carmen, and is easily accessible by air or car. It is a popular destination for both domestic and international tourists, and is known for its vibrant nightlife, beautiful beaches, and rich cultural heritage.'}

In [7]:
# Display the installed transformers version (the one pinned in pip_requirements).
transformers.__version__


'4.33.2'