In [0]:
%pip install bitsandbytes
%pip install accelerate
%pip install git+https://github.com/huggingface/peft.git

In [0]:
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
import pandas as pd

In [0]:
# Replace the path names based on where the adapters, model and tokenizers were saved during your work

In [0]:
# Directory containing the saved PEFT adapter. The tokenizer is also loaded from
# this location further down. Replace <your-user-name> with your own path segment.
peft_model_id = "/dbfs/FileStore/shared_uploads/<your-user-name>/dollyllama"

In [0]:
# Identifier of the base model that the adapter is applied to; passed to
# LlamaForCausalLM.from_pretrained below.
pretrained_model_id = 'openlm-research/open_llama_7b_v2'

In [0]:
# Load the tokenizer from the adapter directory (peft_model_id), not from the base model id.
tokenizer = LlamaTokenizer.from_pretrained(peft_model_id)

In [0]:
# Load the base model. device_map='auto' delegates device placement to the library;
# no torch_dtype is given here, so the library's default dtype is used.
model = LlamaForCausalLM.from_pretrained(
    pretrained_model_id, device_map='auto',
)

In [0]:
from peft import PeftModel, PeftConfig
# Read the adapter configuration stored at peft_model_id and display the base
# model name it records (useful to compare against pretrained_model_id).
config = PeftConfig.from_pretrained(peft_model_id)
config.base_model_name_or_path

In [0]:
# Attach the adapter stored at peft_model_id to the base model loaded above.
peft_model = PeftModel.from_pretrained(model, peft_model_id)

In [0]:
# Output directory for the merged (adapter + base) model weights.
merged_model_path = "/dbfs/FileStore/shared_uploads/<your-user-name>/dollyllama/merged_model"

In [0]:
# Merge the adapter into the base model and persist the result so that it can be
# loaded later with LlamaForCausalLM.from_pretrained(merged_model_path).
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(merged_model_path)

In [0]:
# Output directory for the tokenizer that accompanies the merged model.
merged_model_tokenizer_path = "/dbfs/FileStore/shared_uploads/<your-user-name>/dollyllama/merged_model_tokenizer"

In [0]:
# Persist the tokenizer next to the merged model so both can be logged as artifacts.
tokenizer.save_pretrained(merged_model_tokenizer_path)

In [0]:
# Sanity check: the merged model must be loadable from its saved location.
# Load it in float16 first, then move it onto the GPU as a separate step.
model = LlamaForCausalLM.from_pretrained(merged_model_path, torch_dtype=torch.float16)
model = model.to("cuda")

In [0]:
# Switch the reloaded model to evaluation mode before running generation.
model.eval()

In [0]:
# Ensure that the tokenizer can be loaded from the saved path. This rebinds
# `tokenizer` to the copy saved at merged_model_tokenizer_path.
tokenizer = LlamaTokenizer.from_pretrained(merged_model_tokenizer_path)

## Testing preprocessing and prediction functions before composing the pyfunc

In [0]:
def build_prompt(instruction):
    """Wrap a raw instruction in the instruction/response prompt template.

    The template is built from explicit string pieces so that no source-code
    indentation leaks into the prompt: the section headers start at column 0
    and the prompt ends right after the newline that follows "### Response:".
    This matches the reference prompt used elsewhere in this notebook.

    Args:
        instruction: The instruction text to insert into the template.

    Returns:
        The complete prompt string, ready to be tokenized.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )
    return template.format(instruction)

In [0]:
def parse(text):
    """Extract the model's answer from decoded generation text.

    Returns the stripped text that follows the first '### Response:' marker,
    cut off at the first '### End' marker that appears after it (if any).
    Returns None when the response marker is absent.
    """
    _, marker_found, after_marker = text.partition('### Response:')
    if not marker_found:
        return None
    answer, _, _ = after_marker.partition('### End')
    return answer.strip()

In [0]:
def extract_response(text):
    """Return the answer portion of decoded generation text.

    The answer is whatever follows the first '### Response:' marker, truncated
    at the first subsequent '### End' marker when one exists, with surrounding
    whitespace removed. None is returned if '### Response:' does not occur.
    """
    pieces = text.split('### Response:', 1)
    if len(pieces) < 2:
        return None
    return pieces[1].split('### End', 1)[0].strip()

In [0]:
# Example prompt with the instruction template already filled in by hand
# (section headers at column 0, ending right after "### Response:").
prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Explain how the US economy works using an analogy

### Response:
"""
# Optional manual smoke test, left disabled: generates directly from `prompt`
# without going through build_prompt/predict.
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

# generation_output = model.generate(
#     input_ids=input_ids, max_new_tokens=128, penalty_alpha=0.5, top_k=4, 
# )
# response = extract_response(tokenizer.decode(generation_output[0]))
# print(response)

In [0]:
# The prediction functions apply the prompt template themselves (build_prompt),
# so the payload must contain only the raw instruction. Passing an already
# templated prompt here would nest one template inside another.
example_instruction = "Explain how the US economy works using an analogy"
payload_pd = pd.DataFrame([[example_instruction]], columns=['text'])
payload_pd

In [0]:
# Reuse the test payload as the input example that is later passed to
# mlflow.pyfunc.log_model.
input_example = payload_pd

In [0]:
def predict(model_input):
    """Generate a response for the instruction held in ``model_input``.

    Args:
        model_input: pandas DataFrame. Only the first row of the first column
            is read; it is treated as the raw instruction text.

    Returns:
        A JSON string of the form ``{"response": <text>}``. The value is null
        when the decoded output contains no '### Response:' marker.
    """
    import json
    question = model_input.iloc[:, 0].to_list()[0]  # first row of the first column
    prompt = build_prompt(question)
    encoded = tokenizer(prompt, return_tensors="pt").to('cuda')
    generation_output = model.generate(
        input_ids=encoded["input_ids"], max_new_tokens=128, penalty_alpha=0.5, top_k=4)
    # Drop special tokens (e.g. BOS/EOS) while decoding so they cannot leak into
    # the returned answer when the model terminates before emitting '### End'.
    decoded = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    result = {'response': parse(decoded)}
    return json.dumps(result)

In [0]:
# Smoke-test the standalone predict function with the example payload.
predict(input_example)

## Log with MLFlow and Deploy

In [0]:
# Local paths that MLflow packages with the model; the keys are the names read
# back through context.artifacts inside the pyfunc's load_context.
artifacts = dict(
    tokenizer_path=merged_model_tokenizer_path,
    model_path=merged_model_path,
)

In [0]:
import mlflow.pyfunc

class Dollyllama(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper around the merged Llama model and its tokenizer.

  The model and tokenizer are loaded from the 'model_path' and 'tokenizer_path'
  artifacts. ``predict`` answers a single instruction per call and returns a
  JSON string.
  """

  def load_context(self, context):
    """Load the tokenizer and the model (bfloat16, on CUDA) from the artifacts.

    Args:
      context: MLflow context; ``context.artifacts`` must provide
        'tokenizer_path' and 'model_path'.
    """
    from transformers import LlamaTokenizer, LlamaForCausalLM
    import torch
    self.tokenizer = LlamaTokenizer.from_pretrained(context.artifacts['tokenizer_path'])
    self.model = LlamaForCausalLM.from_pretrained(context.artifacts['model_path'], torch_dtype=torch.bfloat16)
    self.model.to(device = "cuda")
    self.model.eval()

  def build_prompt(self, instruction):
    """Wrap a raw instruction in the instruction/response prompt template.

    The template is assembled from explicit pieces so that source indentation
    does not leak into the prompt: section headers start at column 0 and the
    prompt ends right after the newline following "### Response:".
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Response:\n"
    )
    return template.format(instruction)

  def parse(self, text):
    """Return the text after '### Response:' up to '### End' (if present).

    Returns None when '### Response:' does not occur in ``text``.
    """
    start_marker = '### Response:'
    end_marker = '### End'
    start_index = text.find(start_marker)
    if start_index == -1:
      return None
    body_start = start_index + len(start_marker)
    end_index = text.find(end_marker, body_start)
    if end_index == -1:
      return text[body_start:].strip()
    return text[body_start:end_index].strip()

  def predict(self, context, model_input):
    """Generate a response for the instruction held in ``model_input``.

    Args:
      context: MLflow context (not used here).
      model_input: pandas DataFrame. Only the first row of the first column
        is read; it is treated as the raw instruction text.

    Returns:
      A JSON string of the form ``{"response": <text>}``; the value is null
      when the decoded output contains no '### Response:' marker.
    """
    import json
    question = model_input.iloc[:, 0].to_list()[0]  # first row of the first column
    prompt = self.build_prompt(question)
    encoded = self.tokenizer(prompt, return_tensors="pt").to('cuda')
    generation_output = self.model.generate(
        input_ids=encoded["input_ids"], max_new_tokens=180, penalty_alpha=0.5, top_k=4)
    # Drop special tokens (e.g. BOS/EOS) while decoding so they cannot leak into
    # the returned answer when the model terminates before emitting '### End'.
    decoded = self.tokenizer.decode(generation_output[0], skip_special_tokens=True)
    result = {'response': self.parse(decoded)}
    return json.dumps(result)

In [0]:
from sys import version_info

# "major.minor.micro" of the running interpreter, used to pin Python in the
# conda environment that is logged with the model.
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

In [0]:
import cloudpickle

# Conda environment specification logged with the pyfunc model. Python and
# cloudpickle are pinned to the versions running in this notebook.
conda_env = {
    "channels": ["defaults"],
    "dependencies": [
        f"python={PYTHON_VERSION}",
        "pip",
        {
            "pip": [
                "mlflow",
                "transformers==4.28.1",
                "datasets==2.12.0",
                "accelerate==0.18.0",
                "bitsandbytes==0.40.0",
                "pandas",
                "sentencepiece",
                "py7zr",
                f"cloudpickle=={cloudpickle.__version__}",
                "torch",
            ],
        },
    ],
    "name": "dollyllamav2_environment",
}

# Artifact path under which the pyfunc model is logged.
mlflow_pyfunc_model_path = "dollyllama7bv2_prod"

In [0]:
# Log the pyfunc wrapper together with its artifacts, environment and input example.
mlflow.pyfunc.log_model(
    artifact_path=mlflow_pyfunc_model_path,
    python_model=Dollyllama(),
    artifacts=artifacts,
    conda_env=conda_env,
    input_example=input_example,
)