# Comparing LLMs with MLflow
This notebook is the companion to [this blog post](https://medium.com/@dliden/comparing-llms-with-mlflow-1c69553718df). It demonstrates how to use MLflow to compare a few small (<1B parameters) text generation models from Hugging Face, and how to compare different generation configurations for those models.

For more details, read [the blog post](https://medium.com/@dliden/comparing-llms-with-mlflow-1c69553718df).

## Install Required Packages

In [None]:
# Install the libraries this notebook relies on into the active kernel's environment.
%pip install transformers accelerate torch mlflow xformers

## Define the Models
We prepare the models for use with `mlflow.evaluate()` by wrapping them in a `pyfunc` model wrapper.

In [None]:
import mlflow
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    GenerationConfig,
)


class PyfuncTransformer(mlflow.pyfunc.PythonModel):
    """Custom MLflow ``pyfunc`` model that generates text with a Transformers
    causal language model.

    Generation settings and few-shot examples are fixed when the instance is
    created; the model itself is only loaded in ``load_context``.
    """

    def __init__(self, model_name, gen_config_dict=None, examples=""):
        """
        Record the settings needed to build the text-generation pipeline later.

        Args:
            model_name (str): The name of the pre-trained Transformer model to use.
            gen_config_dict (dict): Generation configuration parameters; ``None``
                is stored as an empty dict.
            examples (str): Few-shot examples, prepended to every input.
        """
        self.model_name = model_name
        self.examples = examples
        self.gen_config_dict = {} if gen_config_dict is None else gen_config_dict
        super().__init__()

    def load_context(self, context):
        """
        Build the text-generation pipeline for ``self.model_name``.

        Args:
            context: The MLflow context (not used here).
        """
        tok = AutoTokenizer.from_pretrained(self.model_name)
        # The model is pinned to the CPU (device_map="auto" is the alternative).
        causal_lm = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="cpu",
        )

        # Start from the model's own generation defaults and overlay the
        # requested settings, skipping keys GenerationConfig does not have.
        generation_config = GenerationConfig.from_model_config(causal_lm.config)
        for option, setting in self.gen_config_dict.items():
            if not hasattr(generation_config, option):
                continue
            setattr(generation_config, option, setting)

        # Write the resulting generation settings back onto the model's config.
        causal_lm.config.update(generation_config.to_dict())

        self.model = pipeline(
            "text-generation",
            model=causal_lm,
            tokenizer=tok,
            return_full_text=False,
        )

    def predict(self, context, model_input):
        """
        Generate one completion per input, each prefixed with ``self.examples``.

        Args:
            context: The MLflow context (not used here).
            model_input: A DataFrame (all of its values are flattened into a
                list of prompts), a list of prompts, or a single prompt.

        Returns:
            list: The generated text for each prompt, in input order.
        """
        if isinstance(model_input, pd.DataFrame):
            prompts = model_input.values.flatten().tolist()
        elif isinstance(model_input, list):
            prompts = model_input
        else:
            prompts = [model_input]

        # The pipeline returns a list of candidates per prompt; keep the first.
        return [
            self.model(self.examples + prompt, return_full_text=False)[0][
                "generated_text"
            ]
            for prompt in prompts
        ]

## Instantiate the Models

In [None]:
# Generation settings shared by all three models.
gcfg = dict(max_length=180, max_new_tokens=10, do_sample=False)

# Few-shot question/answer pairs that get prepended to every prompt.
example = "".join(
    [
        "Q: Are elephants larger than mice?\nA: Yes.\n\n",
        "Q: Are mice carnivorous?\nA: No, mice are typically omnivores.\n\n",
        "Q: What is the average lifespan of an elephant?\nA: The average lifespan of an elephant in the wild is about 60 to 70 years.\n\n",
        "Q: Is Mount Everest the highest mountain in the world?\nA: Yes.\n\n",
        "Q: Which city is known as the 'City of Love'?\nA: Paris is often referred to as the 'City of Love'.\n\n",
        "Q: What is the capital of Australia?\nA: The capital of Australia is Canberra.\n\n",
        "Q: Who wrote the novel '1984'?\nA: The novel '1984' was written by George Orwell.\n\n",
    ]
)

# The three models under comparison, all configured identically.
bloom560 = PyfuncTransformer(
    "bigscience/bloom-560m", examples=example, gen_config_dict=gcfg
)
gpt2large = PyfuncTransformer(
    "gpt2-large", examples=example, gen_config_dict=gcfg
)
distilgpt2 = PyfuncTransformer(
    "distilgpt2", examples=example, gen_config_dict=gcfg
)

## Log the Models with MLflow

In [None]:
mlflow.set_experiment(experiment_name="compare_small_models")

model_names = ["bloom560", "gpt2large", "distilgpt2"]
run_ids = []
artifact_paths = []

# Log each wrapped model in its own run and remember the run ID and artifact
# path, so the evaluation step can locate the logged model again.
for name, model in zip(model_names, [bloom560, gpt2large, distilgpt2]):
    artifact_path = f"models/{name}"
    with mlflow.start_run(run_name=f"log_model_{name}") as run:
        mlflow.pyfunc.log_model(
            python_model=model,
            artifact_path=artifact_path,
            input_example="Q: What color is the sky?\nA:",
        )
        run_ids.append(run.info.run_id)
        artifact_paths.append(artifact_path)

## Set up the Evaluation Data

In [None]:
# One prompt per row in a single "question" column; each prompt ends with
# "A:" so the model continues with an answer.
eval_df = pd.DataFrame(
    [
        "Q: What color is the sky?\nA:",
        "Q: Are trees plants or animals?\nA:",
        "Q: What is 2+2?\nA:",
        "Q: Who is Darth Vader?\nA:",
        "Q: What is your favorite color?\nA:",
    ],
    columns=["question"],
)
print(eval_df)

## Compare the Models with `mlflow.evaluate()`

In [None]:
# Evaluate every logged model inside the run that logged it. Iterating over
# the recorded (run ID, artifact path) pairs keeps this cell correct if the
# list of logged models grows or shrinks, instead of assuming exactly three.
for run_id, logged_path in zip(run_ids, artifact_paths):
    with mlflow.start_run(run_id=run_id):  # reopen the run with the stored run ID
        evaluation_results = mlflow.evaluate(
            model=f"runs:/{run_id}/{logged_path}",
            model_type="text",
            data=eval_df,
        )

### Load the Results Table
You can also view the results in the Artifacts view in the MLflow UI.

In [None]:
# Load the logged "eval_results_table.json" table artifact; as the cell's last
# expression, the returned value is what the notebook displays.
mlflow.load_table("eval_results_table.json")

# Comparing Generation Parameters at Inference Time
We can modify the approach above to accept generation configuration parameters at inference time. This lets us run the same inputs through different generation configurations and track those configurations in the evaluation table.

## Defining the Models

In [None]:
import json


class PyfuncTransformerWithParams(mlflow.pyfunc.PythonModel):
    """PyfuncTransformerWithParams is a class that extends the
    mlflow.pyfunc.PythonModel class and is used to create a custom MLflow model
    for text generation using Transformers.

    Unlike a model with fixed settings, the few-shot examples and the
    generation configuration are supplied with every input record at
    inference time.
    """

    def __init__(self, model_name):
        """
        Initializes a new instance of the PyfuncTransformerWithParams class.

        Args:
            model_name (str): The name of the pre-trained Transformer model to use.
        """
        self.model_name = model_name
        super().__init__()

    def load_context(self, context):
        """
        Loads the model and tokenizer using the specified model_name.

        Args:
            context: The MLflow context.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto"
        )

        self.model = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            return_full_text=False,
        )

    @staticmethod
    def _parse_config(config_dict):
        """Return the generation overrides of one record as a dict.

        Accepts a JSON string (how the evaluation data stores it), an
        already-decoded mapping, or ``None`` / a blank string, both of which
        mean "no overrides".
        """
        if config_dict is None:
            return {}
        if isinstance(config_dict, (str, bytes, bytearray)):
            return json.loads(config_dict) if config_dict.strip() else {}
        return dict(config_dict)

    def predict(self, context, model_input):
        """
        Generates text for each input record using that record's own few-shot
        examples and generation configuration.

        Args:
            context: The MLflow context.
            model_input: A DataFrame, a list of records, or a single record.
                Each record must provide the keys ``input_text``,
                ``few_shot_examples`` and ``config_dict`` (a JSON string or a
                mapping of generation parameters).

        Returns:
            list: A list of generated texts, one per record.
        """
        if isinstance(model_input, pd.DataFrame):
            model_input = model_input.to_dict(orient="records")
        elif not isinstance(model_input, list):
            model_input = [model_input]

        generated_text = []
        for record in model_input:
            input_text = record["input_text"]
            few_shot_examples = record["few_shot_examples"]
            overrides = self._parse_config(record["config_dict"])

            # Rebuild the config from the model defaults for every record so
            # settings from one record never leak into the next; keys that
            # GenerationConfig does not have are ignored.
            gcfg = GenerationConfig.from_model_config(self.model.model.config)
            for key, value in overrides.items():
                if hasattr(gcfg, key):
                    setattr(gcfg, key, value)

            output = self.model(
                few_shot_examples + input_text,
                generation_config=gcfg,
                return_full_text=False,
            )
            generated_text.append(output[0]["generated_text"])

        return generated_text

## Setting Up the Evaluation Data

In [None]:
# Two alternative few-shot prefixes to compare.
few_shot_examples_1 = (
    "Q: Are elephants larger than mice?\nA: Yes.\n\n"
    "Q: Are mice carnivorous?\nA: No, mice are typically omnivores.\n\n"
    "Q: What is the average lifespan of an elephant?\nA: The average lifespan of an elephant in the wild is about 60 to 70 years.\n\n"
)

few_shot_examples_2 = (
    "Q: Is Mount Everest the highest mountain in the world?\nA: Yes.\n\n"
    "Q: Which city is known as the 'City of Love'?\nA: Paris is often referred to as the 'City of Love'.\n\n"
    "Q: What is the capital of Australia?\nA: The capital of Australia is Canberra.\n\n"
    "Q: Who wrote the novel '1984'?\nA: The novel '1984' was written by George Orwell.\n\n"
)

# Two alternative generation configurations: sampling vs. greedy decoding.
config_dict1 = {
    "do_sample": True,
    "top_k": 10,
    "max_length": 180,
    "max_new_tokens": 10,
}
config_dict2 = {"do_sample": False, "max_length": 180, "max_new_tokens": 10}

few_shot_examples = [few_shot_examples_1, few_shot_examples_2]
config_dicts = [config_dict1, config_dict2]

# Same questions as the model-comparison section, so results stay comparable.
questions = [
    "Q: What color is the sky?\nA:",
    "Q: Are trees plants or animals?\nA:",
    "Q: What is 2+2?\nA:",
    "Q: Who is Darth Vader?\nA:",
    "Q: What is your favorite color?\nA:",
]

# The i-th few-shot prefix is paired with the i-th config, and each pair is
# applied to the full question list. Configs are stored as JSON strings so
# every DataFrame cell holds a plain string.
data = {
    "input_text": questions * len(few_shot_examples),
    "few_shot_examples": [
        shots for shots in few_shot_examples for _ in questions
    ],
    "config_dict": [
        json.dumps(config) for config in config_dicts for _ in questions
    ],
}

eval_df = pd.DataFrame(data)

## Evaluating the Different Generation Configs

In [None]:
mlflow.set_experiment(experiment_name="compare_generation_params")
model_name = "bloom560"

# Wrap the model so that few-shot examples and generation settings can be
# supplied per input row at inference time.
bloom560_with_params = PyfuncTransformerWithParams("bigscience/bloom-560m")

with mlflow.start_run(run_name=f"log_model_{model_name}"):
    # One-row input example with the same columns as eval_df. The values are
    # wrapped in lists because pandas cannot build a DataFrame from scalar
    # values alone, and config_dict is a JSON string because predict() decodes
    # it with json.loads.
    input_example = pd.DataFrame(
        {
            "input_text": ["Q: What color is the sky?\nA:"],
            "few_shot_examples": [example],  # few-shot prompt defined earlier in the notebook
            "config_dict": [json.dumps({})],  # no generation overrides
        }
    )

    # Define the artifact_path
    artifact_path = f"models/{model_name}"

    # Wrap the evaluation DataFrame as a named MLflow dataset
    eval_data = mlflow.data.from_pandas(eval_df, name="evaluate_configurations")

    # Log the model
    mod = mlflow.pyfunc.log_model(
        artifact_path=artifact_path,
        python_model=bloom560_with_params,
        input_example=input_example,
    )

    # Define the model_uri
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/{artifact_path}"

    # Evaluate the model
    mlflow.evaluate(model=model_uri, model_type="text", data=eval_data)