In [4]:
%pip install -q -U distilabel "farm-haystack[preprocessing]"
%pip install -q -U "distilabel[hf-inference-endpoints, argilla]"
%pip install -q -U ollama openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from typing import Dict

from distilabel.llm import OllamaLLM
from distilabel.pipeline import Pipeline, pipeline
from distilabel.tasks import TextGenerationTask, SelfInstructTask, Prompt

from datasets import Dataset
from haystack.nodes import PDFToTextConverter, PreProcessor

In [18]:
# Keep a real key that is already exported in the environment; only fall back
# to the placeholder (which must be replaced before the key is actually used)
# when nothing is set.
os.environ.setdefault("OPENAI_API_KEY", "MY_KEY_GOES_HERE")

In [3]:
class QuestionAnsweringTask(TextGenerationTask):
    """Text-generation task that takes a `question` and yields an `answer`."""

    @property
    def input_args_names(self) -> list[str]:
        """Names of the inputs consumed by `generate_prompt`."""
        return ["question"]

    @property
    def output_args_names(self) -> list[str]:
        """Names of the keys produced by `parse_output`."""
        return ["answer"]

    def generate_prompt(self, question: str) -> str:
        """Combine the task's system prompt with `question`, formatted as "openai"."""
        prompt = Prompt(
            system_prompt=self.system_prompt,
            formatted_prompt=question,
        )
        return prompt.format_as("openai")  # type: ignore

    def parse_output(self, output: str) -> Dict[str, str]:
        """Wrap the whitespace-stripped model output under the `answer` key."""
        answer = output.strip()
        return {"answer": answer}

In [4]:
from distilabel.llm import OllamaLLM

# Question-answering LLM backed by the "mixtral" model served through Ollama.
# NOTE(review): the model has to be available to the Ollama server beforehand
# (presumably via `ollama pull mixtral` / `ollama run mixtral`) — confirm.
llm = OllamaLLM(
    task=QuestionAnsweringTask(),
    model="mixtral",
    prompt_format="openai",
)

In [17]:
# Smoke-test the LLM with one question; the dict key matches the task's
# `question` input argument.
generation = llm.generate(
    [{"question": "What's the second most populated city in Denmark?"}]
)

# Show the parsed `answer` of the first result
# (presumably indexed as [input][generation] — TODO confirm).
generation[0][0]["parsed_output"]["answer"]

'The second most populated city in Denmark is Aarhus. The most populated city in Denmark is Copenhagen, which is also the capital of the country. Aarhus is an important port and the second largest city in Denmark, located on the east coast of the Jutland peninsula. It has a population of approximately 270,000 people.'

In [21]:
# Install ZenML's `s3` integration from the shell
# (`-y` presumably auto-confirms the prompt — TODO confirm).
!zenml integration install s3 -y

[2K[32m⠸[0m Installing integrations.....
[1A[2K

In [8]:
from zenml.client import Client

# Look up one specific artifact version by its ID and load it.
# NOTE(review): the ID is hard-coded, so this presumably only resolves against
# the ZenML deployment that produced the artifact — confirm before re-running.
artifact = Client().get_artifact_version('86ba966e-66d1-4c79-a464-8bfff65300a0')
loaded_artifact = artifact.load()

[33mCould not import GCP service connector: No module named 'google.api_core'.[0m
[33mCould not import Azure service connector: No module named 'azure.identity'.[0m
[33mCould not import Kubernetes service connector: No module named 'kubernetes'.[0m
[33mCould not import HyperAI service connector: No module named 'paramiko'.[0m


In [9]:
# Wrap each loaded document's text in the {"content": ...} dict shape that is
# handed to the preprocessor below.
raw_texts = [{"content": page.page_content} for page in loaded_artifact]

# Clean the text and split it into chunks of 150 words, keeping sentences intact.
preprocessor = PreProcessor(
    split_by="word",
    split_length=150,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
)
docs = preprocessor.process(raw_texts)

Preprocessing:   0%|          | 0/165 [00:00<?, ?docs/s]

[33mWe found one or more sentences whose split count is higher than the split length.[0m


Preprocessing: 100%|██████████| 165/165 [00:00<00:00, 390.05docs/s]


In [10]:
# Plain-text content of every preprocessed chunk.
inputs = [chunk.content for chunk in docs]
# Preview the first 500 characters of the first chunk.
inputs[0][:500]

'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a f'

In [11]:
# Build the seed dataset from the first 50 chunks only, under an "input" column.
instructions_dataset = Dataset.from_dict({"input": inputs[:50]})

instructions_dataset

Dataset({
    features: ['input'],
    num_rows: 50
})

In [12]:
# Self-instruct task; the application description tells the task what kind of
# assistant the generated instructions are meant for.
instructions_task = SelfInstructTask(
    application_description="An assistant that can answer questions about the open-source MLOps framework ZenML."
)

In [13]:
# Generator LLM: the Ollama-served "mixtral" model running the self-instruct task.
instructions_generator = OllamaLLM(task=instructions_task, model="mixtral")

# Generation-only pipeline built around that generator.
instructions_pipeline = Pipeline(generator=instructions_generator)

In [14]:
# Run the self-instruct pipeline over the seed dataset: one generation per row,
# processed 8 rows at a time (50 rows -> 7 batches).
generated_instructions = instructions_pipeline.generate(
    dataset=instructions_dataset, num_generations=1, batch_size=8
)

[1;35mExecuting dry-run...[0m


[1;35mProcessing batch 1 of 1...[0m


[1;35mCalling generator for batch 1...[0m


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

[1;35mDry-run executed with no issues. Starting the actual generation...[0m


Output()

[1;35mProcessing batch 1 of 7...[0m


[1;35mCalling generator for batch 1...[0m


[1;35mProcessing batch 2 of 7...[0m


[1;35mCalling generator for batch 2...[0m


[1;35mProcessing batch 3 of 7...[0m


[1;35mCalling generator for batch 3...[0m


[1;35mProcessing batch 4 of 7...[0m


[1;35mCalling generator for batch 4...[0m


[1;35mProcessing batch 5 of 7...[0m


[1;35mCalling generator for batch 5...[0m


[1;35mProcessing batch 6 of 7...[0m


[1;35mCalling generator for batch 6...[0m


[1;35mProcessing batch 7 of 7...[0m


[1;35mCalling generator for batch 7...[0m


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

[1;35mCheckpoint saved to disk: /home/strickvl/coding/zenml-projects/llm-lora-finetuning/data_generation/ckpt.[0m


[1;35mFinal dataset saved at /home/strickvl/coding/zenml-projects/llm-lora-finetuning/data_generation/ckpt[0m


In [15]:
# Flatten the nested "instructions" column (rows -> generations -> instructions)
# into one flat list.
# A generation whose output could not be parsed is assumed to surface as None
# (TODO confirm); it is skipped rather than crashing `extend` with a TypeError.
instructions = []
for generations in generated_instructions["instructions"]:
    for generation in generations or []:
        if generation:
            instructions.extend(generation)

print(f"Number of generated instructions: {len(instructions)}")

# Preview the first five instructions.
for instruction in instructions[:5]:
    print(instruction)

Number of generated instructions: 196
"Can you walk me through the process of setting up a ZenML project using templates?"
"What are the key steps to configure an MLOps pipeline with ZenML for scaling compute?"
"Could you detail the benefits of deploying ZenML in abstracting infrastructure configurations?"
"How can I connect remote storage and a git repository to my ZenML project?"
"Can you explain the main features of the ZenML e2e project?"


In [16]:
# Inspect the first record of the generated dataset.
generated_instructions[0]

{'input': 'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a fresh virtual environment with no dependencies. Then let\'s install our dependencies:\n\npip install "zenml[templates,server]" notebook\n\nzenml integration install sklearn -y\n\nWe will then use\n\nZenML templates\n\nto help us get the code we need for the project:\n\nmkdir zenml_batch_e2e\n\ncd zenml_batch_e2e\n\nzenml init --template e2e_batch --template-with-defaults\n\n# Just in case, we install the requiremen

In [17]:
# Convert the generated dataset into its Argilla representation and inspect
# the first record.
instructions_rg_dataset = generated_instructions.to_argilla()
instructions_rg_dataset[0]

  instructions_rg_dataset = generated_instructions.to_argilla()


[1;35mLoad pretrained SentenceTransformer: TaylorAI/bge-micro-v2[0m


[1;35mUse pytorch device_name: cuda[0m


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

  similarities.append(sent.similarity(sents[i + order]))


Output()

FeedbackRecord(fields={'input': 'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a fresh virtual environment with no dependencies. Then let\'s install our dependencies:\n\npip install "zenml[templates,server]" notebook\n\nzenml integration install sklearn -y\n\nWe will then use\n\nZenML templates\n\nto help us get the code we need for the project:\n\nmkdir zenml_batch_e2e\n\ncd zenml_batch_e2e\n\nzenml init --template e2e_batch --template-with-defaults\n\n# Just in case, we 

In [5]:
import argilla as rg
from argilla._constants import DEFAULT_API_KEY  # NOTE(review): not used in this cell

# Argilla credentials: taken from the environment when set, so real values do
# not have to be committed with the notebook; the previous literals are kept
# as fallbacks so existing runs behave the same.
api_url = os.getenv("ARGILLA_API_URL", "https://strickvl-argilla.hf.space")  # "https://<YOUR-HF-SPACE>.hf.space"
api_key = os.getenv("ARGILLA_API_KEY", "admin.apikey")
# # Huggingface credentials
# hf_token = "hf_..."

# Connect the Argilla client to the configured server.
rg.init(api_url=api_url, api_key=api_key)

# # If you want to use your private HF Space
# rg.init(extra_headers={"Authorization": f"Bearer {hf_token}"})



In [22]:
# Upload the instructions dataset to the "admin" workspace on the Argilla server.
instructions_rg_dataset.push_to_argilla(workspace="admin", name="ollama_instructions")

Output()

[1;35m✓ Dataset succesfully pushed to Argilla[0m


[1;35mRemoteFeedbackDataset(
   id=f7ad81b5-a82e-49a5-9497-faccf1fdb6cd
   name=ollama_instructions
   workspace=Workspace(id=a4eef5c0-a51c-4cce-81a7-8c5b23ce6afa, name=admin, inserted_at=2024-03-20 12:52:24.433299, updated_at=2024-03-20 12:52:24.433299)
   url=https://strickvl-argilla.hf.space/dataset/f7ad81b5-a82e-49a5-9497-faccf1fdb6cd/annotation-mode
   fields=[RemoteTextField(id=UUID('e220a39e-6de1-4e04-8e95-4493910e0def'), client=None, name='input', title='input', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('5facf295-2d48-4eca-b20c-3697433c4310'), client=None, name='instructions', title='instructions', required=True, type='text', use_markdown=False)]
   questions=[RemoteRatingQuestion(id=UUID('78614ca7-ec9c-43cf-b604-4c20c6d1b77b'), client=None, name='instruction-rating', title='How would you rate the generated instruction?', description=None, required=True, type='rating', values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])]
   guidelines=None
   metadata_propert

RemoteFeedbackDataset(
   id=f7ad81b5-a82e-49a5-9497-faccf1fdb6cd
   name=ollama_instructions
   workspace=Workspace(id=a4eef5c0-a51c-4cce-81a7-8c5b23ce6afa, name=admin, inserted_at=2024-03-20 12:52:24.433299, updated_at=2024-03-20 12:52:24.433299)
   url=https://strickvl-argilla.hf.space/dataset/f7ad81b5-a82e-49a5-9497-faccf1fdb6cd/annotation-mode
   fields=[RemoteTextField(id=UUID('e220a39e-6de1-4e04-8e95-4493910e0def'), client=None, name='input', title='input', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('5facf295-2d48-4eca-b20c-3697433c4310'), client=None, name='instructions', title='instructions', required=True, type='text', use_markdown=False)]
   questions=[RemoteRatingQuestion(id=UUID('78614ca7-ec9c-43cf-b604-4c20c6d1b77b'), client=None, name='instruction-rating', title='How would you rate the generated instruction?', description=None, required=True, type='rating', values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])]
   guidelines=None
   metadata_properties=[Re

In [8]:
# Preference pipeline for the "instruction-following" subtask.
# The generator is the Ollama-served "mixtral" model; the remaining keyword
# arguments (including the OpenAI API key read from the environment) are
# presumably forwarded to the labeller that `pipeline` builds — TODO confirm.
preference_pipeline = pipeline(
    "preference",
    "instruction-following",
    generator=OllamaLLM(
        task=TextGenerationTask(),
        model="mixtral",
        temperature=0.3,
        max_new_tokens=256,
        num_threads=2,
    ),
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    max_new_tokens=256,
    num_threads=2,
)

In [9]:
# Handle to the instructions dataset stored in the "admin" Argilla workspace.
remote_dataset = rg.FeedbackDataset.from_argilla(
    "ollama_instructions", workspace="admin"
)

# Pull at most 100 records locally and convert them to the "datasets" format.
instructions_dataset = remote_dataset.pull(max_records=100).format_as("datasets")
instructions_dataset

Dataset({
    features: ['input', 'instructions', 'instruction-rating', 'instruction-rating-suggestion', 'instruction-rating-suggestion-metadata', 'external_id', 'metadata', 'vectors'],
    num_rows: 100
})

In [10]:
# Inspect the first pulled record.
instructions_dataset[0]

{'input': 'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a fresh virtual environment with no dependencies. Then let\'s install our dependencies:\n\npip install "zenml[templates,server]" notebook\n\nzenml integration install sklearn -y\n\nWe will then use\n\nZenML templates\n\nto help us get the code we need for the project:\n\nmkdir zenml_batch_e2e\n\ncd zenml_batch_e2e\n\nzenml init --template e2e_batch --template-with-defaults\n\n# Just in case, we install the requiremen

In [12]:
# Keep the source passage as "context" and promote the generated instructions
# to the "input" column. Guarded on the original column name so that
# re-running the cell is a no-op instead of failing on already-renamed columns.
if "instructions" in instructions_dataset.column_names:
    instructions_dataset = instructions_dataset.rename_columns(
        {"input": "context", "instructions": "input"}
    )

In [13]:
# Run the preference pipeline over the renamed dataset: two generations per
# row, in batches of 8, with a progress bar.
preference_dataset = preference_pipeline.generate(
    instructions_dataset,  # type: ignore
    num_generations=2,
    batch_size=8,
    display_progress_bar=True,
)

  return self._generate(


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Output()

Flattening the indices:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
# Inspect the first record of the preference dataset.
preference_dataset[0]

{'context': 'An end-to-end project\n\nPut your new knowledge in action with an end-to-end project\n\nThat was awesome! We learned so many advanced MLOps production concepts:\n\nThe value of deploying ZenML\u200b\n\nAbstracting infrastructure configuration into stacks\u200b\n\n\u200bConnecting remote storage\u200b\n\n\u200bOrchestrating on the cloud\u200b\n\n\u200bConfiguring the pipeline to scale compute\u200b\n\n\u200bConnecting a git repository\u200b\n\nWe will now combine all of these concepts into an end-to-end MLOps project powered by ZenML.\n\nGet started\n\nStart with a fresh virtual environment with no dependencies. Then let\'s install our dependencies:\n\npip install "zenml[templates,server]" notebook\n\nzenml integration install sklearn -y\n\nWe will then use\n\nZenML templates\n\nto help us get the code we need for the project:\n\nmkdir zenml_batch_e2e\n\ncd zenml_batch_e2e\n\nzenml init --template e2e_batch --template-with-defaults\n\n# Just in case, we install the requirem

In [16]:
# Convert the preference dataset to its Argilla representation for upload.
preference_rg_dataset = preference_dataset.to_argilla()

# Copy each row's "context" into the metadata of the matching Argilla record
# (records and rows are paired positionally) so it stays available later.
for record_feedback, record_huggingface in zip(preference_rg_dataset, preference_dataset):
    record_feedback.metadata["context"] = record_huggingface["context"]

# Upload to the "admin" workspace on the Argilla server.
preference_rg_dataset.push_to_argilla(workspace="admin", name="ollama_preference")

Output()

RemoteFeedbackDataset(
   id=06ae2ab9-93ed-416e-8fbe-6e8cafcdb938
   name=ollama_preference
   workspace=Workspace(id=a4eef5c0-a51c-4cce-81a7-8c5b23ce6afa, name=admin, inserted_at=2024-03-20 12:52:24.433299, updated_at=2024-03-20 12:52:24.433299)
   url=https://strickvl-argilla.hf.space/dataset/06ae2ab9-93ed-416e-8fbe-6e8cafcdb938/annotation-mode
   fields=[RemoteTextField(id=UUID('fbc04a66-a228-49e0-97bc-e4a0bb3615ae'), client=None, name='input', title='input', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('b5a88f21-d0b9-4315-9189-059f8b0d0475'), client=None, name='generations-1', title='generations-1', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('16af7a33-8048-4440-b818-6fc2d7e7142d'), client=None, name='generations-2', title='generations-2', required=True, type='text', use_markdown=True)]
   questions=[RemoteRatingQuestion(id=UUID('5f43832a-dc67-4769-9fb0-58495b0adc9d'), client=None, name='generations-1-rating', title="What's the

In [17]:
# Publish the preference dataset under the given repository id.
# NOTE(review): the repo id is hard-coded to a personal namespace; pushing
# presumably requires write access to it — confirm before re-running.
preference_rg_dataset.push_to_huggingface("strickvl/ollama_preference_zenml")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]