In [1]:
from distilabel.models import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import GenerateSentencePair

In [2]:
context = """
The text is a job description from the Singapore SkillsFuture Skills Framework.

Your task is to generate realistic search queries that users would input when looking for similar job roles.

Users typically search by:
- Inputting partial job descriptions or requirements they're looking for
- Describing skills, responsibilities, or qualifications they want to match
- Using job titles or role descriptions as search terms
- Mentioning specific domains, industries, or technical requirements

The generated query should represent how someone would search for or describe a job opening similar to the given job description. 
The generated query should be in English. The generated query should not be a question.
The generated query should contain about the same amount of words as the original job description.

Respond in this exact format using ## before the positive and negative queries:

## Positive\n your positive query here
## Negative\n your negative description here

For the positive query, generate a realistic search query for this role. Focus on creating variations that capture the essence of the role in different words, as if written by different people or organizations posting similar jobs.

For the negative query, generate a job description that could confuse a retrieval system. Choose from these strategies:
1. Same industry, different seniority level (Senior → Junior or Vice versa)
2. Same industry, different function (Business Valuation → Risk Management)
3. Similar skills, different domain (Financial Analysis in Banking vs Healthcare)
4. Same title, different industry context

The negative should be a real, distinct job role that shares 2-3 keywords with the original but serves a different purpose.
IMPORTANT: Generate a complete, realistic job description for the negative - do NOT paraphrase the original.
"""

llm = TransformersLLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    device_map="auto",
    torch_dtype="float16",
)

In [3]:
with Pipeline(name="generate") as pipeline:
    load_dataset = LoadDataFromHub(
        # num_examples=20,
        use_cache=False,
        output_mappings={"Job Role Description": "anchor"},
    )
    generate_retrieval_pairs_hard = GenerateSentencePair(
        name="generate_retrieval_pairs_hard",
        triplet=True,
        hard_negative=True,
        action="paraphrase",
        llm=llm,
        input_batch_size=5,
        context=context,
    )
    generate_retrieval_pairs_easy = GenerateSentencePair(
        name="generate_retrieval_pairs_easy",
        triplet=True,
        hard_negative=False,
        action="paraphrase",
        llm=llm,
        input_batch_size=5,
        context=context,
    )
    generate_retrieval_pairs_easy_v2 = GenerateSentencePair(
        name="generate_retrieval_pairs_easy_v2",
        triplet=True,
        hard_negative=False,
        action="paraphrase",
        llm=llm,
        input_batch_size=5,
        context=context,
    )

    load_dataset.connect(generate_retrieval_pairs_hard, generate_retrieval_pairs_easy, generate_retrieval_pairs_easy_v2)

In [4]:
distiset = pipeline.run(
    parameters={
        load_dataset.name: {
            "repo_id": "dnth/ssf-dataset",
            "split": "train",
        },
        "generate_retrieval_pairs_hard": {
            "llm": {"generation_kwargs": {"temperature": 0.5, "max_new_tokens": 512}}
        },
        "generate_retrieval_pairs_easy": {
            "llm": {"generation_kwargs": {"temperature": 0.3, "max_new_tokens": 512}}
        },
        "generate_retrieval_pairs_easy_v2": {
            "llm": {"generation_kwargs": {"temperature": 0.6, "max_new_tokens": 512}}
        },
    }
)

Device set to use cuda:0
The model 'Qwen2_5_VLForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/38 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/38 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/38 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
distiset

Distiset({
    generate_retrieval_pairs_hard: DatasetDict({
        train: Dataset({
            features: ['Sector', 'Track', 'Job Role', 'anchor', 'Performance Expectation', 'positive', 'negative', 'distilabel_metadata', 'model_name'],
            num_rows: 1885
        })
    })
    generate_retrieval_pairs_easy: DatasetDict({
        train: Dataset({
            features: ['Sector', 'Track', 'Job Role', 'anchor', 'Performance Expectation', 'positive', 'negative', 'distilabel_metadata', 'model_name'],
            num_rows: 1885
        })
    })
    generate_retrieval_pairs_easy_v2: DatasetDict({
        train: Dataset({
            features: ['Sector', 'Track', 'Job Role', 'anchor', 'Performance Expectation', 'positive', 'negative', 'distilabel_metadata', 'model_name'],
            num_rows: 1885
        })
    })
})

In [6]:
distiset.push_to_hub("dnth/ssf-synthetic-data-for-retriever", revision="main")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]