<a href="https://colab.research.google.com/github/cuoicungtui/argilla-data-explorers-persona/blob/master/%F0%9F%A7%AD_Argilla_Data_Explorers_Persona_Hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the three PersonaHub JSON-lines subsets (instruction, math, tool) from
# the Hugging Face Hub, in that order, into one DataFrame each.
_idf, _mdf, _tdf = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "hf://datasets/proj-persona/PersonaHub/instruction.jsonl",
        "hf://datasets/proj-persona/PersonaHub/math.jsonl",
        "hf://datasets/proj-persona/PersonaHub/tool.jsonl",
    )
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Stack the three subsets into one frame. `ignore_index=True` gives every row
# a unique label; without it each subset keeps its own 0..n-1 index and the
# combined frame ends up with duplicated index labels.
df = pd.concat([_idf, _mdf, _tdf], ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,input persona,synthesized text,description
0,A theater manager or events coordinator intere...,What are the key considerations for scheduling...,2-shot instruction data synthesis
1,An urban planner looking to understand the dis...,Compare and contrast the distribution of publi...,2-shot instruction data synthesis
2,A high school literature teacher looking for s...,Can you provide me with a list of 5-7 educatio...,0-shot instruction data synthesis
3,"A science fiction writer, exploring the use of...","In a dystopian future, a young journalist name...",2-shot instruction data synthesis
4,A financial analyst specializing in Asian mark...,Gerald Chan's investment firm has recently acq...,2-shot instruction data synthesis


In [None]:
# Count how many rows fall under each `description` value.
df["description"].value_counts()

description
math problem                         50000
2-shot instruction data synthesis    27732
0-shot instruction data synthesis    22268
tool development                      5000
Name: count, dtype: int64

# Push to Argilla

In [None]:
import os

import argilla as rg

# Connection settings are read from the environment when available so that
# real credentials never have to be committed with the notebook. The fallback
# values are the ones this notebook was originally written with.
client = rg.Argilla(
    api_url=os.environ.get(
        "ARGILLA_API_URL", "https://argilla-data-explorers.hf.space"
    ),
    api_key=os.environ.get("ARGILLA_API_KEY", "owner.apikey"),
)

# Name of the Argilla dataset that the following cells create, fill and delete.
dataset_name = "persona-hub"

In [None]:
# Start from a clean slate: remove any previous dataset with this name.
# `client.datasets(name)` returns None when no such dataset exists (e.g. on the
# very first run), so the delete is guarded instead of called unconditionally.
existing_dataset = client.datasets(dataset_name)
if existing_dataset is not None:
    existing_dataset.delete()

# Schema of the annotation task: three text fields shown to the annotator,
# three questions to answer, and one vector per embedded field.
settings = rg.Settings(
    fields=[
        rg.TextField(name="input_persona", title="Persona"),
        # Markdown rendering is enabled because tool records are logged as
        # fenced ```json blocks.
        rg.TextField(name="synthesized_text", title="Synthesized text", use_markdown=True),
        rg.TextField(name="description", title="Description"),
    ],
    questions=[
        rg.RatingQuestion(
            name="chosen_rating",
            title="Chosen response",
            description="Rate the quality of the chosen response",
            values=["1", "2", "3", "4", "5"],
        ),
        rg.TextQuestion(
            name="correction",
            title="Correction",
            description="Please provide a correction to the conversation",
        ),
        rg.TextQuestion(
            name="feedback",
            title="Feedback",
            description="Please provide feedback on the conversation",
        ),
    ],
    vectors=[
        # Both vector fields are declared with 768 dimensions, so the embedding
        # model used later must produce 768-dimensional vectors.
        rg.VectorField(
            name="input_persona_vector",
            dimensions=768,
        ),
        rg.VectorField(
            name="synthesized_text_vector",
            dimensions=768,
        ),
    ],
)

dataset = rg.Dataset(
    name=dataset_name,
    settings=settings,
)

# Left as the last expression so the notebook displays the created dataset.
dataset.create()



Dataset(id=UUID('f640aa10-03c5-41fa-9a5c-c51364feec9a') inserted_at=datetime.datetime(2024, 7, 8, 11, 19, 51, 492829) updated_at=datetime.datetime(2024, 7, 8, 11, 19, 54, 50227) name='persona-hub' status='ready' guidelines=None allow_extra_metadata=False workspace_id=UUID('05ff1e19-2ef0-4e65-88ef-7876fbaa8331') last_activity_at=datetime.datetime(2024, 7, 8, 11, 19, 54, 50227) url=None)

In [None]:
import json

max_records = 100

# Fixed seed so that "Restart & Run All" logs the same sample every time.
sampled_rows = df.sample(max_records, random_state=42)

records = []
for _, row in sampled_rows.iterrows():
    input_persona = row["input persona"]
    description = row["description"]
    raw_text = row["synthesized text"]

    if description == "tool development":
        # Tool rows are parsed as JSON (as the original code did) and then
        # re-serialised with json.dumps. Interpolating the parsed object
        # directly would put a Python repr (single quotes, True/None) inside
        # the ```json fence, which is not valid JSON.
        pretty_json = json.dumps(json.loads(raw_text), indent=2, ensure_ascii=False)
        synthesized_text = f"```json\n{pretty_json}\n```"
    else:
        synthesized_text = raw_text

    record = rg.Record(
        fields={
            "input_persona": input_persona,
            "synthesized_text": synthesized_text,
            "description": description,
        },
        # Pre-fill the "correction" question with the untouched source text so
        # annotators edit it rather than typing from scratch.
        suggestions=[
            rg.Suggestion(
                question_name="correction",
                value=raw_text,
            )
        ],
    )
    records.append(record)


# Upload all records in one call; left as the last expression so the notebook
# displays the result.
dataset.records.log(records)

Sending records...: 100%|██████████| 1/1 [00:01<00:00,  1.44s/batch]


DatasetRecords(Dataset(id=UUID('f640aa10-03c5-41fa-9a5c-c51364feec9a') inserted_at=datetime.datetime(2024, 7, 8, 11, 19, 51, 492829) updated_at=datetime.datetime(2024, 7, 8, 11, 19, 54, 50227) name='persona-hub' status='ready' guidelines=None allow_extra_metadata=False workspace_id=UUID('05ff1e19-2ef0-4e65-88ef-7876fbaa8331') last_activity_at=datetime.datetime(2024, 7, 8, 11, 19, 54, 50227) url=None))

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used to fill the dataset's vector fields.
# Model card: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

In [None]:
# Number of records to accumulate before each upload.
batch_size = 50
updated_records = []

for record in dataset.records:
    # Encode both text fields in one call; the result rows come back in the
    # same order as the inputs (persona first, synthesized text second).
    persona_embedding, text_embedding = model.encode(
        [
            record.fields["input_persona"],
            record.fields["synthesized_text"],
        ]
    )
    record.vectors["input_persona_vector"] = persona_embedding
    # Use the second embedding here: the original assigned the persona
    # embedding (index 0) to both vector fields.
    record.vectors["synthesized_text_vector"] = text_embedding

    updated_records.append(record)
    if len(updated_records) % batch_size == 0:
        dataset.records.log(updated_records)
        updated_records = []

# Upload whatever is left over from the last, partially filled batch.
if updated_records:
    dataset.records.log(updated_records)

Sending records...: 100%|██████████| 1/1 [00:05<00:00,  5.77s/batch]


Sending records...: 100%|██████████| 1/1 [00:05<00:00,  5.59s/batch]


# Delete

In [None]:
# Clean up the demo dataset. `client.datasets(name)` returns None when the
# dataset no longer exists, so only delete when something was actually found.
dataset = client.datasets(dataset_name)
if dataset is not None:
    dataset.delete()