<a target="_parent" href="https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/docs/notebooks/demo/navigator/navigator-data-designer-sdk-sample-to-dataset.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# 🌅 Early Preview: Data Designer

> **Note:** The [Data Designer](https://gretel.ai/navigator/data-designer) functionality demonstrated in this notebook is currently in **Early Preview**.
>
> To access these features and run this notebook, please [join the waitlist](https://gretel.ai/navigator/data-designer#waitlist).


# 🎛️ Import things

In [None]:
%%capture
# Install the latest version of the Gretel client and its dependencies directly
# from the GitHub repository (no version pin, so the installed code can change
# between runs). The `%%capture` magic on the first line hides the installer
# output; remove it temporarily if you need to debug a failed install.
%pip install -U git+https://github.com/gretelai/gretel-python-client

In [None]:
import pandas as pd

from datasets import load_dataset
from IPython.display import IFrame

from gretel_client.navigator import DataDesignerFactory

# 🗺️ Choose your adventure

## 🏡 Real-estate Example

In [None]:
# Number of rows to hand to Data Designer as sample records.
NUM_SAMPLES = 10
# Fixed seed so that Restart & Run All picks the same sample rows every time.
# Set to None to draw a fresh random sample on each run.
SAMPLE_SEED = 42

# Load the real-estate CSV hosted in the gretel-datasets S3 bucket
# (London, November 2024, going by the file name).
df = pd.read_csv("https://gretel-datasets.s3.us-west-2.amazonaws.com/realestate_data_london_2024_nov.csv")

# Draw the sample rows and convert each one to a {column: value} dict.
# `n` is clamped to the frame length because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
sample_records = df.sample(
    n=min(NUM_SAMPLES, len(df)),
    random_state=SAMPLE_SEED,
).to_dict(orient="records")

# Show the first rows of the full dataset as the cell output.
df.head()

## 🤗 HF Dataset Examples
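To use a Hugging Face dataset instead of the real-estate data, uncomment every line in the cell below and run it. It replaces `df` and `sample_records`, and it relies on `NUM_SAMPLES` from the real-estate cell, so run that cell first.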

In [None]:
# # Alternative sample source: a Hugging Face dataset ("main" config, "train" split).
# # Uncomment every line in this cell to use it. It reuses NUM_SAMPLES from the
# # real-estate cell and overwrites `df` and `sample_records`.
# hf_handle = "openai/gsm8k"

# dataset = load_dataset(hf_handle, "main")
# df = dataset["train"].to_pandas()
# sample_records = df.sample(NUM_SAMPLES).to_dict(orient="records")

# # Embed the Hugging Face dataset viewer for the train split as the cell output.
# IFrame(src=f"https://huggingface.co/datasets/{hf_handle}/embed/viewer/default/train", width=1200, height=600)

## 🧙‍♀️ Manually-provided samples from a Subject Matter Expert (SME)
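To use hand-written examples instead, uncomment every line in the cell below and run it. It replaces `df` and `sample_records` with the examples defined there.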

In [None]:
# # Alternative sample source: hand-written examples from a subject matter expert.
# # Each record has the keys "prompt", "gold_solution", "alternative_solution",
# # "incorrect_solution" and "tests"; the solution and test values are Python
# # source code stored as triple-quoted strings.
# # Uncomment every line in this cell to use it.
# sme_examples = [
#     {
#         "prompt": "Write a function that calculates the running average of a stream of numbers. Each time a new number is added, return the average of all numbers seen so far.",
#         "gold_solution": """
# class RunningAverage:
#     def __init__(self):
#         self.count = 0
#         self.sum = 0

#     def add_number(self, num):
#         self.count += 1
#         self.sum += num
#         return self.sum / self.count""",
#         "alternative_solution": """
# class RunningAverage:
#     def __init__(self):
#         self.numbers = []

#     def add_number(self, num):
#         self.numbers.append(num)
#         return sum(self.numbers) / len(self.numbers)""",
#         "incorrect_solution": """
# class RunningAverage:
#     def __init__(self):
#         self.last_num = 0
#         self.count = 0

#     def add_number(self, num):
#         self.count += 1
#         self.last_num = num
#         return self.last_num""",
#         "tests": """
# def test_running_average():
#     ra = RunningAverage()
#     assert ra.add_number(10) == 10.0
#     assert ra.add_number(20) == 15.0
#     assert ra.add_number(30) == 20.0"""
#     },
#     {
#         "prompt": "Create a function that finds the longest palindromic substring in a given string. For example, in 'babad', one solution is 'bab'.",
#         "gold_solution": """
# def longest_palindrome(s):
#     if not s: return ''
#     start = end = 0

#     def expand(left, right):
#         while left >= 0 and right < len(s) and s[left] == s[right]:
#             left -= 1
#             right += 1
#         return right - left - 1

#     for i in range(len(s)):
#         len1 = expand(i, i)
#         len2 = expand(i, i + 1)
#         max_len = max(len1, len2)
#         if max_len > end - start:
#             start = i - (max_len - 1) // 2
#             end = i + max_len // 2
#     return s[start:end + 1]""",
#         "alternative_solution": """
# def longest_palindrome(s):
#     if not s: return ''
#     longest = s[0]
#     for i in range(len(s)):
#         for j in range(i + 1, len(s) + 1):
#             substr = s[i:j]
#             if substr == substr[::-1] and len(substr) > len(longest):
#                 longest = substr
#     return longest""",
#         "incorrect_solution": """
# def longest_palindrome(s):
#     if not s: return ''
#     longest = ''
#     for i in range(len(s)):
#         if s[i] == s[i:i+2]:
#             return s[i:i+2]
#         if len(longest) < 1:
#             longest = s[i]
#     return longest""",
#         "tests": """
# def test_longest_palindrome():
#     assert longest_palindrome('babad') in ['bab', 'aba']
#     assert longest_palindrome('cbbd') == 'bb'
#     assert longest_palindrome('a') == 'a'
#     assert longest_palindrome('') == ''"""
#     },
#     {
#         "prompt": "Write a function that finds the first non-repeating character in a string and returns its index. If there is no non-repeating character, return -1.",
#         "gold_solution": """
# def first_unique_char(s):
#     char_count = {}

#     # Count character frequencies
#     for char in s:
#         char_count[char] = char_count.get(char, 0) + 1

#     # Find first character with count 1
#     for i, char in enumerate(s):
#         if char_count[char] == 1:
#             return i
#     return -1""",
#         "alternative_solution": """
# from collections import Counter

# def first_unique_char(s):
#     # Count all characters
#     counter = Counter(s)

#     # Find first character with count 1
#     for i, char in enumerate(s):
#         if counter[char] == 1:
#             return i
#     return -1""",
#         "incorrect_solution": """
# def first_unique_char(s):
#     seen = set()

#     # Only checks if we've seen the character before
#     # Doesn't account for later duplicates
#     for i, char in enumerate(s):
#         if char not in seen:
#             seen.add(char)
#             return i
#     return -1""",
#         "tests": """
# def test_first_unique_char():
#     assert first_unique_char('leetcode') == 0  # 'l' is first unique
#     assert first_unique_char('loveleetcode') == 2  # 'v' is first unique
#     assert first_unique_char('aabb') == -1  # no unique characters
#     assert first_unique_char('') == -1  # empty string
#     assert first_unique_char('cc') == -1  # no unique characters"""
#     }
# ]

# # Every example is used as a sample record (no random sampling in this cell).
# # This overwrites `df` and `sample_records` from the real-estate cell.
# df = pd.DataFrame(sme_examples)
# sample_records = df.to_dict(orient="records")

# ✍️ Set up Data Designer for Sample-to-Dataset

In [None]:
# Model suite requested from Data Designer; passed through unchanged as `model_suite`.
# NOTE(review): "apache-2.0" looks like a license-based suite name — confirm the
# available options in the Gretel documentation.
MODEL_SUITE = "apache-2.0"

# Build the Data Designer instance from whichever `sample_records` were
# produced by the most recently executed sample-source cell above.
data_designer = DataDesignerFactory.from_sample_records(
    sample_records=sample_records,
    model_suite=MODEL_SUITE,
    # NOTE(review): "prompt" presumably makes the client ask for the API key
    # interactively rather than reading a hardcoded value — confirm against
    # the gretel_client documentation.
    api_key="prompt"
)

# 🌱 Extract Data Seeds

In [None]:
# Run the seed-extraction step on the Data Designer instance.
# NOTE(review): the exact meaning of `max_num_seeds` and `num_assistants` is
# defined by the SDK and not visible here — confirm before tuning them.
data_seeds = data_designer.run_data_seeds_step(
    verbose_logging=True,
    num_assistants=5,
    max_num_seeds=7,
)

# Print a three-line summary (name, description, values) per seed category.
for category in data_seeds.seed_categories:
    print(
        f"{category.name}:",
        f" |- description: {category.description}",
        f" |- values: {category.values}",
        sep="\n",
    )

# 👁️ Preview Data

In [None]:
# Generate a preview from the extracted seeds so the output can be inspected
# before submitting the larger batch workflow further down.
preview = data_designer.generate_dataset_preview(
    data_seeds=data_seeds,
    verbose_logging=True
)

In [None]:
# Display a sample record from the preview for a quick visual sanity check.
preview.display_sample_record()

# 🆙 Scale up!

In [None]:
# Tunable settings for the full-scale run, kept as named constants like
# NUM_SAMPLES and MODEL_SUITE elsewhere in this notebook.
NUM_RECORDS = 100
PROJECT_NAME = "sample-to-dataset"

# Submit the batch workflow using the seeds extracted earlier; the returned
# job handle is used afterwards to fetch the generated dataset.
batch_job = data_designer.submit_batch_workflow(
    num_records=NUM_RECORDS,
    data_seeds=data_seeds,
    project_name=PROJECT_NAME,
)

In [None]:
# Fetch the generated dataset from the batch job. wait_for_completion=True asks
# the call to wait for the job to finish first, so this cell may take a while.
synthetic_dataset = batch_job.fetch_dataset(wait_for_completion=True)

In [None]:
# Bare last expression: render the fetched dataset as this cell's output.
synthetic_dataset