This is a demo of the Data Enrichment preview feature. We welcome any feedback via Slack
(@Aaron Zhang).

In [1]:
######## generate some data  ###################
import pandas as pd
import numpy as np

# Shape of the synthetic table: row count and column order
num_rows = 20
columns = ['Name', 'Age', 'Country', 'Salary']

# Fixed value pools: one first name per row, and the countries to sample from
names = [
    'John', 'Emma', 'Michael', 'Sophia', 'Daniel',
    'Olivia', 'Matthew', 'Ava', 'James', 'Isabella',
    'Henry', 'Mia', 'Alexander', 'Charlotte', 'William',
    'Amelia', 'Benjamin', 'Harper', 'Lucas', 'Evelyn',
]
countries = [
    'USA', 'UK', 'Canada', 'Australia', 'Germany',
    'France', 'Spain', 'Italy', 'Netherlands', 'Sweden',
    'Norway', 'Denmark', 'Finland', 'Switzerland', 'Ireland',
    'Belgium', 'Austria', 'Portugal', 'Greece', 'Poland',
]

# Seed the global NumPy generator so every run yields the same table.
# The draws below share one random stream, so their order (ages, salaries,
# countries) determines the values each column receives.
np.random.seed(42)
ages = np.random.randint(22, 60, size=num_rows)
salaries = np.random.randint(30000, 120000, size=num_rows)
sampled_countries = np.random.choice(countries, num_rows)  # sampled with replacement

# Assemble the columns in the order given by `columns` and build the frame
data = dict(zip(columns, (names, ages, sampled_countries, salaries)))
df = pd.DataFrame(data)

# Last expression of the cell: render the frame
df


Unnamed: 0,Name,Age,Country,Salary
0,John,50,Netherlands,83707
1,Emma,36,Spain,115305
2,Michael,29,Portugal,58693
3,Sophia,42,Australia,101932
4,Daniel,40,Switzerland,55658
5,Olivia,44,Portugal,114478
6,Matthew,32,Netherlands,48431
7,Ava,32,UK,32747
8,James,45,Poland,89150
9,Isabella,57,Ireland,95725


In [2]:
# Connect to Cleanlab Studio (the dataset upload itself happens in the next cell)
import os
from getpass import getpass

from cleanlab_studio import Studio

# Keep the API key out of the notebook file: take it from the environment and
# fall back to an interactive prompt. The variable name CLEANLAB_STUDIO_API_KEY
# is a convention chosen for this notebook.
api_key = os.environ.get("CLEANLAB_STUDIO_API_KEY") or getpass("Cleanlab Studio API key: ")
studio = Studio(api_key)

In [3]:
# Upload the DataFrame to Cleanlab Studio under a descriptive dataset name.
# The returned id is what the enrichment project is created from.
dataset_id: str = studio.upload_dataset(df, "name_age_country_salary")


Uploading dataset...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|
Ingesting Dataset...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|


In [4]:
# Create an enrichment project on top of the uploaded dataset
project_name = "aaron_enrichment_preview_demo2"
enrichment_project = studio.create_enrichment_project(
    name=project_name,
    dataset_id=dataset_id,
)

# Display the project id; it is the handle used to fetch this project again later
enrichment_project.id

'8a96b266eef74cb1b86bb02dbd0f72f0'

In [1]:
import os
from getpass import getpass

from cleanlab_studio import Studio

# Keep the API key out of the notebook file: take it from the environment and
# fall back to an interactive prompt. The variable name CLEANLAB_STUDIO_API_KEY
# is a convention chosen for this notebook.
api_key = os.environ.get("CLEANLAB_STUDIO_API_KEY") or getpass("Cleanlab Studio API key: ")
studio = Studio(api_key)

# Re-attach to the existing enrichment project by the id that was printed
# when the project was created, so this cell works from a fresh kernel.
enrichment_project = studio.get_enrichment_project("8a96b266eef74cb1b86bb02dbd0f72f0")

# construct preview inputs
from cleanlab_studio.studio.enrichment import EnrichmentOptions

# One options object serves both steps: try it on a preview first and, once the
# preview output looks right, reuse the very same object to enrich the entire dataset.
# Only three fields are set in this demo; `regex` and `tlm_options` are further
# fields that are deliberately left unset here.
option_fields = {
    "prompt": "Is ${Country} a part of Europe?",
    "constrain_outputs": ["Yes", "No"],
    "quality_preset": "low",
}
enrichment_options = EnrichmentOptions(**option_fields)


In [5]:
# Run the enrichment in preview mode, writing the answers to a new "Is_in_Europe" column.
preview_result = enrichment_project.preview(options=enrichment_options, new_column_name="Is_in_Europe")

# Only the last expression of a cell is rendered, so show the detailed table directly
# (a bare `preview_result` line placed before it would display nothing).
preview_result.details()


Unnamed: 0_level_0,Is_in_Europe,Is_in_Europe_log,Is_in_Europe_raw,Is_in_Europe_trustworthiness_score
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,Yes,,Yes,0.988472
7,Yes,,Yes,0.959141
8,Yes,,Yes,0.99856


In [2]:
# Apply the same options used for the preview via populate(), again writing to
# "Is_in_Europe". The value displayed below is a job reference, not the enriched data.
populate_result_id = enrichment_project.populate(
    options=enrichment_options,
    new_column_name="Is_in_Europe",
)
populate_result_id

{'job_id': '11950bbcb85e43ec9a9ede4ee8c99422'}

In [3]:
# Poll the populate job. The id is the 'job_id' value shown in the populate() output,
# written out literally so the check also works from a fresh kernel.
job_id = "11950bbcb85e43ec9a9ede4ee8c99422"
studio.get_enrichment_job_status(job_id)

{'average_trustworthiness_score': None,
 'processed_rows': None,
 'status': 'RUNNING'}

In [2]:
# Check the same populate job again (re-run until the status is no longer 'RUNNING').
job_id = "11950bbcb85e43ec9a9ede4ee8c99422"
studio.get_enrichment_job_status(job_id)

{'status': 'SUCCEEDED'}

In [2]:
# Fetch the output of the finished populate job, addressed by its job id
job_id = "11950bbcb85e43ec9a9ede4ee8c99422"
result = enrichment_project.get_populate_results(job_id=job_id)

# Displays only the object's repr; call .details() to see the actual table
result

<cleanlab_studio.studio.enrichment.EnrichmentResult at 0x111533df0>

In [3]:
# Render the detailed result table. Per the output below it holds the original columns
# plus "Is_in_Europe" and its _log / _raw / _trustworthiness_score companion columns.
result.details()


Unnamed: 0_level_0,Age,Country,Is_in_Europe,Is_in_Europe_log,Is_in_Europe_raw,Is_in_Europe_trustworthiness_score,Name,Salary
cleanlab_row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,50,Netherlands,Yes,,Yes,0.994645,John,83707
2,36,Spain,Yes,,Yes,0.988773,Emma,115305
3,29,Portugal,Yes,,Yes,0.988472,Michael,58693
4,42,Australia,No,,No,0.997339,Sophia,101932
5,40,Switzerland,Yes,,Yes,0.996838,Daniel,55658
6,44,Portugal,Yes,,Yes,0.99594,Olivia,114478
7,32,Netherlands,Yes,,Yes,0.994645,Matthew,48431
8,32,UK,Yes,,Yes,0.959141,Ava,32747
9,45,Poland,Yes,,Yes,0.99856,James,89150
10,57,Ireland,Yes,,Yes,0.997618,Isabella,95725
