# Data to Metadata to Dummy Data

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the seaborn "penguins" example dataset straight from GitHub.
penguins_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
df = pd.read_csv(penguins_url)
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE


## Augment Penguin Example

### Create an id

In [3]:
from utils import make_random_unique_id

# At most 3 contributions per penguin.
# A given penguin keeps the same "species", "island" and "sex" throughout its
# life, so those columns are passed as the fixed fields.
id_options = {
    "id_column": "penguin_id",
    "fixed_fields": ["species", "island", "sex"],
    "max_contributions": 3,
}
df = make_random_unique_id(df, **id_options)

  df.groupby(fixed_fields, dropna=False, group_keys=False).apply(random_merge)


### Sex as boolean

In [4]:
# Encode sex as a boolean: True for "MALE", False for "FEMALE".
# The nullable "boolean" dtype keeps any missing sex as <NA>; a plain
# astype(bool) would silently turn the NaN left by .map() into True.
df["sex"] = df["sex"].map({"MALE": True, "FEMALE": False}).astype("boolean")

### Add a timestamp (in days)

In [5]:
np.random.seed(42)  # fixed seed so the random draws that follow are reproducible
start = pd.Timestamp("2025-01-01")
end = pd.Timestamp("2025-12-31")

# One random whole-day offset per row. randint's upper bound is exclusive,
# so the largest offset lands one day before `end`.
span_days = (end - start).days
day_offsets = np.random.randint(0, span_days, size=len(df))
df["timestamp"] = start + pd.to_timedelta(day_offsets, unit="D")

### Add another type of timestamp (in seconds)
Each value is the day-level `timestamp` plus a random time of day, so it never falls before the first timestamp.

In [6]:
# Shift every day-level timestamp by a random number of seconds within one day.
seconds_per_day = 24 * 60 * 60
second_offsets = np.random.randint(0, seconds_per_day, size=len(df))
df["timestamp_with_time"] = df["timestamp"] + pd.to_timedelta(second_offsets, unit="s")

### Add a favourite number between 0 and 10 (categorical int)

In [7]:
# randint's upper bound is exclusive, so the draws cover 0 through 10 inclusive.
df["favourite_number"] = np.random.randint(low=0, high=11, size=len(df))

### Convert body_mass_g to a continuous integer

In [8]:
# Int64 is pandas' nullable integer dtype, so any missing masses stay missing.
df["body_mass_g"] = df["body_mass_g"].astype(pd.Int64Dtype())

### Add nulls in bill_length_mm

In [9]:
# Blank out bill_length_mm on 100 distinct, randomly chosen rows.
n_bill_nulls = 100
nan_indices = np.random.choice(df.index, size=n_bill_nulls, replace=False)
df.loc[nan_indices, "bill_length_mm"] = np.nan

# At least 100; higher when the column already had nulls in rows not drawn above.
print(df["bill_length_mm"].isna().sum())

102


### Add nulls in flipper_length_mm (wherever bill_length_mm is null, plus 50 more)

In [10]:
# Wherever bill_length_mm is missing, blank out flipper_length_mm as well.
bill_missing = df["bill_length_mm"].isna()
df.loc[bill_missing, "flipper_length_mm"] = np.nan

# Then blank 50 additional rows, drawn only from rows that still hold a flipper value.
available_indices = df.index[~df["flipper_length_mm"].isna()]
nan_indices_flipper_extra = np.random.choice(available_indices, size=50, replace=False)
df.loc[nan_indices_flipper_extra, "flipper_length_mm"] = np.nan

# Null counts for both columns after the edits above.
print(df[["bill_length_mm", "flipper_length_mm"]].isna().sum())

bill_length_mm       102
flipper_length_mm    152
dtype: int64


### Result

In [11]:
# Preview the first two rows of the augmented frame.
df.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,penguin_id,timestamp,timestamp_with_time,favourite_number
0,Adelie,Torgersen,39.1,18.7,,3750,True,0,2025-04-13,2025-04-13 15:04:00,1
1,Adelie,Torgersen,39.5,17.4,,3800,False,1,2025-12-15,2025-12-15 18:15:26,10


## Use csvw-safe-library

In [12]:
from csvw_safe.make_metadata_from_data import make_metadata_from_data
from csvw_safe.validate_metadata import validate_metadata
from csvw_safe.validate_metadata_shacl import validate_metadata_shacl
from csvw_safe.make_dummy_from_metadata import make_dummy_from_metadata
from csvw_safe.assert_same_structure import assert_same_structure

### Generate metadata

In [13]:
import os

# Output locations for the three metadata variants.
metadata_path_1 = 'metadata/penguin_metadata_basic.json-ld'
metadata_path_2 = 'metadata/penguin_metadata_auto_partition_keys.json-ld'
metadata_path_3 = 'metadata/penguin_metadata_auto_partition_keys_auto_column_groups.json-ld'

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before any metadata file is written.
for _path in (metadata_path_1, metadata_path_2, metadata_path_3):
    os.makedirs(os.path.dirname(_path), exist_ok=True)

In [14]:
# Variant 1: automatic partition keys switched off.
metadata_1 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=False,
)
with open(metadata_path_1, mode="w", encoding="utf-8") as metadata_file:
    json.dump(metadata_1, metadata_file)

In [15]:
# Variant 2: automatic partition keys switched on.
metadata_2 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=True,
)
with open(metadata_path_2, mode="w", encoding="utf-8") as metadata_file:
    json.dump(metadata_2, metadata_file)

In [16]:
# Variant 3: automatic partition keys and automatic column groups both switched on.
metadata_3 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=True,
    auto_column_groups=True,
)
with open(metadata_path_3, mode="w", encoding="utf-8") as metadata_file:
    json.dump(metadata_3, metadata_file)

### Validate metadata

In [17]:
# Validate the in-memory metadata_1 dict and display whatever validate_metadata returns.
# NOTE(review): presumably an empty list means no problems were found — confirm.
errors = validate_metadata(metadata_1)
errors

[]

In [18]:
# Validate the in-memory metadata_2 dict and display whatever validate_metadata returns.
# NOTE(review): presumably an empty list means no problems were found — confirm.
errors = validate_metadata(metadata_2)
errors

[]

In [19]:
# Validate the in-memory metadata_3 dict and display whatever validate_metadata returns.
# NOTE(review): presumably an empty list means no problems were found — confirm.
errors = validate_metadata(metadata_3)
errors

[]

### Validate metadata SHACL

In [21]:
# Path passed to validate_metadata_shacl as the SHACL file.
# NOTE(review): this is a relative path, so it presumably depends on the
# notebook's working directory — confirm it resolves from where the notebook runs.
shacl_path = '../../csvw-safe-constraints.ttl'

In [22]:
# SHACL-validate the metadata file saved at metadata_path_1.
# Judging by the recorded output, the result is a (conforms, report text) tuple.
validate_metadata_shacl(metadata_path_1, shacl_path)

(True, 'Validation Report\nConforms: True\n')

In [23]:
# SHACL-validate the metadata file saved at metadata_path_2.
# Judging by the recorded output, the result is a (conforms, report text) tuple.
validate_metadata_shacl(metadata_path_2, shacl_path)

(True, 'Validation Report\nConforms: True\n')

In [25]:
# TODO: also run SHACL validation on metadata_path_3 (call currently disabled).
# validate_metadata_shacl(metadata_path_3, shacl_path)

### Generate Dummy

In [26]:
# Generate a 100-row dummy frame from metadata_1 with a fixed seed.
# NOTE(review): the recorded output for this cell is
#   ValueError: No publicPartitions for partitionKey 'species'
# metadata_1 was built with auto_partition_keys = False; presumably metadata
# generated with auto_partition_keys = True is required here — confirm and fix
# so the notebook survives Restart & Run All.
dummy_df = make_dummy_from_metadata(metadata_1, nb_rows = 100, seed = 0)

ValueError: No publicPartitions for partitionKey 'species'

In [58]:
# Display the generated dummy frame.
# NOTE(review): this cell's execution count (58) is out of sequence and its
# recorded output has no timestamp_with_time column, so the output looks stale
# (left over from an earlier run) — re-run after the generation cell succeeds.
dummy_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,penguin_id,timestamp,favourite_number
0,Gentoo,Dream,45.439662,14.687569,206.415883,3041.196042,0,105.131976,2025-11-18,9
1,Chinstrap,Torgersen,38.506442,19.052740,174.372875,2717.637894,1,79.360792,2025-10-27,7
2,Chinstrap,Biscoe,54.452656,14.755967,213.977723,3862.514893,0,223.212853,2025-07-16,10
3,Adelie,Biscoe,57.858844,18.199191,205.572525,6266.680986,0,90.787299,2025-03-05,4
4,Adelie,Torgersen,39.451648,18.420153,220.731476,3652.902462,1,295.760504,2025-05-05,2
...,...,...,...,...,...,...,...,...,...,...
95,Chinstrap,Torgersen,47.999077,14.104216,216.145232,6248.770687,1,64.279323,2025-06-06,2
96,Adelie,Dream,,21.403920,177.114198,4240.888269,1,108.464101,2025-11-14,8
97,Adelie,Torgersen,53.634178,19.156454,197.125518,5734.852973,0,129.210176,2025-11-01,2
98,Chinstrap,Dream,59.391440,20.414032,195.408361,2992.765289,1,169.510328,2025-02-16,2
