# Data to Metadata to Dummy Data

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the penguins sample dataset straight from the seaborn-data repository.
penguins_csv_url = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)
df = pd.read_csv(penguins_csv_url)
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE


## Augment Penguin Example

### Create an id

In [3]:
from utils import make_random_unique_id

# At most 3 contributions per penguin.
# A penguin's "species", "island" and "sex" stay the same throughout its life,
# which is why they are passed as `fixed_fields`.
penguin_fixed_fields = ["species", "island", "sex"]
df = make_random_unique_id(
    df,
    id_column="penguin_id",
    fixed_fields=penguin_fixed_fields,
    max_contributions=3,
)

  df.groupby(fixed_fields, dropna=False, group_keys=False).apply(random_merge)


### Sex as boolean

In [4]:
# Encode sex as a boolean: True for "MALE", False for "FEMALE".
# The nullable "boolean" dtype keeps any row with a missing sex as <NA>;
# a plain `.astype(bool)` would silently turn those NaN values into True.
df["sex"] = df["sex"].map({"MALE": True, "FEMALE": False}).astype("boolean")

### Add a timestamp (in days)

In [5]:
# Seed NumPy's global RNG so the random draws that follow are reproducible.
np.random.seed(42)
start = pd.Timestamp("2025-01-01")
end = pd.Timestamp("2025-12-31")

# One random whole-day offset per row. `np.random.randint` excludes its upper
# bound, so add 1 to make `end` itself a possible timestamp.
day_offsets = np.random.randint(0, (end - start).days + 1, size=len(df))
df["timestamp"] = start + pd.to_timedelta(day_offsets, unit="D")

### Add another type of timestamp (in seconds)
The new column keeps the date from `timestamp` and adds a random time of day (whole seconds).

In [6]:
# Shift each date by a random number of seconds within one day
# (0 up to, but not including, 24 * 60 * 60).
seconds_per_day = 24 * 60 * 60
random_seconds = np.random.randint(0, seconds_per_day, size=len(df))
df["timestamp_with_time"] = df["timestamp"] + pd.to_timedelta(random_seconds, unit="s")

### Add a favourite number between 0 and 10 (categorical int)

In [7]:
# One random integer per row, drawn from 0..10 (the upper bound 11 is exclusive).
n_rows = len(df)
df["favourite_number"] = np.random.randint(low=0, high=11, size=n_rows)

### Convert body_mass_g to a continuous integer

In [8]:
# Switch body_mass_g to pandas' nullable integer dtype ("Int64").
body_mass_nullable_int = df["body_mass_g"].astype("Int64")
df["body_mass_g"] = body_mass_nullable_int

### Add nulls in bill_length_mm

In [9]:
# Blank out bill_length_mm on 100 distinct, randomly chosen rows.
nan_indices = np.random.choice(df.index, size=100, replace=False)
df.loc[nan_indices, "bill_length_mm"] = np.nan

# Count the nulls: at least 100 (more if nulls were already present in other rows).
bill_length_null_count = df["bill_length_mm"].isna().sum()
print(bill_length_null_count)

102


### Add nulls in flipper_length_mm (wherever bill_length_mm is null, plus 50 more)

In [10]:
# Wherever bill_length_mm is null, make flipper_length_mm null as well.
bill_length_missing = df["bill_length_mm"].isna()
df.loc[bill_length_missing, "flipper_length_mm"] = np.nan

# Then null out 50 additional rows, sampled only from rows that still have a value.
available_indices = df.index[df["flipper_length_mm"].notna()]
nan_indices_flipper_extra = np.random.choice(available_indices, size=50, replace=False)
df.loc[nan_indices_flipper_extra, "flipper_length_mm"] = np.nan

# Null counts for the two columns.
null_counts = df[["bill_length_mm", "flipper_length_mm"]].isna().sum()
print(null_counts)

bill_length_mm       102
flipper_length_mm    152
dtype: int64


### Result

In [11]:
# Preview the first two rows of the augmented frame.
df.head(n=2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,penguin_id,timestamp,timestamp_with_time,favourite_number
0,Adelie,Torgersen,39.1,18.7,,3750,True,5,2025-04-13,2025-04-13 15:04:00,1
1,Adelie,Torgersen,39.5,17.4,,3800,False,122,2025-12-15,2025-12-15 18:15:26,10


## Use csvw-safe-library

In [12]:
from csvw_safe.make_metadata_from_data import make_metadata_from_data
from csvw_safe.validate_metadata import validate_metadata
from csvw_safe.validate_metadata_shacl import validate_metadata_shacl
from csvw_safe.make_dummy_from_metadata import make_dummy_from_metadata
from csvw_safe.assert_same_structure import assert_same_structure

### Generate metadata

In [13]:
# Output files, one per metadata variant generated in the following cells.
metadata_path_1 = "metadata/penguin_metadata_basic.json-ld"
metadata_path_2 = "metadata/penguin_metadata_auto_partition_keys.json-ld"
metadata_path_3 = (
    "metadata/penguin_metadata_auto_partition_keys_auto_column_groups.json-ld"
)

In [14]:
# Variant 1: automatic partition keys switched off.
metadata_1 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=False,
)
# Save the metadata as JSON.
with open(metadata_path_1, mode="w", encoding="utf-8") as out_file:
    json.dump(metadata_1, out_file)

In [15]:
# Variant 2: automatic partition keys switched on.
metadata_2 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=True,
)
# Save the metadata as JSON.
with open(metadata_path_2, mode="w", encoding="utf-8") as out_file:
    json.dump(metadata_2, out_file)

In [16]:
# Variant 3: automatic partition keys and automatic column groups both on.
metadata_3 = make_metadata_from_data(
    df,
    individual_col="penguin_id",
    auto_partition_keys=True,
    auto_column_groups=True,
)
# Save the metadata as JSON.
with open(metadata_path_3, mode="w", encoding="utf-8") as out_file:
    json.dump(metadata_3, out_file)

### Validate metadata

In [17]:
# Run the metadata validator on variant 1; the result is shown as the cell output.
errors = validate_metadata(metadata_1)
errors

[]

In [18]:
# Run the metadata validator on variant 2; the result is shown as the cell output.
errors = validate_metadata(metadata_2)
errors

[]

In [19]:
# Run the metadata validator on variant 3; the result is shown as the cell output.
errors = validate_metadata(metadata_3)
errors

[]

### Validate metadata SHACL

In [20]:
# SHACL constraints file (Turtle), given relative to the notebook's working directory.
shacl_path = "../../csvw-safe-constraints.ttl"

In [21]:
# Check the saved variant-1 metadata file against the SHACL constraints.
shacl_result_1 = validate_metadata_shacl(metadata_path_1, shacl_path)
shacl_result_1

(True, 'Validation Report\nConforms: True\n')

In [22]:
# Check the saved variant-2 metadata file against the SHACL constraints.
shacl_result_2 = validate_metadata_shacl(metadata_path_2, shacl_path)
shacl_result_2

(True, 'Validation Report\nConforms: True\n')

In [23]:
# TODO: SHACL validation for metadata_path_3 is not run yet; re-enable the call below.
# validate_metadata_shacl(metadata_path_3, shacl_path)

### Generate Dummy

In [26]:
# Generate 100 dummy rows from metadata variant 1 with a fixed seed.
dummy_df_1 = make_dummy_from_metadata(
    metadata_1,
    nb_rows=100,
    seed=0,
)
# Show the first five rows.
dummy_df_1.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,penguin_id,timestamp,timestamp_with_time,favourite_number
0,Gentoo,Dream,,21.395938,219.123288,3894.630731,False,142.014355,2025-09-10,2025-02-14 06:47:46,10
1,Chinstrap,Torgersen,,16.415165,189.811842,4049.92035,False,111.514088,2025-07-23,2025-01-27 06:47:46,8
2,Chinstrap,Biscoe,36.666146,21.155351,192.125552,5740.743126,True,194.111984,2025-07-03,2025-10-26 06:47:46,2
3,Adelie,Biscoe,53.560041,16.900659,193.253696,3340.146755,True,23.323326,2025-10-09,2025-11-03 06:47:46,1
4,Adelie,Torgersen,58.696327,16.186902,216.080732,5852.771438,False,70.576561,2025-04-11,2025-11-03 06:47:46,5


In [27]:
# Generate 100 dummy rows from metadata variant 2 with a fixed seed.
dummy_df_2 = make_dummy_from_metadata(
    metadata_2,
    nb_rows=100,
    seed=0,
)
# Show the first five rows.
dummy_df_2.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,penguin_id,timestamp,timestamp_with_time,favourite_number
0,Gentoo,Dream,,21.395938,219.123288,3894.630731,False,142.014355,2025-09-10,2025-02-14 06:47:46,10
1,Chinstrap,Torgersen,,16.415165,189.811842,4049.92035,False,111.514088,2025-07-23,2025-01-27 06:47:46,8
2,Chinstrap,Biscoe,36.666146,21.155351,192.125552,5740.743126,True,194.111984,2025-07-03,2025-10-26 06:47:46,2
3,Adelie,Biscoe,53.560041,16.900659,193.253696,3340.146755,True,23.323326,2025-10-09,2025-11-03 06:47:46,1
4,Adelie,Torgersen,58.696327,16.186902,216.080732,5852.771438,False,70.576561,2025-04-11,2025-11-03 06:47:46,5


In [28]:
# Generate 100 dummy rows from metadata variant 3 with a fixed seed.
dummy_df_3 = make_dummy_from_metadata(
    metadata_3,
    nb_rows=100,
    seed=0,
)
# Show the first five rows.
dummy_df_3.head(n=5)

Unnamed: 0,species,island,sex,favourite_number,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,penguin_id,timestamp,timestamp_with_time
0,Adelie,Dream,False,10,54.141493,14.319332,,4182.50494,303.201287,2025-09-15,2025-07-15 06:47:46
1,Chinstrap,Biscoe,True,4,47.026859,19.807631,,3859.560932,293.021011,2025-05-22,2025-12-02 06:47:46
2,Adelie,Biscoe,True,5,40.724946,15.422408,172.73175,4734.126888,283.409696,2025-01-08,2025-06-27 06:47:46
3,Adelie,Torgersen,True,10,57.32004,20.049684,208.015186,2925.776392,160.99574,2025-10-03,2025-11-25 06:47:46
4,Gentoo,Biscoe,False,2,,16.073377,193.230322,3426.104768,64.441238,2025-06-27,2025-10-16 06:47:46
