# Simulated Dataset

In [1]:
import polars as pl
from upath import UPath
from dotenv import load_dotenv
from enclaveid_data_pipeline_tests.test_utils import make_simulated_dataset

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Azure credentials that the az:// paths below rely on — confirm.
load_dotenv()

True

In [9]:
# Fetch user_1's activity export from the bucket and parse it into a frame,
# forcing the "time" column to a Datetime dtype.
# NOTE(review): the preview below prints rows read from the production bucket —
# confirm the saved output is safe to share.
activity_path = UPath("az://enclaveid-production-bucket/user_1/MyActivity.json")

with activity_path.open("rb") as activity_file:
    activity_bytes = activity_file.read()

activity_df = pl.read_json(
    activity_bytes,
    schema_overrides={"time": pl.Datetime},
)

print(activity_df.head(2))

shape: (2, 7)
┌────────┬───────────────┬───────────────┬──────────────┬────────────┬──────────────┬──────────────┐
│ header ┆ title         ┆ titleUrl      ┆ time         ┆ products   ┆ activityCont ┆ locationInfo │
│ ---    ┆ ---           ┆ ---           ┆ ---          ┆ ---        ┆ rols         ┆ s            │
│ str    ┆ str           ┆ str           ┆ datetime[μs] ┆ list[str]  ┆ ---          ┆ ---          │
│        ┆               ┆               ┆              ┆            ┆ list[str]    ┆ list[struct[ │
│        ┆               ┆               ┆              ┆            ┆              ┆ 3]]          │
╞════════╪═══════════════╪═══════════════╪══════════════╪════════════╪══════════════╪══════════════╡
│ Search ┆ Searched for  ┆ https://www.g ┆ 2023-04-01   ┆ ["Search"] ┆ ["Web & App  ┆ [{"At this   │
│        ┆ car share app ┆ oogle.com/sea ┆ 07:56:19.202 ┆            ┆ Activity"]   ┆ general area │
│        ┆ malag…        ┆ rch?q=…       ┆              ┆            ┆       

In [10]:
# Load the records-per-day table with an explicit schema:
# "day" as Date and "count" as Int64.
counts_path = UPath("az://enclaveid-production-bucket/records_per_day.csv")
counts_schema = {"day": pl.Date, "count": pl.Int64}

counts_df = pl.read_csv(counts_path.read_bytes(), schema=counts_schema)
print(counts_df.head(2))

shape: (2, 2)
┌────────────┬───────┐
│ day        ┆ count │
│ ---        ┆ ---   │
│ date       ┆ i64   │
╞════════════╪═══════╡
│ 2018-11-07 ┆ 6     │
│ 2018-11-08 ┆ 22    │
└────────────┴───────┘


In [11]:
# Build the simulated dataset from the per-day counts and the loaded activity frame.
# NOTE(review): no seed is passed here; if make_simulated_dataset samples randomly,
# re-running will not reproduce the saved output — confirm.
sim_df = make_simulated_dataset(counts_df, activity_df)

sim_preview = sim_df.head(2)
print(sim_preview)

shape: (2, 7)
┌────────┬───────────────┬───────────────┬──────────────┬────────────┬──────────────┬──────────────┐
│ header ┆ title         ┆ titleUrl      ┆ time         ┆ products   ┆ activityCont ┆ locationInfo │
│ ---    ┆ ---           ┆ ---           ┆ ---          ┆ ---        ┆ rols         ┆ s            │
│ str    ┆ str           ┆ str           ┆ datetime[μs] ┆ list[str]  ┆ ---          ┆ ---          │
│        ┆               ┆               ┆              ┆            ┆ list[str]    ┆ list[struct[ │
│        ┆               ┆               ┆              ┆            ┆              ┆ 3]]          │
╞════════╪═══════════════╪═══════════════╪══════════════╪════════════╪══════════════╪══════════════╡
│ Search ┆ Visited Bird  ┆ https://www.g ┆ 2018-11-07   ┆ ["Search"] ┆ ["Web & App  ┆ [{"At this   │
│        ┆ of Paradise   ┆ oogle.com/sea ┆ 11:53:19.202 ┆            ┆ Activity"]   ┆ general area │
│        ┆ Care - …      ┆ rch?q=…       ┆              ┆            ┆       

In [12]:
# Write to local disk.
# Create the target directory first: write_json opens the file directly and does
# not create missing parent directories, so on a fresh checkout (no
# data/user_simulated/ folder yet) the write would fail with FileNotFoundError.
sim_path = "data/user_simulated/MyActivity.json"
UPath(sim_path).parent.mkdir(parents=True, exist_ok=True)
sim_df.write_json(sim_path, row_oriented=True)

# Test that the two are equivalent: re-read the file (with "time" parsed as
# Datetime, matching how the source data was loaded) and compare against sim_df.
pl.read_json(sim_path, schema_overrides={"time": pl.Datetime}).equals(sim_df)

True

In [13]:
cloud_path = UPath("az://enclaveid-production-bucket/user_simulated/MyActivity.json")

# Save to cloud bucket.
# Upload explicit UTF-8 bytes instead of opening in text mode: "w" without an
# encoding falls back to the locale's default, which is not guaranteed to be
# UTF-8 on every machine, whereas the file is later read back in binary mode
# and parsed as JSON.
with cloud_path.open("wb") as f:
    f.write(sim_df.write_json(row_oriented=True, pretty=False).encode("utf-8"))

In [14]:
# Download the uploaded file from the bucket as raw bytes...
with cloud_path.open("rb") as cloud_file:
    cloud_bytes = cloud_file.read()

# ...and parse it the same way as the source data ("time" as Datetime).
cloud_df = pl.read_json(
    cloud_bytes,
    schema_overrides={"time": pl.Datetime},
)

# The frame read back from the bucket should match the one that was generated.
cloud_df.equals(sim_df)

True