In [1]:
import pandas as pd

# Raise pandas' display limits (row count, column count, line width) used when
# frames are printed further down.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Case Study - Generate Sensor Data

In this case study, we try to fabricate some sensor data that we might encounter
in an operations study.

## Pattern 1 - Tall Format

![sensor_model1](images/sensor_model1.drawio.svg)

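Note that columns prefixed with an underscore (e.g. `_random_shift`) are not included
in the data model above, as they are temporary columns used only to derive the final
output. They still appear in the generated dataframes, so omit them programmatically
at the end if they are not wanted.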

In [2]:
from data_fabricator.v1.core.mock_generator import (
    MockDataGenerator,
    BaseTable,
    UniqueId,
    RowApply,
    RandomNumbers,
    Explode,
    generate_unique_id,
)


class Sensor(BaseTable):
    """Table spec for the fabricated sensors: one row per sensor.

    Columns prefixed with ``_`` are temporary helpers used to derive other
    columns; they still appear in the generated dataframe.
    """

    # Number of sensor rows to fabricate.
    num_rows = 4
    # Identifier column (rendered as 1..4 in the sample output).
    # NOTE(review): only prob_null_kwargs is passed here, with no null
    # probability alongside it — confirm what the seed achieves for this column.
    sensor_id = UniqueId(prob_null_kwargs={"seed": 1})
    # Human-readable name built from the id: "sensor_" + str(sensor_id).
    sensor_name = RowApply(
        list_of_values="Sensor.sensor_id", row_func='lambda x: "sensor_" + str(x)'
    )
    # Temporary per-sensor integer drawn from the 1..100 range (whether the end
    # of the range is inclusive is not visible here — confirm). SensorReading
    # reuses it to scale and shift each sensor's signal.
    _random_shift = RandomNumbers(
        dtype="Int64",
        start_range=1,
        end_range=100,
    )


class SensorReading(BaseTable):
    """Tall-format readings table: one row per (sensor, time_index) pair.

    Each ``Sensor`` row is expanded into a series of readings. Columns prefixed
    with ``_`` are temporary helpers used to build ``sensor_reading``; they
    still appear in the generated dataframe.
    """

    # The three Explode columns below share identical list_of_values,
    # explode_func and explode_func_kwargs and differ only in `position`, which
    # selects the output column: 0 and 1 are the two source columns carried
    # over from Sensor, 2 is the value produced by generate_unique_id (seen as
    # time_index 1, 2, 3, ... in the sample output).
    # Presumably num_rows=100 means 100 readings per sensor — confirm against
    # the Explode / generate_unique_id documentation.
    sensor_name = Explode(
        list_of_values=["Sensor.sensor_name", "Sensor._random_shift"],
        explode_func=generate_unique_id,
        explode_func_kwargs={"num_rows": 100},
        position=0,
    )
    _random_shift = Explode(
        list_of_values=["Sensor.sensor_name", "Sensor._random_shift"],
        explode_func=generate_unique_id,
        explode_func_kwargs={"num_rows": 100},
        position=1,
    )
    time_index = Explode(
        list_of_values=["Sensor.sensor_name", "Sensor._random_shift"],
        explode_func=generate_unique_id,
        explode_func_kwargs={"num_rows": 100},
        position=2,
    )
    # Noise-free signal: a sine wave over time_index (x) whose amplitude and
    # phase offset both come from the sensor's _random_shift (y).
    # row_func is a string; `math` is not imported in the visible cells, so it
    # is presumably supplied by the library when the lambda is evaluated.
    # NOTE(review): 2*3.412 may have been intended as 2*pi (3.1416) — confirm
    # before changing, since it alters the published sample output.
    _sensor_reading_smooth = RowApply(
        list_of_values=["SensorReading.time_index", "SensorReading._random_shift"],
        row_func="lambda x,y: y * math.sin((int(x)+int(y))/(2*3.412))",
    )
    # Noise term: a random draw scaled by 0.2 * _random_shift, i.e. at most 20%
    # of the sensor's amplitude (assuming stdlib random.random in [0, 1)).
    _random_error = RowApply(
        list_of_values=["SensorReading._random_shift"],
        row_func="lambda x: random.random()*x*0.2",
    )
    # Final published reading: smooth signal plus noise.
    sensor_reading = RowApply(
        list_of_values=[
            "SensorReading._sensor_reading_smooth",
            "SensorReading._random_error",
        ],
        row_func="lambda x, y: x + y",
    )


# Generate both tables with the generator seeded at 1.
mock_generator = MockDataGenerator(tables=[Sensor, SensorReading], seed=1)
mock_generator.generate_all()

# Preview the first ten rows of each generated table, Sensor first.
for table_name in ("Sensor", "SensorReading"):
    print(mock_generator.tables[table_name].dataframe.head(10))

  sensor_id sensor_name  _random_shift
0         1    sensor_1             14
1         2    sensor_2             84
2         3    sensor_3             76
3         4    sensor_4             26
  sensor_name  _random_shift time_index  _sensor_reading_smooth  _random_error  sensor_reading
0    sensor_1             14          1               11.334384       1.387218       12.721602
1    sensor_1             14          2               10.012962       1.258575       11.271537
2    sensor_1             14          3                8.476902       1.824460       10.301362
3    sensor_1             14          4                6.759130       2.208425        8.967555
4    sensor_1             14          5                4.896470       0.262807        5.159276
5    sensor_1             14          6                2.928848       0.079373        3.008221
6    sensor_1             14          7                0.898444       2.340142        3.238586
7    sensor_1             14          8      

If we plot this we get:

In [3]:
import plotly.express as px

# Tall data plots directly: one line per sensor via the `color` grouping.
df = mock_generator.tables["SensorReading"].dataframe
fig = px.line(
    df,
    x="time_index",
    y="sensor_reading",
    color="sensor_name",
)
# The image is saved to the path embedded by the markdown cell that follows.
fig.write_image("data_fabricator/docs/images/sensor_sample.png")

![sensor_sample](images/sensor_sample.png)

## Pattern 2 - Wide Format

Wide format is easier when you need to control the relationships between sensor
readings during fabrication, particularly with `RowApply` columns that derive one
sensor's values from another's.

![sensor_model2](images/sensor_model2.drawio.svg)

In [None]:
import yaml

# Declarative YAML spec for a single wide table, `sensor_readings`, with 100
# rows: a time_index id column, sensor_1 = sin(t / (2*3.412)),
# sensor_2 = cos(t / (2*3.412)), and sensor_3 = sensor_1 * sensor_2.
# Each `_target_` value is a dotted import path; presumably it is resolved by
# hydra_instantiate_dictionary in a later cell — confirm.
# NOTE(review): 3.412 may have been intended as pi (3.1416) — confirm.
config_str = """

tables:
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: sensor_readings
  num_rows: 100
  columns:
    time_index:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
    sensor_1:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: sensor_readings.time_index
      row_func: 'lambda x: math.sin(int(x)/(2*3.412))'
    sensor_2:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: sensor_readings.time_index
      row_func: 'lambda x: math.cos(int(x)/(2*3.412))'
    sensor_3:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values:
      - sensor_readings.sensor_1
      - sensor_readings.sensor_2
      row_func: 'lambda x,y: x*y'


"""
# safe_load parses the text into plain dicts, lists and scalars only; nothing
# is instantiated at this point.
example_config = yaml.safe_load(config_str)
# Show the parsed structure before any objects are built from it.
print(example_config)

In [None]:
from data_fabricator.v1.nodes.hydra import hydra_instantiate_dictionary

# Build the injected objects described by the parsed config (presumably each
# `_target_` entry is replaced by the object it names — confirm in
# hydra_instantiate_dictionary).
# NOTE(review): this rebinds `example_config` from the raw YAML dict to the
# instantiated result, so re-running this cell on its own feeds the already
# instantiated value back in; re-run the YAML cell first.


example_config = hydra_instantiate_dictionary(example_config)

In [None]:
from data_fabricator.v1.core.mock_generator import MockDataGenerator

# The seed is set here for the example only. Setting a seed is not recommended
# for general use; consider deliberately when to use one.
# Note: this rebinds `mock_generator`, replacing the Pattern 1 generator.
mock_generator = MockDataGenerator(tables=example_config["tables"], seed=1)
mock_generator.generate_all()

# Preview the first ten rows of the wide table.
print(mock_generator.tables["sensor_readings"].dataframe.head(10))

If we were to plot this:

In [None]:
import plotly.graph_objects as go

df = mock_generator.tables["sensor_readings"].dataframe

# One line trace per sensor column, all plotted against time_index.
fig = go.Figure()
for sensor_column in ("sensor_1", "sensor_2", "sensor_3"):
    fig.add_trace(
        go.Scatter(
            x=df["time_index"],
            y=df[sensor_column],
            mode="lines",
            name=sensor_column,
        )
    )
# The image is saved to the path embedded by the markdown cell that follows.
fig.write_image("data_fabricator/docs/images/sensor_sample2.png")

![sensor_sample2](images/sensor_sample2.png)