In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the library source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import logging

logging.basicConfig(level = logging.INFO) # Change this to logging.DEBUG or logging.INFO for more/less detailed debugging information.

# Load in the config files to create iot-ingester actions

In [6]:
from pathlib import Path
from obsproc.core.config_parser import parse_config
from dataclasses import asdict

# Parse the YAML configuration file that describes the ingestion pipeline.
config_file = Path("config/config.yaml")
config = parse_config(config_file)

# Construct the pipeline from the config: map every stage name listed in
# config.pipeline to the attribute of the same name on the config object.
pipeline = dict(
    (stage_name, getattr(config, stage_name)) for stage_name in config.pipeline
)

# Print the global settings one per line. The value of canonical_variables
# is replaced by "..." in the printout.
print("Global Config:")
for field, value in asdict(config.global_config).items():
    shown = "..." if field == "canonical_variables" else value
    print(f"    {field} : {shown}")

# Print the string form of every action configured in each stage.
for stage_name, stage_actions in pipeline.items():
    print(f"\nStage: {stage_name.capitalize()}")
    for step in stage_actions:
        print(f"    {step!s}")

Global Config:
    canonical_variables : ...
    config_path : /Users/math/git/iot-ingester-deployment/dockerfiles/worker/iot-ingester/notebooks/sensor.community/config
    data_path : /Users/math/git/iot-ingester-deployment/dockerfiles/worker/iot-ingester/notebooks/sensor.community/data
    code_source : {'repo_status': 'Dirty', 'git_hash': '7b31e2402adb291e13c0d02d989eaf859449567d'}

Stage: Sources
    MultiFileSource(['**/*.csv'], source = 'sensor.community')

Stage: Other_processors
    CSVChunker([Match(state = 'big_file', source = 'sensor.community')])
    QualityControl(a_string)

Stage: Parsers
    CSVParser([Match(state = 'raw', source = 'sensor.community')])

Stage: Aggregators
    TimeAggregator([Match(state = 'parsed', source = 'sensor_community')], 1min, youngest)

Stage: Encoders
    CSVEncoder([Match(state = 'quality_controlled', source = 'sensor.community')])
    ODCEncoder([Match(state = 'quality_controlled', source = 'sensor.community')])


In [11]:
from obsproc.core.bases import FinishMessage, FileMessage, MetaData
from obsproc.aggregators import TimeAggregator

# Hand-build the starting message: a FileMessage in the 'big_file' state that
# points at one monthly sensor.community CSV file.
# May need to download this file manually.
message = FileMessage(
    metadata=MetaData(
        state='big_file',
        source='sensor_community',
        filepath=Path('data/inputs/sensor_community/2023-08/2023-08_sds011.csv'),
    )
)

# Flatten the actions of every stage except the first entry of the pipeline
# dict into a single list (the first entry is skipped by the [1:] slice).
actions = []
for stage_actions in list(pipeline.values())[1:]:
    actions.extend(stage_actions)

# Walk the message through the pipeline one step at a time (at most 10 steps),
# displaying each intermediate message and the actions that match it.
message_history = []
for _ in range(10):
    display(message)
    message_history.append(message)
    matching = list(filter(lambda candidate: candidate.matches(message), actions))

    # Nothing matches any more: stop walking.
    if len(matching) == 0:
        print("No more matches, the message is fully processed!")
        break

    print("That messages matches with this/these action(s): \n\n", "\n".join(map(str, matching)))
    # Only the first matching action is followed; other matches are listed but not run.
    action = matching[0]
    print("\nHere's the first output message from passing the previous message to the first matching action: \n")

    if action.__class__.__name__ != "TimeAggregator":
        # Regular actions: keep only the first message the action emits.
        message = next(action.process(message))
    else:
        # Special case for the TimeAggregator: feed it the message and run the
        # iterator to completion (whatever it yields is discarded), then take
        # the first message it emits in response to a FinishMessage.
        for _unused in action.process(message):
            pass
        message = next(action.process(FinishMessage("We're done!")))

0,1
state,big_file
source,sensor_community
filepath,data/inputs/sensor_community/2023-08/2023-08_sds011.csv


That messages matches with this/these action(s): 

 CSVChunker([Match(state = 'big_file', source = 'sensor.community')])

Here's the first output message from passing the previous message to the first matching action: 



0,1
state,raw
source,sensor_community

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

sensor_id,sensor_type,location,lat,lon,timestamp,P1,durP1,ratioP1,P2,durP2,ratioP2
78798,SDS011,68146,51.578000,4.778000,2023-08-01T00:00:00,,,,,,
13649,SDS011,6898,50.918000,4.688000,2023-08-01T00:00:00,0.00,,,0.00,,
24225,SDS011,51462,51.422000,5.514000,2023-08-01T00:00:00,0.00,,,0.00,,
32599,SDS011,19277,51.778000,7.912000,2023-08-01T00:00:00,0.00,,,0.00,,
37227,SDS011,23123,43.125047,25.682165,2023-08-01T00:00:00,0.00,,,0.00,,
44648,SDS011,30354,51.130280,5.607592,2023-08-01T00:00:00,0.00,,,0.00,,
51606,SDS011,37691,53.264591,6.483177,2023-08-01T00:00:00,0.00,,,0.00,,
62884,SDS011,48971,47.539732,19.113040,2023-08-01T00:00:00,0.00,,,0.00,,
65226,SDS011,51805,49.012000,12.104000,2023-08-01T00:00:00,0.00,,,0.00,,
70729,SDS011,58315,53.336000,55.932000,2023-08-01T00:00:00,0.00,,,0.00,,


That messages matches with this/these action(s): 

 CSVParser([Match(state = 'raw', source = 'sensor.community')])

Here's the first output message from passing the previous message to the first matching action: 



0,1
state,parsed
source,sensor_community
observation_variable,P1

0,1,2,3
time,"datetime64[ns, UTC]",,The time that the observation was made.
station_id,object,,A unique identifer for a stationary sensor.
sensor_type,object,,
location,int,,
lat,float64,°,"The lattitude of the observation, referenced to WGS84 (EPSG: 4326)"
lon,float64,°,"The longitude of the observation, referenced to WGS84 (EPSG: 4326)"
P1,float64,,

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVParser

time,station_id,sensor_type,location,lat,lon,P1
2023-08-01 00:00:00+00:00,78798,SDS011,68146,51.578000,4.778000,
2023-08-01 00:00:00+00:00,13649,SDS011,6898,50.918000,4.688000,0.00
2023-08-01 00:00:00+00:00,24225,SDS011,51462,51.422000,5.514000,0.00
2023-08-01 00:00:00+00:00,32599,SDS011,19277,51.778000,7.912000,0.00
2023-08-01 00:00:00+00:00,37227,SDS011,23123,43.125047,25.682165,0.00
2023-08-01 00:00:00+00:00,44648,SDS011,30354,51.130280,5.607592,0.00
2023-08-01 00:00:00+00:00,51606,SDS011,37691,53.264591,6.483177,0.00
2023-08-01 00:00:00+00:00,62884,SDS011,48971,47.539732,19.113040,0.00
2023-08-01 00:00:00+00:00,65226,SDS011,51805,49.012000,12.104000,0.00
2023-08-01 00:00:00+00:00,70729,SDS011,58315,53.336000,55.932000,0.00


That messages matches with this/these action(s): 

 TimeAggregator([Match(state = 'parsed', source = 'sensor_community')], 1min, youngest)

Here's the first output message from passing the previous message to the first matching action: 



0,1
state,time_aggregated
source,sensor_community
observation_variable,P1
time_slice,2023-08-01 00:00

0,1,2,3
time,"datetime64[ns, UTC]",,The time that the observation was made.
station_id,object,,A unique identifer for a stationary sensor.
sensor_type,object,,
location,int,,
lat,float64,°,"The lattitude of the observation, referenced to WGS84 (EPSG: 4326)"
lon,float64,°,"The longitude of the observation, referenced to WGS84 (EPSG: 4326)"
P1,float64,,

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVParser

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,TimeAggregator

time,station_id,sensor_type,location,lat,lon,P1
2023-08-01 00:00:00+00:00,78798,SDS011,68146,51.578000,4.778000,
2023-08-01 00:00:00+00:00,13649,SDS011,6898,50.918000,4.688000,0.00
2023-08-01 00:00:00+00:00,24225,SDS011,51462,51.422000,5.514000,0.00
2023-08-01 00:00:00+00:00,32599,SDS011,19277,51.778000,7.912000,0.00
2023-08-01 00:00:00+00:00,37227,SDS011,23123,43.125047,25.682165,0.00
2023-08-01 00:00:00+00:00,44648,SDS011,30354,51.130280,5.607592,0.00
2023-08-01 00:00:00+00:00,51606,SDS011,37691,53.264591,6.483177,0.00
2023-08-01 00:00:00+00:00,62884,SDS011,48971,47.539732,19.113040,0.00
2023-08-01 00:00:00+00:00,65226,SDS011,51805,49.012000,12.104000,0.00
2023-08-01 00:00:00+00:00,70729,SDS011,58315,53.336000,55.932000,0.00


That messages matches with this/these action(s): 

 QualityControl(a_string)

Here's the first output message from passing the previous message to the first matching action: 



0,1
state,quality_controlled
source,sensor_community
observation_variable,P1
time_slice,2023-08-01 00:00

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVParser

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,TimeAggregator

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,QualityControl

time,station_id,sensor_type,location,lat,lon,P1
2023-08-01 00:00:00+00:00,78798,SDS011,68146,51.578000,4.778000,0
2023-08-01 00:00:00+00:00,13649,SDS011,6898,50.918000,4.688000,0
2023-08-01 00:00:00+00:00,24225,SDS011,51462,51.422000,5.514000,0
2023-08-01 00:00:00+00:00,32599,SDS011,19277,51.778000,7.912000,0
2023-08-01 00:00:00+00:00,37227,SDS011,23123,43.125047,25.682165,0
2023-08-01 00:00:00+00:00,44648,SDS011,30354,51.130280,5.607592,0
2023-08-01 00:00:00+00:00,51606,SDS011,37691,53.264591,6.483177,0
2023-08-01 00:00:00+00:00,62884,SDS011,48971,47.539732,19.113040,0
2023-08-01 00:00:00+00:00,65226,SDS011,51805,49.012000,12.104000,0
2023-08-01 00:00:00+00:00,70729,SDS011,58315,53.336000,55.932000,0


That messages matches with this/these action(s): 

 CSVEncoder([Match(state = 'quality_controlled', source = 'sensor.community')])
ODCEncoder([Match(state = 'quality_controlled', source = 'sensor.community')])

Here's the first output message from passing the previous message to the first matching action: 



0,1
state,encoded
source,sensor_community
observation_variable,P1
time_slice,2023-08-01 00:00
encoded_format,csv
filepath,/Users/math/git/iot-ingester-deployment/dockerfiles/worker/iot-ingester/notebooks/sensor.communit...

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVParser

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,TimeAggregator

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,QualityControl

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,CSVEncoder


No more matches, the message is fully processed!


In [17]:
# Rebuild the starting message: a FileMessage in the 'big_file' state that
# points at one monthly sensor.community CSV file.
# May need to download this file manually.
message = FileMessage(
    metadata=MetaData(
        state='big_file',
        source='sensor_community',
        filepath=Path('data/inputs/sensor_community/2023-08/2023-08_sds011.csv'),
    )
)

# Silently run at most four pipeline steps. In the recorded run of the
# previous cell the first four steps were CSVChunker, CSVParser,
# TimeAggregator and QualityControl, so this stops before the encoders.
message_history = []
for _ in range(4):
    message_history.append(message)
    matching = list(filter(lambda candidate: candidate.matches(message), actions))

    # Nothing matches any more: stop early.
    if len(matching) == 0:
        break

    # Only the first matching action is followed.
    action = matching[0]

    if action.__class__.__name__ != "TimeAggregator":
        # Regular actions: keep only the first message the action emits.
        message = next(action.process(message))
    else:
        # Special case for the TimeAggregator: feed it the message and run the
        # iterator to completion (whatever it yields is discarded), then take
        # the first message it emits in response to a FinishMessage.
        for _unused in action.process(message):
            pass
        message = next(action.process(FinishMessage("We're done!")))

# Display the message as it stands after the steps above.
message

0,1
state,quality_controlled
source,sensor_community
observation_variable,P1
time_slice,2023-08-01 00:00

0,1
name,FileMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVChunker

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = None)"

0,1
name,CSVParser

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,TimeAggregator

0,1
name,TabularMessage
metadata,"MetaData(source = sensor_community, variable = P1)"

0,1
name,QualityControl

time,station_id,sensor_type,location,lat,lon,P1
2023-08-01 00:00:00+00:00,78798,SDS011,68146,51.578000,4.778000,100
2023-08-01 00:00:00+00:00,13649,SDS011,6898,50.918000,4.688000,100
2023-08-01 00:00:00+00:00,24225,SDS011,51462,51.422000,5.514000,100
2023-08-01 00:00:00+00:00,32599,SDS011,19277,51.778000,7.912000,100
2023-08-01 00:00:00+00:00,37227,SDS011,23123,43.125047,25.682165,100
2023-08-01 00:00:00+00:00,44648,SDS011,30354,51.130280,5.607592,100
2023-08-01 00:00:00+00:00,51606,SDS011,37691,53.264591,6.483177,100
2023-08-01 00:00:00+00:00,62884,SDS011,48971,47.539732,19.113040,100
2023-08-01 00:00:00+00:00,65226,SDS011,51805,49.012000,12.104000,100
2023-08-01 00:00:00+00:00,70729,SDS011,58315,53.336000,55.932000,100
