# Debug IonBeam Sources
The purpose of this notebook is to debug a particular source by running the individual components of its pipeline and displaying their output. Set the target source in the first code cell below and it will be selected from the config.

In [1]:
# Select the source to debug by leaving exactly one of these lines uncommented.
# The name is passed to parse_config(sources=[...]) further down.
# source_to_debug = "meteotracker"
source_to_debug = "smart_citizen_kit"
# source_to_debug = "acronet"

# Forwarded to parse_config(offline=...) further down.
# NOTE(review): presumably makes the source work from cached data instead of the network — confirm.
offline = True

In [2]:
# Reload imported modules before every cell execution so that edits to the
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# One-off setup for the `%load_ext wurlitzer` cell below: uncomment to install the
# extension into the kernel's environment (%pip targets the running kernel).
# %pip install wurlitzer

In [4]:
# NOTE(review): presumably loaded so that output written by C libraries
# (e.g. via pyfdb) shows up in the notebook — confirm it is still needed.
%load_ext wurlitzer

In [5]:
import pandas as pd
import numpy as np
import logging
import pyfdb
import findlibs
import yaml
from pathlib import Path
import os
import pandas as pd
import pyodc
import shutil
from pathlib import Path
from ionbeam.core.config_parser import parse_config
from ionbeam.core.bases import *
from IPython.display import display
from matplotlib import pyplot as plt
from ionbeam.metadata.db import init_db

In [6]:
# Show INFO-level logs in the notebook, but keep matplotlib down to warnings.
logging.basicConfig(level=logging.INFO)
logging.getLogger('matplotlib').setLevel(level=logging.WARNING)

print("Parsing config...")
config_file = Path("~/git/IonBeam-Deployment/config/ionbeam").expanduser()

# Only the source selected in the first cell is requested from the config.
parse_options = dict(
    config_path="./",
    data_path="../data/",
    offline=offline,
    environment="local",
    sources=[source_to_debug],
)
config, actions = parse_config(config_file, **parse_options)

# The database is only (re)initialised when running against the local environment.
running_locally = config.globals.environment == "local"
if running_locally:
    print("Wiping and initialising local database...")
    init_db(config.globals)

# The first Source action in the parsed chain is the one being debugged.
source_actions = [a for a in actions if isinstance(a, Source)]
source = source_actions[0]



Parsing config...




Wiping and initialising local database...


INFO:ionbeam.metadata.db:Adding 'Sensor.Community' to Authors table
INFO:ionbeam.metadata.db:Adding 'Meteotracker' to Authors table
INFO:ionbeam.metadata.db:Adding 'Acronet' to Authors table
INFO:ionbeam.metadata.db:Adding 'SmartCitizenKit' to Authors table


In [7]:
# Display the selected source action (its settings, metadata and globals).
source

0,1
id,eed0d6c1-8e5b-4fe3-b516-9e8212041926
mappings,"[InputColumn(name='time', key='time', type=None, unit=None, discard=False, canonical_variable=Non..."
finish_after,
copy_metadata_to_columns,"[station_name, lat, lon, created_at, city, country, author]"
cache_version,3
use_cache,True
cache_directory,/Users/math/git/IonBeam-Deployment/data/inputs/smart_citizen_kit

name,value
source_action_id,eed0d6c1-8e5b-4fe3-b516-9e8212041926
state,raw
mars_request,{}

name,value
canonical_variables,"[CanonicalVariable(name='sensor_name', unit=None, desc='The name of the sensor that made the obse..."
data_path,/Users/math/git/IonBeam-Deployment/data
metkit_language_template,/Users/math/git/IonBeam-Deployment/config/metkit/language.yaml.template
environment,local
fdb_schema_path,/Users/math/git/IonBeam-Deployment/config/fdb/server/custom_schema
secrets_file,/Users/math/git/IonBeam-Deployment/config/ionbeam/secrets.yaml
config_path,.
offline,True
overwrite,True
ingestion_time_constants,"IngestionTimeConstants(query_timespan=(datetime.datetime(2024, 7, 15, 13, 29, 42, 610342, tzinfo=..."


## Show the connection structure of the actions defined in the config

In [8]:
from ionbeam.core.config_parser import print_action_chains
# Print how the actions parsed from the config are chained together.
print_action_chains(actions)

SmartCitizenKitSource --> CSVParser --> Splitter --> TimeAggregator --> ODCEncoder
[Match(state = 'odc_encoded')] --> RESTWriter()


## Show which keys are extracted from the raw API data and which canonical variables they are mapped to

In [9]:
print("Source copies these external keys to these canonical variables:")
import pandas as pd

# Index the globally configured canonical variables by name so that each
# mapping can look up the unit of the canonical variable it targets.
cvs = {c.name : c for c in config.globals.canonical_variables}


def _mapping_summary(column):
    """Return one table row describing how a single source column is mapped."""
    kept = not column.discard
    return dict(
        source_key = column.key,
        source_unit = column.unit,
        discard = column.discard,
        # Discarded columns have no canonical counterpart, so leave these blank.
        canonical_name = column.name if kept else "",
        canonical_unit = cvs[column.name].unit if kept else "",
    )


pd.DataFrame.from_records(_mapping_summary(c) for c in source.mappings)

Source copies these external keys to these canonical variables:


Unnamed: 0,source_key,source_unit,discard,canonical_name,canonical_unit
0,time,,False,time,
1,device.name,,False,station_name,
2,device.name,,False,author,
3,device.location.latitude,EPSG:4326,False,lat,EPSG:4326
4,device.location.longitude,EPSG:4326,False,lon,EPSG:4326
...,...,...,...,...,...
69,adc_48_3,V,True,,
70,adc_49_0,V,True,,
71,adc_49_1,V,True,,
72,adc_49_2,V,True,,


## Get source chunks for the last seven days

In [10]:
from datetime import datetime, timedelta, timezone
import itertools as it

# Query window: the seven days leading up to now (timezone-aware, UTC).
window = timedelta(days = 7)
now = datetime.now(tz=timezone.utc)
earlier = now - window

# Look at no more than the first 20 chunks the source offers for that window.
chunk_iterable = source.get_chunks(earlier, now)
chunks = list(it.islice(chunk_iterable, 20))

# Keys of the first chunk.
chunks[0].keys()

dict_keys(['key', 'device_id', 'start_date', 'end_date', 'device'])

## Download one of those chunks

In [11]:
# Materialise everything download_chunk produces for the first chunk so it can
# be counted and inspected.
raw_messages = list(source.download_chunk(chunks[0]))
print(f"{len(raw_messages) = }")

len(raw_messages) = 1


In [12]:
# Inspect the data carried by the first downloaded message.
msg = raw_messages[0]
msg.data

Unnamed: 0_level_0,tvoc,eco2,light,noise_dba,no2,bar,o3,pm_avg_1,pm_avg_10,pm_avg_2.5,pm_pn0.3,pm_pn0.5,pm_pn1.0,pm_pn10.0,pm_pn2.5,pm_pn5.0,h,t
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-07-15 15:00:21+00:00,9.0,462.0,1297,66.47,83.430124,100.98,21.809852,8,12,12,,,,,,,46.96,35.21
2024-07-15 16:00:21+00:00,14.0,493.0,975,71.76,46.772197,100.95,11.588342,3,7,7,,,,,,,47.07,33.85
2024-07-15 17:00:21+00:00,26.0,572.0,533,65.64,54.482640,100.96,44.787014,2,4,4,,,,,,,51.20,32.56
2024-07-15 18:00:21+00:00,36.0,641.0,99,65.09,75.703655,100.97,9.398815,4,9,6,,,,,,,59.00,28.98
2024-07-15 19:00:21+00:00,77.0,907.0,8,60.84,34.563297,100.92,31.474285,2,8,6,,,,,,,64.29,26.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-22 09:00:17+00:00,0.0,400.0,1660,66.99,25.229955,100.77,29.472561,13,19,18,,,,,,,53.97,34.02
2024-07-22 10:00:17+00:00,0.0,400.0,1851,59.90,45.771545,100.78,14.966876,12,19,17,,,,,,,51.39,35.42
2024-07-22 11:00:17+00:00,0.0,400.0,1929,59.38,67.411699,100.76,71.020489,11,20,17,,,,,,,51.56,35.40
2024-07-22 12:00:17+00:00,0.0,400.0,1653,66.38,47.925488,100.74,5.169774,10,19,17,,,,,,,49.32,36.72


## Use the actual interface so that extra steps like copying metadata over get done

In [14]:
# Take at most the first 5 messages from source.generate().
# Note: this rebinds raw_messages, replacing the list built from download_chunk.
raw_messages = list(it.islice(source.generate(), 5))
len(raw_messages)

3

In [None]:
# Column labels of the first generated message's data.
raw_messages[0].data.columns

In [None]:
# The data of the first generated message.
raw_messages[0].data

In [34]:
# NOTE(review): the saved output of this cell is
#   AttributeError: 'DataFrame' object has no attribute 'station_name'
# even though 'station_name' is listed in the source's copy_metadata_to_columns.
# Either the metadata is not being copied into the data at this stage or this
# check is stale — confirm, then fix the source or remove this cell.
raw_messages[0].data.station_name

AttributeError: 'DataFrame' object has no attribute 'station_name'

## Check the station metadata that got loaded into the database by the previous step

In [14]:
import json
from sqlalchemy.orm import Session
from ionbeam.metadata import db

# Print a short summary of every station returned by the metadata database.
with Session(config.globals.sql_engine) as session:
    stations = session.query(db.Station).all()
    for station in stations:
        s = station.as_json()
        # Discard the geojson entry; only the fields below are printed.
        del s["geojson"]
        for k in ("name", "description", "external_id"):
            print(f"{k}: {s[k]}")
        # Only the properties of the station's first sensor are listed.
        print(f"Properties: {[p['name'] for p in s['sensors'][0]['properties']]}")
        print()

name: I-CHANGE CIMA CORSO EUROPA 2024
description: Smart Citizen Kit
external_id: 17054
Properties: ['total_volatile_organic_compounds', 'equivalent_carbon_dioxide']

name: I-CHANGE MELE NEW
description: Smart Citizen Kit 2.1 with Urban Sensor Board
external_id: 16763
Properties: ['total_volatile_organic_compounds', 'equivalent_carbon_dioxide']

name: I-CHANGE CIMA SAVONA
description: Smart Citizen Kit 2.1 with Urban Sensor Board
external_id: 16030
Properties: ['total_volatile_organic_compounds', 'equivalent_carbon_dioxide']

name: I-CHANGE CIMA ARENZANO
description: Smart Citizen Kit 2.1 with Urban Sensor Board
external_id: 16517
Properties: ['total_volatile_organic_compounds', 'equivalent_carbon_dioxide']

name: I-CHANGE CIMA CASTAGNA
description: Smart Citizen Kit 2.1 with Urban Sensor Board
external_id: 16643
Properties: ['total_volatile_organic_compounds', 'equivalent_carbon_dioxide']



## Pass a message through the CSVParser

In [None]:
from IPython.display import display

def display_columns(cols):
    """Show a table describing the given column definitions, indexed by name.

    Each row lists the column's source key, whether it is discarded, the unit
    declared on the column and the unit of the matching canonical variable.
    Discarded columns get an empty name and an empty canonical unit. Canonical
    units are looked up in the notebook-level ``cvs`` dict.
    """
    def _describe(column):
        kept = not column.discard
        return dict(
            key = column.key,
            discard = column.discard,
            name = column.name if kept else "",
            source_unit = column.unit,
            unit = cvs[column.name].unit if kept else "",
        )

    summary = pd.DataFrame.from_records((_describe(c) for c in cols), index = "name")
    display(summary)


# The first Parser action in the parsed chain is the one under test here.
csv_parser = [a for a in actions if isinstance(a, Parser)][0]
print("Columns that will be present in all output because they represent metadata")
display_columns(csv_parser.fixed_columns)

print("\n\nData Columns")
display_columns(csv_parser.value_columns)

In [None]:
# The first raw message's data, shown again as the input to the parser steps below.
raw_messages[0].data

In [None]:
# Inspect the parser's columns_mapping attribute.
csv_parser.columns_mapping

In [None]:
# Run only the parser's format_dataframe step on the first raw message's data
# and display what it returns.
df = csv_parser.format_dataframe(raw_messages[0].data)
df

In [None]:
def pump_through(action, messages):
    """Feed each message to ``action.process`` and collect every output.

    Outputs are returned as one flat list, in the order they were produced:
    all outputs for the first input message, then all for the second, and so on.
    """
    outputs = []
    for in_msg in messages:
        outputs.extend(action.process(in_msg))
    return outputs

# Run every raw message through the parser and keep all the messages it emits.
parsed_messages = pump_through(csv_parser, raw_messages)

In [None]:

# Timeline of the parsed messages: one horizontal bar per message, spanning the
# earliest to the latest timestamp in its data and labelled with its station id.
f, ax1 = plt.subplots(figsize = [10,5])
for i, m in enumerate(parsed_messages):
    start, end = m.data.time.min(), m.data.time.max()
    ax1.hlines(y = i, xmin = start, xmax = end, label = m.data.station_id.iloc[0])

# start/end only exist once the loop has run at least once, so skip the guides
# and the legend when there were no parsed messages.
if parsed_messages:
    # Dotted guides at the time extent of the last message plotted.
    ax1.vlines(x = [start, end], ymin = 0, ymax = len(parsed_messages), linestyle = "dotted", color = "k")
    ax1.legend()

ax1.set_xlabel("time")
ax1.set_ylabel("message index")

In [None]:
from ionbeam.aggregators import TimeAggregator
from ionbeam.parsers import Splitter

# Pick the first Splitter and the first TimeAggregator from the parsed action chain.
splitter = [a for a in actions if isinstance(a, Splitter)][0]
time_aggregator = [a for a in actions if isinstance(a, TimeAggregator)][0]

# Run the parsed messages through the splitter and report how many messages come out.
split_messages = pump_through(splitter, parsed_messages)
print(f"{len(split_messages)=}")

In [None]:
# Messages the aggregator emits while it is being fed the split messages.
aggregated_messages = pump_through(time_aggregator, split_messages)

# Force the aggregator to dump its remaining contents even if it's not happy about it,
# by sending it a FinishMessage; whatever it emits in response is kept separately.
unhappy_aggregated_messages = list(time_aggregator.process(FinishMessage("We're done!")))

aggregated_messages

In [None]:
# Timeline of the aggregator output, one row per message spanning the earliest
# to the latest timestamp in its data. Messages the aggregator emitted on its
# own come first (default colour); the ones it only released when forced by the
# FinishMessage are stacked above them in red, so the two groups never share a row.
f, ax1 = plt.subplots(figsize = [10,5])

rows = [(m, {}) for m in aggregated_messages]
rows += [(m, {"color": "red"}) for m in unhappy_aggregated_messages]

for i, (m, style) in enumerate(rows):
    start, end = m.data.time.min(), m.data.time.max()
    ax1.hlines(y = i, xmin = start, xmax = end, **style)

# Dotted guides at the time extent of the last row drawn, as tall as the number
# of rows in this plot. Skipped when there is nothing to plot, because start/end
# would be undefined.
if rows:
    ax1.vlines(x = [start, end], ymin = 0, ymax = len(rows), linestyle = "dotted", color = "k")

ax1.set_xlabel("time")
ax1.set_ylabel("message index")