# Inspecting datastreams

The goal is to understand the nature of the data and to inspect the timestamps and rates of the incoming data streams.

In [None]:
import matplotlib.pyplot as plt
import datetime
import tilemapbase as tmb
import numpy as np
import pandas as pd
from pluma.stream.georeference import Georeference
from pluma.stream.ubx import _UBX_MSGIDS
from pluma.schema import Dataset
from IPython.display import clear_output


## Notebook plotting
%matplotlib inline
%matplotlib widget

plt.style.use('ggplot')

## Figure export parameters
new_rc_params = {'text.usetex': False,
"svg.fonttype": 'none'
}
import matplotlib as mpl
mpl.rcParams.update(new_rc_params)

## Check the errors in the code below when `LOAD_FROM_REMOTE` is True
Another point is that environment.yaml should pin specific versions of the libraries that we are loading.
We also need to update readme.md to explain how to get the data from AWS.

# Set the paths to the dataset and build the dataset

In [None]:
import os  # only needed in this cell, to build the pickle path without hand-written separators

LOAD_FROM_REMOTE = True
# Set to True to load only the streams flagged below instead of every stream.
LOAD_ONLY_SELECTED_STREAMS = False
root = r"pathToRoot"

# The same file is written after ingestion and read back when LOAD_FROM_REMOTE is False.
dataset_pickle_path = os.path.join(root, "dataset.pickle")

stream_root_folder = 's3://emotional-cities/data/nrg/poc-v1/FMUL/FMUL2022_10_21_11_20_53/'  # Path to the dataset. Can be local or remote.

# Last component of the path, used as the dataset label suffix. Both "\" and "/"
# separators are accepted, and a trailing separator is ignored, so local Windows
# paths and remote URIs yield the folder name rather than the whole path.
dataset_folder_name = stream_root_folder.replace("\\", "/").rstrip("/").split("/")[-1]

# Create a Dataset object that will contain the ingested data.
dataset = Dataset(stream_root_folder, datasetlabel="FMUL_" + dataset_folder_name, georeference=Georeference())
# Add the "schema" that we want to load to our Dataset. If we want to load the
# whole dataset automatically, set autoload to True.
dataset.populate_streams(autoload=False)

if LOAD_FROM_REMOTE:
    if LOAD_ONLY_SELECTED_STREAMS:
        # To load single streams, set their autoload property to True and use
        # the Dataset.reload_streams method.
        dataset.streams.EEG.autoload = True
        dataset.streams.UBX.autoload = True
        dataset.reload_streams(force_load=False)
    else:
        # For now, we build the whole dataset: every stream is loaded at the
        # same time. This might take a while if loading from AWS.
        dataset.reload_streams(force_load=True)
        dataset.add_georeference_and_calibrate()
        # Export the result so that the ingestion does not have to be repeated.
        dataset.export_dataset(filename=dataset_pickle_path)

    # Some warnings will be printed if some sensors were not acquired during the experiment. These are normal and can be usually ignored.
else:
    # Re-import the output of a previous ingestion instead of running it again.
    # NOTE(review): presumably this unpickles the file; only import files that
    # were exported by a trusted run of this notebook.
    dataset = Dataset.import_dataset(dataset_pickle_path)

print(f"Dataset: {dataset} loaded successfully, and {'not' if not dataset.has_calibration else 'sucessfully'} calibrated." )



Define a function that computes basic statistics and plots the difference between adjacent timestamps.

In [None]:
def diff_stats(stream, start = 100, end = 200, xlabel = 'Time', ylabel = ''):
    """Print and plot the differences between adjacent timestamps of a stream.

    The index of ``stream.data`` is differenced, the result is printed as
    minimum / maximum / mean, plotted, and the slice ``start:end`` of the
    differences is printed. ``stream.data`` itself is plotted as well.

    Args:
        stream: Object exposing ``data`` (with an ``index`` and a ``plot``
            method) and ``streamlabel``.
        start: Position of the first difference to print.
        end: Position one past the last difference to print.
        xlabel: Unused; kept so existing calls keep working.
        ylabel: Unused; kept so existing calls keep working.

    Returns:
        The series of timestamp differences, or ``None`` when the differences
        could not be computed (the failure is reported on the output).
    """
    # Defined up front so the final return is valid even when the very first
    # statement of the try block fails.
    diff_series = None
    try:
        # Integer view of the index differences divided by 1e6; the plot below
        # labels the result as milliseconds.
        # NOTE(review): assumes a nanosecond-resolution datetime index - confirm.
        diff_series = stream.data.index.to_series().diff().dropna().astype(np.int64)  / int(1e6)

        # NOTE(review): the first remaining difference is left out of the
        # statistics although dropna() already removed the leading NaT -
        # confirm that this is intended.
        trimmed = diff_series.iloc[1:]
        mini = trimmed.min()
        maxi = trimmed.max()
        meani = trimmed.mean()
        print(f'Minimum = {mini}. Maximum = {maxi}. Mean = {meani}') 

        plt.figure()
        plt.plot(diff_series)
        plt.title(stream.streamlabel + ' Difference')
        plt.ylabel('Time Difference (ms)')
        plt.xlabel(stream.data.index.name)

        stream.data.plot(title = stream.streamlabel)

        # Explicit positional slice of the differences.
        print(diff_series.iloc[start:end])
    except Exception as error:
        # Best effort: drop any partial output, but say what went wrong instead
        # of failing silently. KeyboardInterrupt is no longer swallowed.
        clear_output(wait=False)
        label = getattr(stream, 'streamlabel', stream)
        print(f'Could not inspect stream {label!r}: {error!r}')
    return diff_series

### Check basic statistics and plot timestamp difference

[ECG](https://neurogears.sharepoint.com/:b:/s/EmotionalCities/EYOX02N88hRHnUCdREf_kq0BEoxvZY92nHfPOPZmq7Ua3Q?e=xWQPvN) 1 kHz


In [None]:
# ECG stream of the BioData group; inspect the spacing of its timestamps.
ecg = dataset.streams.BioData.ECG
diff_stats(stream=ecg)


Microphone 

The microphone has two streams. Data is currently captured at 44100 Hz with buffers of 0.1 ms, which means that every buffer has 5 samples:
 1. The timestamps for each buffer sample 
 2. Each buffer data 

In [None]:
# Buffer-index stream of the microphone; inspect the spacing of its timestamps.
audio_timestamps = dataset.streams.Microphone.BufferIndex
diff_stats(stream=audio_timestamps)

In [None]:
# Raw audio data of the microphone stream.
audio_data = dataset.streams.Microphone.Audio.data

# Number of rows divided by 5 (the buffer size used in this notebook).
n_audio_rows = audio_data.shape[0]
number_of_audio_buffers = n_audio_rows / 5

# Distinct index values of the audio timestamps, in ascending order.
audio_timestamp_index = pd.DataFrame(audio_timestamps.data).index
unique_set = sorted(set(audio_timestamp_index))

In [None]:
# Compare the number of audio timestamps with the number of audio buffers.
n_audio_timestamps = audio_timestamps.data.shape[0]

print('Difference between number of audio timestamps and number of audio buffers')
print(number_of_audio_buffers - n_audio_timestamps)
print(f'audio_timestamps = {n_audio_timestamps}')
print(f'number_of_audio_buffers = {number_of_audio_buffers}')
print('Unique Timestamps :')
# Last expression of the cell, so the notebook displays it.
unique_set

## Tinkerforge GPS V2
From [tinkerforge webpage](https://www.tinkerforge.com/en/doc/Hardware/Bricklets/GPS_V2.html)
Supports GPS and GLONASS simultaneously
Receives movement-, position-, altitude, time data and PPS signal
Elevation, azimuth and SNR for each GPS/GLONASS satellite accessible
99 channels, 10Hz update rate
 - Latitude 
 - Longitude 
 - Altitude
 - Date 
 - Time
 - Has Fix


Latitude is only measured when GPS receives satellite signals

In [None]:
# Latitude stream of the Tinkerforge GPS; inspect the spacing of its timestamps.
tk_latitude = dataset.streams.TK.GPS.Latitude
diff_stats(stream=tk_latitude)

Longitude is only measured when GPS receives satellite signals

In [None]:
# Longitude stream of the Tinkerforge GPS; inspect the spacing of its timestamps.
tk_longitude = dataset.streams.TK.GPS.Longitude
diff_stats(stream=tk_longitude)

Altitude is only measured when GPS receives satellite signals

In [None]:
# Altitude stream of the Tinkerforge GPS; inspect the spacing of its timestamps.
tk_altitude = dataset.streams.TK.GPS.Altitude
diff_stats(tk_altitude)

Date: the day. The stream is currently named `Data` (it should be renamed to `Date`).

In [None]:
# Date stream of the Tinkerforge GPS (exposed under the attribute name "Data").
tk_date = dataset.streams.TK.GPS.Data
diff_stats(stream=tk_date)

Time: the hours in the day

In [None]:
# Time stream of the Tinkerforge GPS; inspect the spacing of its timestamps.
tk_time = dataset.streams.TK.GPS.Time
diff_stats(stream=tk_time)

HasFix means that the GPS has position information from the satellites

In [None]:
# HasFix stream of the Tinkerforge GPS; inspect the spacing of its timestamps.
tk_has_fix = dataset.streams.TK.GPS.HasFix
diff_stats(stream=tk_has_fix)

## Air quality 
From [tinkerforge webpage](https://www.tinkerforge.com/en/doc/Hardware/Bricklets/Air_Quality.html)
Measures IAQ (indoor air quality) index, air pressure, humidity and temperature
IAQ index and humidity values are temperature compensated  
Configurable temperature compensation for use cases in enclosures
 - IAQ (indoor air quality) index
 - Temperature in °C
 - Humidity in %RH
 - Air pressure in hPa

IAQ (indoor air quality) index

In [None]:
# IAQ index stream of the air-quality sensor; inspect the spacing of its timestamps.
tk_air_quality = dataset.streams.TK.AirQuality.IAQIndex
diff_stats(stream=tk_air_quality)

Temperature in °C

In [None]:
# Temperature stream of the air-quality sensor; inspect the spacing of its timestamps.
tk_temperature = dataset.streams.TK.AirQuality.Temperature
diff_stats(stream=tk_temperature)

Humidity in %RH

In [None]:
# Humidity stream of the air-quality sensor; inspect the spacing of its timestamps.
tk_humidity = dataset.streams.TK.AirQuality.Humidity
diff_stats(stream=tk_humidity)

Air pressure in hPa

In [None]:
# Air-pressure stream of the air-quality sensor; inspect the spacing of its timestamps.
tk_air_pressure = dataset.streams.TK.AirQuality.AirPressure
diff_stats(stream=tk_air_pressure)

## Tinkerforge Sound pressure 
From [tinkerforge webpage](https://www.tinkerforge.com/en/doc/Hardware/Bricklets/Sound_Pressure_Level.html)
The stored values are in dB × 10


In [None]:
# Sound-pressure-level stream; inspect the spacing of its timestamps.
tk_sound_pressure = dataset.streams.TK.SoundPressureLevel.SPL
diff_stats(stream=tk_sound_pressure)


In [None]:
# Scale the stored values down by 10 before plotting them on a dB axis.
sound_pressure_scaled = tk_sound_pressure.data / 10
ax = sound_pressure_scaled.plot(title='dB')
ax.set_ylabel("Sound Pressure Level (dB)")


## Tinkerforge Humidity 
From [tinkerforge webpage](https://www.tinkerforge.com/en/doc/Hardware/Bricklets/Humidity_V2.html)
Values stored are in relative humidity x 100.0

In [None]:
# Humidity stream of the dedicated humidity sensor; inspect the spacing of its timestamps.
tk_humidity = dataset.streams.TK.Humidity.Humidity
diff_stats(stream=tk_humidity)

In [None]:
# Scale the stored values down by 100 before plotting them on a percentage axis.
humidity_scaled = tk_humidity.data / 100
ax = humidity_scaled.plot(title='RH%')
ax.set_ylabel("Relative Humidity (%)")