## Goals before Sunday

- Create Metadata Manager
    - Enable querying of Notion
    - Show what exists in Notion vs Postgres
    - Import from Notion
    ```python
    from services.metadata import MetadataManager
    from services.file_uploader import DiveDBUploader

    uploader = DiveDBUploader(configs)
    metadata_manager = MetadataManager(configs?)
    metadata_manager.list_models(model_type=("animal", "deployment", "animal_deployment", "logger", "recording"))
    metadata_manager.compare_to_notion(model_type=("animal", "deployment", "animal_deployment", "logger", "recording"))
    metadata_manager.import_from_notion(model_type=("animal", "deployment", "animal_deployment", "logger", "recording"))
    ```    

- Update File Uploader
    - Add hypno upload
    - Add edf upload
        - Use csv_metadata_map to map the metadata to the edf file
    - Write to OpenStack 
    - Write to Postgres

    ```python
    uploader.upload_edf(edf_file_path, csv_metadata_path, csv_metadata_map)
    uploader.upload_hypno(csv_hypno_path, type=("graph"|"gram"))
    ```

    - Include some prompts with questions for the uploader

- Update Delta Lake
    - Read from Delta Lake using metadata
    - Write to OpenStack 
    - Import from OpenStack by Metadata
    ```python
    ducklake.read_from_delta(
        animal_ids=[],
        deployment_ids=[],
        logger_ids=[],
        recording_ids=[],
        signal_names=[]
    )
    ```

In [1]:

import mne
from netCDF4 import Dataset
import json
import dask.array as da
from dask import delayed
import os, logging
from services.utils.timing import TimingContext
import pyarrow as pa
import pyarrow.compute as pc
from services.delta_lake import Duck_Lake
from services.utils.directory_utils import get_tmpdir
from prefect import flow, task
from prefect_dask import DaskTaskRunner
from dataclasses import dataclass, asdict
from typing import List
from uuid import uuid4 as uuid
from datetime import timedelta
import numpy as np
import gc
from datetime import datetime

import django

logging.basicConfig()
logging.root.setLevel(logging.INFO)

ducklake = Duck_Lake()

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "server.django_app.settings")
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

from server.metadata.models import (
    Deployments,
    Loggers,
    Animals,
    Recordings,
    AnimalDeployments,
    Files
)

In [2]:
logging.basicConfig()
logging.root.setLevel(logging.INFO)

my_edf_file_path = os.path.join(
    os.environ["CONTAINER_FILE_STORAGE_PATH"],
    "test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf",
)
my_parquet_output_dir = os.path.join(os.environ["CONTAINER_FILE_STORAGE_PATH"], "test")

# Non-EEG
misc_channels = [
    "pitch",
    "roll",
    "heading",
    "GyrZ",
    "MagZ",
    "Tag_On",
    "Depth",
    "MagX",
    "MagY",
]

@dataclass
class SignalMetadata:
    signal_name: str
    frequency: float
    start_time: str
    end_time: str

@dataclass
class SignalData:
    signal_name: str
    # year: int
    # month: int
    # day: int
    # hour: int
    time: str
    data: float

def numpy_datetime64_to_datetime(np_datetime):
    return np_datetime.astype('datetime64[us]').astype(datetime)

# @task
def read_signal(
    edf_file_path,
    signal_name,
    mode: str = "SINGLE"
):
    """Function to read a single signal from an EDF file."""
    raw = mne.io.read_raw_edf(edf_file_path, include=[signal_name], preload=False)
    signal = raw.pick(signal_name).get_data()
    data = signal[0]
    start_time = np.datetime64(raw.info["meas_date"])
    freq = raw.info["sfreq"]
    data_indices = np.arange(len(data)) / float(freq)
    timedelta_array = (data_indices * 1000000).astype('timedelta64[us]')
    times = pa.array((start_time + timedelta_array).astype(float))
    end_time = np.datetime64(int(times[-1].as_py()), 'us')
    signal_name_arr = np.full(len(times), signal_name)
    
    if mode == 'SINGLE':
        return (
            SignalData(
                signal_name=signal_name_arr,
                # year=year,
                # month=month,
                # day=day,
                # hour=hour,
                time=times,
                data=data
            ),
            SignalMetadata(
                signal_name=[signal_name],
                frequency=[freq],
                start_time=[start_time],
                end_time=[end_time]
            ),
        )
     

# @flow
def process_edf(
    edf_file_path: str,
    schema: pa.schema
):
    with TimingContext("EDF Read"):
        raw = mne.io.read_raw_edf(edf_file_path, preload=False)
        
        mode="overwrite"
        for signal_name in raw.ch_names[0:2]:
            logging.info(signal_name)
            signalData, signalMetadata = read_signal(edf_file_path, signal_name)
            logging.info(signalMetadata)
            table = pa.table(asdict(signalData), schema=schema)
            
            # Find a random animal and deployment and logger
            animal = Animals.objects.order_by('?').first()
            deployment = Deployments.objects.order_by('?').first()
            logger = Loggers.objects.order_by('?').first()
            
            # Find a random animal and deployment and logger
            animal = Animals.objects.order_by('?').first()
            deployment = Deployments.objects.order_by('?').first()
            logger = Loggers.objects.order_by('?').first()

            # Create a new animal deployment (if one doesn't exist)
            animal_deployment = AnimalDeployments.objects.create(
                animal=animal,
                deployment=deployment
            )

            # Create a new recording
            recording = Recordings.objects.get_or_create(
                animal_deployment=animal_deployment,
                logger=logger,
                start_time=numpy_datetime64_to_datetime(signalMetadata.start_time[0]).strftime('%Y-%m-%d %H:%M:%S.%f'),
                end_time=numpy_datetime64_to_datetime(signalMetadata.end_time[0]).strftime('%Y-%m-%d %H:%M:%S.%f')
            )
            
            # Create a new file
            file = Files.objects.create(
                recording=recording,
                file_path=edf_file_path,
                extension="edf",
                type="data"
            )
            
            ducklake.write_to_delta(
                data=table,
                schema=schema,
                mode=mode,
                partition_by=['signal_name'],
                name=file.file_path,
                description="test"
            )
            del table, signalData
            gc.collect()




DataSchema = pa.schema(
    [
        pa.field("signal_name", pa.string()),
        # pa.field("year", pa.int16()),
        # pa.field("month", pa.int16()),
        # pa.field("day", pa.int16()),
        # pa.field("hour", pa.int16()),
        
        # Uncomment if working with dates
        # pa.field("time", pa.timestamp('us', tz="UTC")),
        pa.field("time", pa.float64()),
        pa.field("data", pa.float64()),
    ]
)

MetadataSchema = pa.schema(
    [
        pa.field("signal_name", pa.string()),
        pa.field("freq", pa.int16()),
        pa.field("start_time", pa.timestamp('us', tz="UTC")),
        pa.field("end_time", pa.timestamp('us', tz="UTC")),
    ]
)





uploader = DiveDBUploader(
    auth_token="",
    metadata_host="",
    delta_lake_path=os.environ["CONTAINER_DELTA_LAKE_PATH"]
)

uploader.lookup_loggers()
uploader.set_logger()
uploader.create_logger()

uploader.lookup_animals()
uploader.set_animal()
uploader.create_animal()

uploader.lookup_adeployemnts()
uploader.set_deployemnts()
uploader.create_deployemnts()

uploader.lookup_recordings()
uploader.set_recordings()
uploader.create_recordings()


uploader.set_metadata(
    loggers=None,
    animals=None,
    deployemnts=None,
    recordings=None,
)
uploader.upload_files()

# Add project id to animal model
# Load up hypoactive Heidi
# Load up another seals
# Load up the matching hypotrack (if not hypnogram, then create one)
# Heart rate peak detection
    # in pyologger Feature generation utils

with TimingContext("Main"):
    process_edf(my_edf_file_path, DataSchema)


Extracting EDF parameters from /data/files/test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  value = np.nanmax([_prefilter_float(x) for x in values])
  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  value = np.nanmin([_prefilter_float(x) for x in values])


Extracting EDF parameters from /data/files/test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  start_time = np.datetime64(raw.info["meas_date"])




ValueError: Cannot assign "(<Recordings: Recordings object (20210420_oror-002_CC-96)>, True)": "Files.recording" must be a "Recordings" instance.

In [None]:
# Define the recording ID you want to query

# Query to get the file path from Django ORM
file_record = Files.objects.order_by('?').first()
if file_record:
    file_path = file_record.file_path

else:
    raise ValueError(f"No file found for file: {file_record}")

# Now use the retrieved file path in your Delta Lake query
df2 = ducklake.conn.execute(
    f'''
    
        SELECT signal_name, data, time::INT64 as t
        FROM delta_scan('{os.environ["CONTAINER_DELTA_LAKE_PATH"]}')
    
    '''
).pl()
display(df2)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

signal_name,data,t
str,f64,i64
"""ECG_ICA8""",-0.000031,1618941728000000
"""ECG_ICA8""",-0.000033,1618941728002000
"""ECG_ICA8""",-0.000032,1618941728004000
"""ECG_ICA8""",-0.000032,1618941728006000
"""ECG_ICA8""",-0.000031,1618941728008000
…,…,…
"""ECG_ICA8""",0.000065,1619028127990000
"""ECG_ICA8""",0.000067,1619028127992000
"""ECG_ICA8""",0.000067,1619028127994000
"""ECG_ICA8""",0.000064,1619028127996000


In [None]:
df_pivot = df2.pivot(on="signal_name", values="data")
df_pivot.plot.line(x="t", y=["ECG_ICA8"])  

: 

: 

: 