In [9]:
import mne
from netCDF4 import Dataset
import json
import dask.array as da
from dask import delayed
import os, logging
from services.utils.timing import TimingContext
import pyarrow as pa
from services.delta_lake import Duck_Lake

print("Imports complete")
logging.basicConfig()
logging.root.setLevel(logging.INFO)


my_edf_file_path = os.path.join(
    os.environ["CONTAINER_FILE_STORAGE_PATH"],
    "test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf",
)
my_parquet_output_dir = os.path.join(os.environ["CONTAINER_FILE_STORAGE_PATH"], "test")

from dataclasses import dataclass, asdict
from typing import List


@dataclass
class SignalSchema:
    signal_name: str
    frequency: float
    start_time: float
    data: List[float]


def read_signal(
    edf_file_path,
    signal_name,
):
    """Function to read a single signal from an EDF file."""
    raw = mne.io.read_raw_edf(edf_file_path, preload=False)
    signal = raw.pick(signal_name).get_data()
    return SignalSchema(
        signal_name=signal_name,
        frequency=raw.info["sfreq"],
        start_time=pa.scalar(raw.info["meas_date"], type=pa.timestamp('s')),
        # start_time=raw.info["meas_date"],
        data=signal[0],
    )


@delayed
def delayed_read_signal(edf_file_path, signal_name):
    return read_signal(edf_file_path, signal_name)


misc_channels = [
    "pitch",
    "roll",
    "heading",
    "GyrZ",
    "MagZ",
    "Tag_On",
    "Depth",
    "MagX",
    "MagY",
]


def process_edf(
    edf_file_path: str,
    schema: pa.schema,
    misc_channels: List[str] = misc_channels,
):
    with TimingContext("EDF Read"):
        raw = mne.io.read_raw_edf(edf_file_path, preload=False)

        channel_types = dict()
        
        for k in raw.ch_names:
            if k in misc_channels:
                channel_types[k] = "misc"
            else:
                channel_types[k] = "eeg"
        raw.set_channel_types(channel_types)
        
        channels_to_use = [ch for ch in raw.ch_names if ch not in misc_channels]
        
        buff = []
        for signal_name in channels_to_use[0:2]:
            signal = read_signal(edf_file_path, signal_name)
            buff.append(asdict(signal))
        logging.info(buff)
        table = pa.Table.from_pylist(buff, schema=schema)
        logging.info(table)
        ducklake = Duck_Lake()
        ducklake.write_to_delta(
            data=table,
            schema=schema,
            mode="append",
            partition_by=['signal_name'],
            name="test",
            description="test"
        )


schema = pa.schema(
    [
        pa.field("signal_name", pa.string()),
        pa.field("frequency", pa.float64()),
        pa.field("start_time", pa.timestamp("s")),
        pa.field("data", pa.list_(pa.float64())),
    ]
)

with TimingContext("Main"):
    process_edf(my_edf_file_path, schema)


# # Process each signal separately and write to a Parquet file
# for signal in signals:
#     # Load data for the current signal
#     signal_data = load_signal_data(signal)

#     # Create a PyArrow table from the data
#     table = pa.table(signal_data, schema=schema)

#     # Write the table to a Parquet file, partitioned by signal name
#     pq.write_table(table, f"{output_parquet_dir}/{signal}.parquet")

# # Once all signals are processed, combine them into a Delta Lake table

# # List of Parquet files to be combined
# parquet_files = [f"{output_parquet_dir}/{signal}.parquet" for signal in signals]

# # Convert each Parquet file into a Pandas DataFrame and write to Delta Lake
# for parquet_file in parquet_files:
#     # Load the Parquet file into a Pandas DataFrame
#     df = pd.read_parquet(parquet_file)

#     # Write the DataFrame to the Delta Lake table (append mode)
#     write_deltalake("/path/to/delta-lake", df, mode="append", partition_by=["signal_name"])

INFO:root:Starting Main...
INFO:root:Starting EDF Read...


Imports complete
Extracting EDF parameters from /data/files/test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Extracting EDF parameters from /data/files/test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  raw.set_channel_types(channel_types)
  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=False)


Extracting EDF parameters from /data/files/test33_HypoactiveHeidi_05_DAY1_PROCESSED.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
  raw = mne.io.read_raw_edf(edf_file_path, preload=False)
INFO:root:[{'signal_name': 'ECG_Raw_Ch1', 'frequency': 500.0, 'start_time': <pyarrow.TimestampScalar: '2021-04-20T18:02:08+0000'>, 'data': array([ 3.96179072e-04,  3.94455253e-04,  3.91199152e-04, ...,
        1.01597684e-04,  8.68494607e-05, -4.14792323e-05])}, {'signal_name': 'ECG_ICA8', 'frequency': 500.0, 'start_time': <pyarrow.TimestampScalar: '2021-04-20T18:02:08+0000'>, 'data': array([-3.09471060e-05, -3.25372927e-05, -3.21631312e-05, ...,
        6.70832314e-05,  6.43705599e-05,  1.74841770e-04])}]
INFO:root:pyarrow.Table
signal_name: string
frequency: double
start_time: timestamp[s, tz=UTC]
data: list<item: double>
  child 0, item: double
----
signal_name: [["ECG_Raw_Ch1","ECG_ICA8"]]
frequency: [[500,500]]
start_time: [[2021-04-20 18:02:08Z,2021-04-20 18:02:08Z]]
data: [[[0.0003961790716411078,0.00039445525337605857,0.0003911991522087434,0.0003860276974135958,0.

SchemaMismatchError: Invalid data type for Delta Lake: Timestamp(Second, Some("UTC"))