# AICore-Bridge

> Core helpers for converting, serialising and resampling timeseries data with pandas DataFrames.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import typing
import os
import numpy as np
import pandas as pd

In [None]:
#| export
# Announce where this module was loaded from. `__file__` is not defined in
# every execution context (for example some interactive sessions), so the
# message is strictly best-effort and must never make the import fail.
try:
    print(f"Loading {__name__} from {__file__}")
except Exception:
    # Deliberately ignored. `Exception` (rather than a bare `except`) keeps
    # KeyboardInterrupt and SystemExit propagating as they should.
    pass

## Timeseries dataframes

Timeseries data is the cornerstone of our data manipulation, and most of the processing operates on it.

### `set_time_index_zone`

Utility to set the timezone on a datetime index

In [None]:
#| export

def set_time_index_zone(df:pd.DataFrame, timezone):
    """Name the datetime index 'time' and express it in `timezone`.

    A timezone-naive index is interpreted as UTC before it is converted. An
    index that already carries a timezone is converted only when that zone
    differs from `timezone`. Frames whose index is not a `DatetimeIndex` are
    returned untouched.

    Parameters
    ----------
    df : DataFrame to adjust; it is modified in place.
    timezone : target timezone, as a name (e.g. 'Europe/Amsterdam') or a
        tzinfo object.

    Returns
    -------
    The same `df` object, so calls can be chained.
    """
    if isinstance(df.index, pd.DatetimeIndex):
        df.index.name = 'time'
        if df.index.tz is None:
            # Naive timestamps are taken to be UTC.
            df.index = df.index.tz_localize('UTC').tz_convert(timezone)
        elif str(df.index.tz) != str(timezone):
            # Compare by name so a tzinfo object and its string name match.
            df.index = df.index.tz_convert(timezone)

    return df


### timeseries_dataframe

Converts Pandas dataframes and series, NumPy arrays and recarrays, or a dictionary of
individual timeseries into a Pandas dataframe with a single datetime index. For all arrays,
dataframes and series it is assumed that the first column contains the timestamps.

In [None]:
#| export

def timeseries_dataframe(
        data:typing.Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray], 
        timezone='UTC', 
        columnnames=None):
    """Normalise tabular timeseries input into a time-indexed DataFrame.

    Supported inputs:

    * DataFrame - used as-is (the same object is passed on, not a copy).
    * Series - wrapped in a one-column DataFrame.
    * dict - one entry per series; ndarray values are split into a timestamp
      column (first) and a value column (second), other values are handed to
      pandas unchanged.
    * structured array / recarray - reinterpreted as float64; the first field
      is the timestamp, the remaining field names become the column names.
    * plain 2-D ndarray - first column is the timestamp, the rest are values.

    Numeric timestamps are multiplied by 1e9 before building the
    DatetimeIndex, i.e. they are presumably epoch seconds.

    `columnnames` is only consulted for plain ndarrays with more than two
    columns; its first entry (the time column) is skipped. An empty plain
    ndarray yields an empty DataFrame. Every other result is passed through
    `set_time_index_zone` with `timezone`.
    """

    if isinstance(data, pd.DataFrame):
        frame = data

    elif isinstance(data, pd.Series):
        frame = pd.DataFrame(data)

    elif isinstance(data, dict):
        def as_series(item):
            # Arrays carry (timestamp, value) rows; anything else is left to pandas.
            if isinstance(item, np.ndarray):
                return pd.Series(data=item[:,1], index=pd.DatetimeIndex(item[:,0]*1e9))
            return item

        frame = pd.DataFrame({name: as_series(item) for name, item in data.items()})

    elif data.dtype.names is not None:
        # Structured array or recarray: view the records as a float64 matrix.
        field_count = len(data.dtype)
        matrix = data.view(dtype=np.float64).reshape(data.shape[0], field_count)
        frame = pd.DataFrame(
            data=matrix[:, range(1, field_count)],
            index=pd.DatetimeIndex(matrix[:, 0] * 1e9),
            columns=data.dtype.names[1:]
        )

    else:
        if data.shape[0] == 0:
            return pd.DataFrame()

        # A single value column is called 'value'; several value columns get
        # the caller's names when supplied, otherwise value_1 ... value_n.
        if data.shape[1] <= 2:
            labels = ['value']
        elif columnnames:
            labels = [str(name) for name in columnnames[1:]]
        else:
            labels = [f"value_{position + 1}" for position in range(data.shape[1] - 1)]

        frame = pd.DataFrame(
            data=data[:, 1:],
            index=pd.DatetimeIndex(data[:, 0] * 1e9),
            columns=labels
        )

    return set_time_index_zone(frame, timezone)

In [None]:
#| export
def timeseries_dataframe_from_datadict(
        data:dict, 
        timecolumns=None,
        recordformat='records'):
    """Convert a data dict (or list of records) to a time-indexed DataFrame.

    Parameters
    ----------
    data : the serialised data, laid out according to `recordformat`.
    timecolumns : name, or collection of names, that may hold the timestamp
        in 'records' data; the first matching column is used. Defaults to
        'time'. Ignored for the other formats.
    recordformat : one of 'records', 'table', 'split', 'index' or 'tight'
        (case-insensitive).

        * 'records' - list of ``{column: value}`` dicts.
        * 'table'   - ``{'schema': {'primaryKey': [...]}, 'data': [records]}``;
          the first primary key is the time column.
        * 'split'   - ``{'index': [...], 'columns': [...], 'data': [...]}``.
        * 'index'   - ``{timestamp: {column: value}}``.
        * 'tight'   - as accepted by ``DataFrame.from_dict(orient='tight')``.

    Returns
    -------
    DataFrame whose index is a UTC ``DatetimeIndex`` named 'time'.
    Timestamps are parsed as ISO 8601.

    Raises
    ------
    AssertionError : `recordformat` is not supported.
    IndexError : no column of 'records' data matches `timecolumns`.
    """

    orient = recordformat.lower()
    assert orient in ['records', 'table', 'split', 'index', 'tight'], \
        f"unsupported recordformat {recordformat!r}"

    if orient in ('records', 'table'):
        # Row-oriented layouts: the timestamp is an ordinary column.
        if orient == 'records':
            df = pd.DataFrame.from_records(data)
            if timecolumns is None:
                candidates = ['time']
            elif isinstance(timecolumns, str):
                # Wrap a bare name so `in` does not do substring matching.
                candidates = [timecolumns]
            else:
                candidates = list(timecolumns)
            time_column = next((C for C in df.columns if C in candidates), None)
            if time_column is None:
                # IndexError is kept: it is what the original lookup raised.
                raise IndexError(
                    f"none of the time columns {candidates!r} found in {list(df.columns)!r}"
                )
        else:
            time_column = data['schema']['primaryKey'][0]
            # Keep the key as a column here; it becomes the index below.
            df = pd.DataFrame.from_records(data['data'])

        df.columns = list(df.columns)
        df[time_column] = pd.to_datetime(df[time_column],utc=True,format='ISO8601')
        df = df.set_index(time_column)

    else:
        # Index-oriented layouts: the timestamps already are the index.
        if orient == 'split':
            # DataFrame.from_dict has no 'split' orient, so build it directly.
            df = pd.DataFrame(data=data['data'], index=data['index'], columns=data['columns'])
        else:
            df = pd.DataFrame.from_dict(data, orient=orient)

        df.columns = list(df.columns)
        df.index = pd.to_datetime(df.index, utc=True, format='ISO8601')

    df.index.name = 'time'

    return df


In [None]:
# Example: ISO-8601 records with a 'time' field become a frame indexed by UTC timestamps.
timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.000Z",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.000Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      }
   ], timecolumns=['time'])

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2023-05-04 10:04:49+00:00,16.72
2023-05-04 10:24:51+00:00,16.65
2023-05-04 10:44:53+00:00,16.55


In [None]:
# Example: timestamps without an offset are parsed as UTC and then converted to Europe/Amsterdam.
set_time_index_zone( timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53",
         "value":16.55
      }
   ], timecolumns=['time']), timezone='Europe/Amsterdam')

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2023-05-04 12:04:49+02:00,16.72
2023-05-04 12:24:51+02:00,16.65
2023-05-04 12:44:53+02:00,16.55


In [None]:
#| export

def pop_nan_values(data):
    """Recursively drop null entries from nested lists and dicts.

    A list element or dict value is dropped when `pd.notnull` finds nothing
    non-null in it, so a nested list holding only nulls is removed as a
    whole. Anything that is neither a list nor a dict is returned unchanged.
    """
    def has_content(item):
        # Wrapping in a list lets scalars and containers be tested alike.
        return pd.notnull([item]).any()

    if isinstance(data, dict):
        return {
            key: pop_nan_values(item)
            for key, item in data.items()
            if has_content(item)
        }
    if isinstance(data, list):
        return [pop_nan_values(item) for item in data if has_content(item)]
    return data

In [None]:
#| export
def timeseries_dataframe_to_datadict(
        data:typing.Union[pd.DataFrame, pd.Series, dict], 
        recordformat:str='records', 
        timezone:str='UTC',
        popNaN:bool=False):
    """Serialise timeseries data to plain dicts/lists with ISO timestamps.

    Parameters
    ----------
    data : anything accepted by `timeseries_dataframe`.
    recordformat : 'records' for a list of ``{column: value}`` dicts that
        include the index as a field; any other value is passed to
        ``DataFrame.to_dict`` as its ``orient`` (case-insensitive).
    timezone : timezone the timestamps are expressed in before formatting.
    popNaN : when true, null values are left out of the result.

    Returns
    -------
    A list of dicts for 'records', otherwise whatever ``DataFrame.to_dict``
    produces for the requested orient.

    Notes
    -----
    The ISO-formatted index is written to a shallow copy, so a DataFrame
    passed in keeps its own index object. The timezone normalisation itself
    is done by `timeseries_dataframe`.
    """

    orient = recordformat.lower()

    normalized_data = timeseries_dataframe(data, timezone=timezone)
    if isinstance(normalized_data.index, pd.DatetimeIndex):
        # `timeseries_dataframe` may hand back the caller's own DataFrame;
        # replace the index on a shallow copy so theirs is not overwritten.
        normalized_data = normalized_data.copy(deep=False)
        normalized_data.index = normalized_data.index.map(lambda x: x.isoformat())

    if orient == 'records':
        records = normalized_data.reset_index().to_dict(orient='records')
    else:
        records = normalized_data.to_dict(orient=orient)

    if popNaN and normalized_data.isna().any(axis=None):
        if orient == 'records':
            # Flat list of dicts: one pass over each record is enough.
            return [{k:v for k,v in m.items() if pd.notnull(v)} for m in records]
        # Other orients are nested dicts/lists and need the recursive filter.
        return pop_nan_values(records)

    return records




In [None]:
# Example frame whose last record has a NaN value; used by the serialisation examples below.
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.000Z",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.000Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])



In [None]:
# Serialise as a list of records; with popNaN the NaN value is left out of its record.
timeseries_dataframe_to_datadict(df, recordformat='records', popNaN=True)

[{'time': '2023-05-04T10:04:49+00:00', 'value': 16.72},
 {'time': '2023-05-04T10:24:51+00:00', 'value': 16.65},
 {'time': '2023-05-04T10:44:53+00:00', 'value': 16.55},
 {'time': '2023-05-04T10:44:53+00:00'}]

In [None]:
# Same frame serialised in 'tight' orientation, again with NaN removal requested.
timeseries_dataframe_to_datadict(df, recordformat='tight', popNaN=True)

{'index': ['2023-05-04T10:04:49+00:00',
  '2023-05-04T10:24:51+00:00',
  '2023-05-04T10:44:53+00:00',
  '2023-05-04T10:44:53+00:00'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55]],
 'index_names': ['time']}

In [None]:
# Hand-written 'tight'-style dict with one all-NaN data row and a None column name.
test_data = {'index': ['2023-05-04T10:04:49+00:00',
  '2023-05-04T10:24:51+00:00',
  '2023-05-04T10:44:53+00:00',
  '2023-05-04T10:44:53+00:00'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55], [np.nan]],
 'index_names': ['time'],
 'column_names': [None]}

In [None]:
# Entries that hold nothing but nulls (the [nan] row and the [None] column_names) are dropped.
pop_nan_values(test_data)

{'index': ['2023-05-04T10:04:49+00:00',
  '2023-05-04T10:24:51+00:00',
  '2023-05-04T10:44:53+00:00',
  '2023-05-04T10:44:53+00:00'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55]],
 'index_names': ['time']}

In [None]:
# A nested list counts as non-null as soon as one of its elements is non-null.
pd.notnull([[np.nan, 2]]).any()

True

In [None]:
#| export
# Aggregations that can be requested by name. Each entry maps a method name to
# a callable that is applied to the pandas resampler object.
ResamplerMethods = dict(
    count=lambda R: R.count(),
    median=lambda R: R.median(),
    mean=lambda R: R.mean(),
    min=lambda R: R.min(),
    max=lambda R: R.max(),
    sum=lambda R: R.sum(),
    std=lambda R: R.std(),
    var=lambda R: R.var(),
    # NOTE(review): confirm that the resampler object exposes the cumulative
    # methods below for the pandas version in use.
    cumsum=lambda R: R.cumsum(),
    cummax=lambda R: R.cummax(),
    cummin=lambda R: R.cummin(),

)

# Legacy single-letter pandas period aliases and their current spelling.
# Seconds are spelled 's'; 'sec' is not a pandas offset alias.
ReSamplerPeriods = dict(
    H='h', T='min', S='s', L='ms', U='us', N='ns'
)

def _resample_frequency(period) -> str:
    """Translate a legacy period such as 'H' or '15T' to its current alias."""
    text = str(period)
    multiple, unit = text[:-1], text[-1:]
    # Only a bare alias or one with a numeric multiplier is rewritten, so
    # compound aliases such as 'MS' or 'BH' pass through unchanged.
    if unit in ReSamplerPeriods and (not multiple or multiple.isdigit()):
        return multiple + ReSamplerPeriods[unit]
    return text

def timeseries_dataframe_resample(df:pd.DataFrame, period:str, method:str):
    """Resample `df` and append the aggregated columns to the original data.

    Parameters
    ----------
    df : DataFrame to resample; it is passed to ``DataFrame.resample``, so it
        needs an index pandas can resample on.
    period : resampling frequency. Legacy aliases 'H', 'T', 'S', 'L', 'U' and
        'N', optionally prefixed with a multiplier such as '15T', are mapped
        to their current spelling; anything else is passed to pandas as-is.
    method : one or more names from `ResamplerMethods`, separated by ';'.
        Surrounding whitespace and empty segments are ignored.

    Returns
    -------
    DataFrame holding the original columns followed by one
    ``<column>_<method>`` column per input column and method, outer-joined
    on the index.

    Raises
    ------
    TypeError : a requested method is not in `ResamplerMethods`.
    """

    sampler = df.resample(_resample_frequency(period))

    dataframes = [df]
    for M in str(method).split(';'):
        M = M.strip()
        if not M:
            continue
        aggregate = ResamplerMethods.get(M)
        if aggregate is None:
            # TypeError is kept: calling the missing entry used to raise it.
            raise TypeError(
                f"unknown resample method {M!r}; expected one of {sorted(ResamplerMethods)}"
            )
        sdf = aggregate(sampler)
        sdf.columns = [f"{C}_{M}" for C in df.columns]
        dataframes.append(sdf)

    return pd.concat(dataframes, axis=1, join='outer')



In [None]:
#| hide
# Export the cells marked `#| export` from this notebook to the Python module.
import nbdev

nbdev.nbdev_export()