Related to user story: [SP11-Item04: General Data Wrapper PoC](https://gitlab.inria.fr/fedbiomed/fedbiomed/-/issues/164)

## Tabular dataset

Workflow of data pre processing:

1. Columns name should be shared with the researcher
2. Data format file to be filled by clinicians.
3. Specify if missing data are allowed for a given columns (Exception). The file will be used for data verification during FL pre-processing,
4. Outlier verification for quantitative data, continuous and discrete, and for dates (Critical warning),
5. Missing data imputation by local mean (or optional NN), or majority voting for discrete labels. Give warnings when missing data are found (for verification a posteriori).
6. Give critical warning when too many missing are found (>50%),
7. Verify that number of available data is greater then minimum required (Error)

Critical warnings have different levels of disclosure to the researcher (1) only the warning, 2) type of warning, 3) type of warning and column affected).

In [8]:
#1. load  dataset


import pandas as pd
from typing import List, Tuple, Union


path_file = '/user/ybouilla/home/Documents/data/pseudo_adni_mod/pseudo_adni_mod.csv'
single_view_dataset = pd.read_csv(path_file, delimiter=';', header=0)

In [9]:
single_view_dataset

Unnamed: 0,CDRSB.bl,ADAS11.bl,MMSE.bl,RAVLT.immediate.bl,RAVLT.learning.bl,RAVLT.forgetting.bl,FAQ.bl,WholeBrain.bl,Ventricles.bl,Hippocampus.bl,MidTemp.bl,Entorhinal.bl,ABETA.MEDIAN.bl,PTAU.MEDIAN.bl,TAU.MEDIAN.bl,AGE
0,1,8,27.0,23.739439,4.0,5.821573,3,0.684331,0.012699,0.003786,0.012678,0.002214,154.016065,67.970509,132.571916,75.0
1,0,0,30.0,64.933800,9.0,4.001653,0,0.735892,0.012803,0.004866,0.015071,0.003041,211.573206,5.451168,33.787719,67.0
2,0,8,24.0,36.987722,3.0,6.876316,0,0.738731,0.030492,0.004300,0.012419,0.002316,163.637668,66.704378,110.049924,63.0
3,0,3,29.0,50.314425,5.0,4.733481,3,0.696179,0.032797,0.004720,0.012312,0.002593,182.256297,47.091893,138.690457,75.0
4,0,0,30.0,57.217830,9.0,7.225401,0,0.841806,0.004030,0.006820,0.016948,0.002896,247.997479,-5.997140,-61.573234,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,2,29.0,61.896022,8.0,1.663102,0,0.767153,0.011417,0.005209,0.012879,0.002208,231.706787,24.632786,87.065806,76.0
996,0,1,29.0,62.083170,8.0,5.241477,1,0.695168,0.011908,0.004641,0.012534,0.002197,146.949187,57.588115,121.985248,77.0
997,3,14,24.0,22.289059,2.0,5.437600,7,0.628691,0.041537,0.003478,0.010870,0.001939,181.805672,55.052669,157.229102,74.0
998,0,13,26.0,31.650504,2.0,1.669603,4,0.714763,0.020461,0.004713,0.013989,0.001981,178.824412,69.412821,103.238647,64.0


In [3]:
# 2. extract columns name
MISSING = 'MISSING'
dataset_columns = dataset.columns

Data format file to be filled by clinicians (step 2 int he workflow):

Data format file will be a dictionary specifying the type: 
* for single view datasets:
```{<feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}```
 * for multiview datatset
```{{<view_name>: <feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}}```

where
* `<view_name>` is the name of the view
* `<feature_name>` is the name of the feature
* `<data_type>` can be categorical or continuous or missing_data or datetime
* `<value_taken>` is the type of the value (eg int, char, float, signed, unsigned ...)
* `<value_range>` represent either a list of bounds, an upper or a lower bound, or None

In [4]:
# 3. create data format file

import numpy as np
import enum
from enum import Enum, auto

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    CONTINUOUS = [float, np.float64]
    DISCRETE = [int]

class CategoricalDataType(Enum):
    BOOLEAN = [bool]
    NUMERICAL = [float, int, np.float64, np.int64]
    CHARACTER = [str, object]
    
class DataType(Enum):
    """

    """
    QUANTITATIVE = [QuantitativeDataType.CONTINUOUS,
                   QuantitativeDataType.DISCRETE]
    CATEGORICAL = [CategoricalDataType.BOOLEAN,
                  CategoricalDataType.NUMERICAL,
                  CategoricalDataType.CHARACTER]
    #MISSING = 'MISSING'
    DATETIME = 'DATETIME'
    UNKNOWN = 'UNKNOWN'
    

    

In [5]:
data_format = DataType.CATEGORICAL
dt = data_type
for dtype in DataType:
    if dtype is data_format:
        print('ok', dtype.name)
        for sub_dtype in dtype.value:
            print(sub_dtype.value, dt)
            if any(dt == t for t in tuple(sub_dtype.value)):
                print('ok', dt)

NameError: name 'data_type' is not defined

In [5]:
def get_data_type(avail_data_types: enum.EnumMeta,
                  d_format: Enum,
                  d_type: type) ->  Tuple[Enum, List[Union[type, str]]]:
    present_d_types = []
    sub_d_type_format = None
    for avail_data_type in avail_data_types:
        if d_format is avail_data_type:
            sub_dtypes = avail_data_type.value
            if hasattr(sub_dtypes, '__iter__') and not isinstance(sub_dtypes, str):
                # check if dtype has subtypes
                #(eg if datatype is QUANTITATIVE, subtype will be CONTINOUS or DISCRETE)
                for sub_dtype in sub_dtypes:
                    if any(d_type == t for t in tuple(sub_dtype.value)):
                        present_d_types.append(d_type)
                        sub_d_type_format = sub_dtype
                        print(sub_dtype, d_type)
            else:
                
                present_d_types.append(sub_dtypes)
                sub_d_type_format = sub_dtypes
    return  sub_d_type_format, present_d_types

In [6]:

def check_missing_data(column: pd.Series)->bool:
    is_missing_data = column.isna().any()
    return is_missing_data
df = pd.DataFrame({'w': [1, 2, 3, 4,  'jj', None]})
print(check_missing_data(df['w']))



True


In [105]:
type(DataType)

enum.EnumMeta

In [16]:
print(data_format, data_type)
get_data_type(DataType, data_format, data_type)

CATEGORICAL None


(None, [])

In [8]:
isinstance(dt, (int, float, np.int64))

False

In [64]:
str(DataType.CATEGORICAL)

'DataType.CATEGORICAL'

In [112]:
dataset[feature].dtype

dtype('float64')

In [22]:
# CLI for clinicians

import sys, pprint


data_format_file = {}

available_data_type = [d_type for d_type in DataType]  # get all available data type
n_available_data_type = len(available_data_type)
print(available_data_type)
msg = ''
for i, dtype in enumerate(available_data_type):
    msg += '%d) %s \n' %  (i+1, dtype.name)
for n_feature, feature in enumerate(dataset_columns):
    is_column_parsed = False
    print(f'displaying first 10 values of feature {feature} (n_feature: {n_feature})')
    pprint.pprint(dataset[feature].head(10))  # print first 10 lines of feature value
    try:
        while not is_column_parsed:
            data_format_id = input(f'specify data type for {feature}:\n' + msg )
            if data_format_id.isdigit() and int(data_format_id) <= n_available_data_type:
                # check if value passed by user is correct
                is_column_parsed = True
            else:
                print(f'error ! {data_format_id} value not understood')
    except KeyboardInterrupt as e:
        print('stopping now' + str(e))
    if not is_column_parsed:
        break
    data_format = available_data_type[int(data_format_id)-1]
    
    data_type = dataset[feature].dtype
    print(DataType, data_format, data_type)
    data_type, types = get_data_type(DataType, data_format, data_type)
    print(data_type, types)
    is_missing_values = check_missing_data(dataset[feature])
    data_format_file[feature] = {'data_type': data_type,
                                 'values': types,
                                 'is_missing_values': is_missing_values}


[<DataType.QUANTITATIVE: [<QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>, <QuantitativeDataType.DISCRETE: [<class 'int'>]>]>, <DataType.CATEGORICAL: [<CategoricalDataType.BOOLEAN: [<class 'bool'>]>, <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>, <CategoricalDataType.CHARACTER: [<class 'str'>, <class 'object'>]>]>, <DataType.DATETIME: 'DATETIME'>, <DataType.UNKNOWN: 'UNKNOWN'>]
displaying first 10 values of feature CDRSB.bl (n_feature: 0)
0    1
1    0
2    0
3    0
4    0
5    1
6    4
7    0
8    3
9    2
Name: CDRSB.bl, dtype: int64
specify data type for CDRSB.bl:
1) QUANTITATIVE 
2) CATEGORICAL 
3) DATETIME 
4) UNKNOWN 
2
<enum 'DataType'> DataType.CATEGORICAL int64
CategoricalDataType.NUMERICAL int64
CategoricalDataType.NUMERICAL [dtype('int64')]
displaying first 10 values of feature ADAS11.bl (n_feature: 1)
0     8
1     0
2     8
3     3
4     0
5    10
6    12
7     2
8     8
9    

#### data_format_file

In [23]:
data_format_file

{'CDRSB.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': [dtype('int64')],
  'is_missing_values': False},
 'ADAS11.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': [dtype('int64')],
  'is_missing_values': False},
 'MMSE.bl': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
  'values': [dtype('float64')],
  'is_missing_values': False},
 'RAVLT.immediate.bl': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
  'values': [dtype('float64')],
  'is_missing_values': False},
 'RAVLT.learning.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': [dtype('float64')],
  'is_missing_values': False},
 'RAVLT.forgetting.bl': {'data_t

In [26]:
# hardcoded data format file

data_format_file = {
    'CDRSB.bl': {
                 'data_type': CategoricalDataType.NUMERICAL,
                'values': int,
                'is_missing_values': False},
    'ADAS11.bl':{'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    
    'MMSE.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    'RAVLT.immediate.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'RAVLT.learning.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': float,
                'is_missing_values': False},
    'RAVLT.forgetting.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'FAQ.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    'WholeBrain.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Ventricles.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Hippocampus.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'MidTemp.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Entorhinal.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'ABETA.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'PTAU.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'TAU.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'AGE': {'data_type': QuantitativeDataType.DISCRETE, 
                'values': int,
                'is_missing_values': False}
}

In [27]:
dataset

Unnamed: 0,CDRSB.bl,ADAS11.bl,MMSE.bl,RAVLT.immediate.bl,RAVLT.learning.bl,RAVLT.forgetting.bl,FAQ.bl,WholeBrain.bl,Ventricles.bl,Hippocampus.bl,MidTemp.bl,Entorhinal.bl,ABETA.MEDIAN.bl,PTAU.MEDIAN.bl,TAU.MEDIAN.bl,AGE
0,1,8,27.0,23.739439,4.0,5.821573,3,0.684331,0.012699,0.003786,0.012678,0.002214,154.016065,67.970509,132.571916,75.0
1,0,0,30.0,64.933800,9.0,4.001653,0,0.735892,0.012803,0.004866,0.015071,0.003041,211.573206,5.451168,33.787719,67.0
2,0,8,24.0,36.987722,3.0,6.876316,0,0.738731,0.030492,0.004300,0.012419,0.002316,163.637668,66.704378,110.049924,63.0
3,0,3,29.0,50.314425,5.0,4.733481,3,0.696179,0.032797,0.004720,0.012312,0.002593,182.256297,47.091893,138.690457,75.0
4,0,0,30.0,57.217830,9.0,7.225401,0,0.841806,0.004030,0.006820,0.016948,0.002896,247.997479,-5.997140,-61.573234,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,2,29.0,61.896022,8.0,1.663102,0,0.767153,0.011417,0.005209,0.012879,0.002208,231.706787,24.632786,87.065806,76.0
996,0,1,29.0,62.083170,8.0,5.241477,1,0.695168,0.011908,0.004641,0.012534,0.002197,146.949187,57.588115,121.985248,77.0
997,3,14,24.0,22.289059,2.0,5.437600,7,0.628691,0.041537,0.003478,0.010870,0.001939,181.805672,55.052669,157.229102,74.0
998,0,13,26.0,31.650504,2.0,1.669603,4,0.714763,0.020461,0.004713,0.013989,0.001981,178.824412,69.412821,103.238647,64.0


In [28]:
data_format_file

{'CDRSB.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'ADAS11.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'MMSE.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'RAVLT.immediate.bl': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.learning.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.forgetting.bl': {'data_type': <QuantitativeDataType.CON

In [29]:

type(dataset[feature].dtype)

numpy.dtype[float64]

In [28]:
dir(dataset[feature])

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__r