Related to user story: [SP11-Item04: General Data Wrapper PoC](https://gitlab.inria.fr/fedbiomed/fedbiomed/-/issues/164)

## Tabular dataset

Workflow of data pre processing:

1. Columns name should be shared with the researcher
2. Data format file to be filled by clinicians.
3. Specify if missing data are allowed for a given columns (Exception). The file will be used for data verification during FL pre-processing,
4. Outlier verification for quantitative data, continuous and discrete, and for dates (Critical warning),
5. Missing data imputation by local mean (or optional NN), or majority voting for discrete labels. Give warnings when missing data are found (for verification a posteriori).
6. Give critical warning when too many missing are found (>50%),
7. Verify that number of available data is greater then minimum required (Error)

Critical warnings have different levels of disclosure to the researcher (1) only the warning, 2) type of warning, 3) type of warning and column affected).

In [2]:
#1. load  a single view dataset


import pandas as pd
import numpy as np
from typing import List, Tuple, Union, Dict
import os

path_file = '/user/ybouilla/home/Documents/data/pseudo_adni_mod/pseudo_adni_mod.csv'
file_name = os.path.basename(path_file)
single_view_dataset = pd.read_csv(path_file, delimiter=';', header=0)
single_view_dataset = {file_name: single_view_dataset}

In [3]:
single_view_dataset

{'pseudo_adni_mod.csv':      CDRSB.bl  ADAS11.bl  MMSE.bl  RAVLT.immediate.bl  RAVLT.learning.bl  \
 0           1          8     27.0           23.739439                4.0   
 1           0          0     30.0           64.933800                9.0   
 2           0          8     24.0           36.987722                3.0   
 3           0          3     29.0           50.314425                5.0   
 4           0          0     30.0           57.217830                9.0   
 ..        ...        ...      ...                 ...                ...   
 995         1          2     29.0           61.896022                8.0   
 996         0          1     29.0           62.083170                8.0   
 997         3         14     24.0           22.289059                2.0   
 998         0         13     26.0           31.650504                2.0   
 999         0         15     28.0           29.089863                3.0   
 
      RAVLT.forgetting.bl  FAQ.bl  WholeBrain.bl  V

In [4]:
# utility functions for multi view dataframe

def create_multi_view_dataframe(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    _concatenated_datasets = np.array([])  # store dataframe values

    for key in datasets.keys():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              datasets[key].columns.values])
        if len(_concatenated_datasets) <= 0:
            # first pass 
            _concatenated_datasets = datasets[key].values
        else:
            # next passes
            try:
                _concatenated_datasets = np.concatenate(
                                        [_concatenated_datasets,
                                         datasets[key].to_numpy()
                                         ], axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are differents
                raise ValueError(
                    'Cannot create multi view dataset: different number of samples for each modality have been detected'\
                        + 'Details: ' + str(val_err)
                    )
        for _ in datasets[key].columns.values:
            _view_name_array.append(key)

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)


    # 2. create multi index dataframe

    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                  columns = _header)
    return multi_view_df

In [5]:
def rename_variables_before_joining(multi_view_datasets: Dict[str, pd.DataFrame],
                                    views_name: List[Union[str, int]],
                                    primary_key:Union[str, int]=None) -> Dict[str, pd.DataFrame]:
    """
    Renames variables that have same name but different views using the following naming convention:
    if `a` is the name of a feature of `view1` and `a` is the name of a feature of `view2`,
    features names will be updated into `view1.a` and `view2.a`
    """
    _features_names = {}
    _views_length = len(views_name)
    
    for i_left in range(0, _views_length-1):
        _left_view = views_name[i_left]
        _left_features_name = multi_view_datasets[_left_view].columns.tolist()
        for i_right in range(i_left+1, _views_length):
        
            _right_view = views_name[i_right]
            _right_features_name = multi_view_datasets[_right_view].columns.tolist()
            
            for _f in _left_features_name:
                if primary_key and _f == primary_key:
                    # do not affect primary key (if any)
                    continue
                if _f  in _right_features_name:
                    
                    if _left_view  not in _features_names:
                        _features_names[_left_view] = {}
                        
                    if _right_view not in _features_names:
                        _features_names[_right_view] = {}
                        
                    _features_names[_left_view].update({_f: _left_view + '.' + str(_f)})
                    _features_names[_right_view].update({_f: _right_view + '.' + str(_f)})
    
    for i in range(_views_length):
        _view = views_name[i]
        _new_features = _features_names.get(_view)
        if _new_features:
            multi_view_datasets[_view] = multi_view_datasets[_view].rename(columns=_new_features)
        
    
    return multi_view_datasets



In [6]:
def join_muti_view_dataset(multi_view_dataset: pd.DataFrame, primary_key: str) -> pd.DataFrame:
    _views_name = sorted(set(multi_view_dataset.columns.get_level_values(0)))  # get views name
    
    joined_dataframe = multi_view_dataset[_views_name[0]]  # retrieve the first view
    # (as a result of join operation)
    for x in range(1, len(_views_name)):
        joined_dataframe = joined_dataframe.merge(multi_view_dataset[_views_name[x]],
                                                    on=primary_key,
                                                    suffixes=('', '.'+_views_name[x]))
        
        #df['file1'].join(df['file2'].set_index('pkey'), on='pkey', rsuffix='.file2')
        
    return joined_dataframe



In [14]:
# load multi view dataset
import os
import csv


folder_csv_path = 'test7'
if os.path.isdir(folder_csv_path):
    print('directory found')
    
else:
    pass
    # it is a file 

tabular_data_files = os.listdir(folder_csv_path)
multi_view_dataframe = {}
for tabular_data_file in tabular_data_files:
    # check if `tabular_data` is a csv file
    path_file = os.path.join(folder_csv_path, tabular_data_file)
    with open(path_file, 'r') as csvfile:
        try:
            # do some operation on file using sniffer to make sure considered file
            # is a CSV file
            dialect = csv.Sniffer().sniff(csvfile.readline())
            delimiter = dialect.delimiter
            dialect.lineterminator
            has_header = csv.Sniffer().has_header(csvfile.readline())
            if has_header:
                header = 0
            else:
                header = None
            multi_view_dataframe[tabular_data_file] = pd.read_csv(path_file, delimiter=delimiter,)
        except csv.Error as err:
            print('err', err, 'in file', tabular_data_file)
            #tabular_data_files.remove(tabular_data_file)
    
    
    


directory found


In [61]:
primary_key = 'pkey'

In [62]:
multi_view_dataframe = rename_variables_before_joining(multi_view_dataframe,
                                                      list(multi_view_dataframe.keys()),
                                                       primary_key=primary_key)

mdf = create_multi_view_dataframe(multi_view_dataframe)
mdf = join_muti_view_dataset(mdf, primary_key=primary_key)
mdf.columns

Index(['discrete', 'city', 'pkey', 'a', 'e', 'i', 'o', 'file1.0', 'file1.1',
       'file1.2', 'file1.3', 'file1.time', 'pressure', 'sp02', 'a.1', 'e.1',
       'i.1', 'o.1', 'gender', 'blood type', 'file2.0', 'file2.1', 'file2.2',
       'file2.3', 'file2.time', 'pH'],
      dtype='object', name='feature_name')

In [63]:

mdf

feature_name,discrete,city,pkey,a,e,i,o,file1.0,file1.1,file1.2,...,i.1,o.1,gender,blood type,file2.0,file2.1,file2.2,file2.3,file2.time,pH
0,64.0,Lille,qpqorfhylu gmfjy bdj,67,16,54,25,True,False,False,...,55,27,WOMAN,A,True,False,False,False,2018-01-02 06:00:00,
1,26.0,Lille,kkmjozalfyirgsire ui,42,96,69,61,True,True,True,...,8,82,MAN,AB,False,True,False,True,2018-01-01 00:00:00,0.023107
2,61.0,Paris,ezfasuuycdda foisjte,46,8,89,21,False,True,True,...,15,30,MAN,A,True,False,True,False,2018-01-02 10:00:00,0.587685
3,29.0,Paris,faxiqkt xggzmwzoidbg,29,6,77,14,False,True,False,...,37,55,MAN,AB,False,True,False,False,2018-01-03 12:00:00,0.894073
4,99.0,Lille,znwhlj rwzdutnagwasy,96,79,19,33,False,True,True,...,79,6,WOMAN,O,True,True,True,True,2018-01-01 10:00:00,0.026831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,zeqhcikzdodus jn qjf,81,62,52,68,True,True,True,...,39,35,MAN,AB,True,False,False,True,2018-01-02 05:00:00,0.78856
96,98.0,Marseille,iicthcvfmkajbvr gzir,16,49,30,7,False,False,True,...,27,23,MAN,,True,True,False,False,2018-01-05 02:00:00,0.402979
97,21.0,Lille,ztjakcsk bhjoksdz lm,90,14,36,24,False,False,False,...,76,7,MAN,B,True,False,False,True,2018-01-01 12:00:00,
98,42.0,Marseille,sabunaa opt vpulnxj,91,10,69,58,True,True,False,...,72,53,MAN,B,False,True,False,False,2018-01-02 09:00:00,0.651801


In [47]:
dialect.lineterminator

'\r\n'

In [6]:
# 2. extract columns name
MISSING = 'MISSING'
dataset_columns = single_view_dataset.columns

Data format file to be filled by clinicians (step 2 int he workflow):

Data format file will be a dictionary specifying the type: 
* for single view datasets:
```{<feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}```
 * for multiview datatset
```{{<view_name>: <feature_name>: {'data_type': <data_type>, 'type':<values_taken>, 'range': <value_range>}}```

where
* `<view_name>` is the name of the view
* `<feature_name>` is the name of the feature
* `<data_type>` can be categorical or continuous or missing_data or datetime
* `<value_taken>` is the type of the value (eg int, char, float, signed, unsigned ...)
* `<value_range>` represent either a list of bounds, an upper or a lower bound, or None

In [7]:
# 3. create data format file

import numpy as np
import enum
from enum import Enum, auto

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    CONTINUOUS = [float, np.float64]
    DISCRETE = [int, np.int64]

class CategoricalDataType(Enum):
    BOOLEAN = [bool]
    NUMERICAL = [float, int, np.float64, np.int64]
    CHARACTER = [str, object]
    
class KeyDataType(Enum):
    NUMERICAL = [int, np.int64]
    CHARACTER = [str, object]
    DATETIME = "DATETIME"
    
class DataType(Enum):
    """

    """
    KEY = [KeyDataType.NUMERICAL,
           KeyDataType.CHARACTER,
           KeyDataType.DATETIME]
    QUANTITATIVE = [QuantitativeDataType.CONTINUOUS,
                   QuantitativeDataType.DISCRETE]
    CATEGORICAL = [CategoricalDataType.BOOLEAN,
                  CategoricalDataType.NUMERICAL,
                  CategoricalDataType.CHARACTER]
    #MISSING = 'MISSING'
    DATETIME = 'DATETIME'
    UNKNOWN = 'UNKNOWN'
    
    @staticmethod
    def get_names():
        return tuple(n for n, _ in DataType.__members__.items())

class MissingValueAllowedDefault(Enum):
    KEY = False
    QUANTITATIVE = True
    CATEGORICAL = True
    DATETIME = False
    
    @staticmethod
    def get_names():
        return tuple(n for n, _ in MissingValueAllowedDefault.__members__.items())

In [8]:
DataType.get_names()

('KEY', 'QUANTITATIVE', 'CATEGORICAL', 'DATETIME', 'UNKNOWN')

In [34]:
MissingValueAllowedDefault.get_names()

('KEY', 'QUANTITATIVE', 'CATEGORICAL', 'DATETIME')

In [9]:
def get_data_type(avail_data_types: enum.EnumMeta,
                  d_format: Enum,
                  d_type: type) ->  Tuple[Enum, List[Union[type, str]]]:
    present_d_types = []
    sub_d_type_format = None
    for avail_data_type in avail_data_types:
        if d_format is avail_data_type:
            sub_dtypes = avail_data_type.value
            if not isinstance(sub_dtypes, str) and hasattr(sub_dtypes, '__getitem__') and isinstance(sub_dtypes[0], Enum):
                # check if dtype has subtypes
                #(eg if datatype is QUANTITATIVE, subtype will be CONTINOUS or DISCRETE)
                for sub_dtype in sub_dtypes:
                    if any(d_type == t for t in tuple(sub_dtype.value)):
                        present_d_types.append(d_type)
                        sub_d_type_format = sub_dtype
                        print(sub_dtype, d_type)
            else:
                
                present_d_types.append(sub_dtypes)
                sub_d_type_format = sub_dtypes
    return  sub_d_type_format, present_d_types

In [10]:
multi_view_dataframe['file1']['a'].dtype is np.int64

NameError: name 'multi_view_dataframe' is not defined

In [11]:

def check_missing_data(column: pd.Series)->bool:
    is_missing_data = column.isna().any()
    return is_missing_data
df = pd.DataFrame({'w': [1, 2, 3, 4,  'jj', None]})
print(check_missing_data(df['w']))



True


In [93]:
datasets = single_view_dataset

In [26]:



data_format is DataType.KEY

True

In [None]:
# join multiple csv



In [62]:
available_data_type

NameError: name 'available_data_type' is not defined

In [31]:
# CLI for clinicians for setting up data format file

import sys, pprint




available_data_type = [d_type for d_type in DataType]  # get all available data type
n_available_data_type = len(available_data_type)
print(available_data_type)

data_format_files = {}


msg = ''
msg_yes_or_no_question = '1) YES\n2) NO\n'
yes_or_no_question_key = {'1': True,
                    '2': False}
for i, dtype in enumerate(available_data_type):
    msg += '%d) %s \n' %  (i+1, dtype.name)
msg += '%d) ignore this column\n' % (i+2)
ignoring_key = i+2

[<DataType.KEY: [<KeyDataType.NUMERICAL: [<class 'int'>, <class 'numpy.int64'>]>, <KeyDataType.CHARACTER: [<class 'str'>, <class 'object'>]>, <KeyDataType.DATETIME: 'DATETIME'>]>, <DataType.QUANTITATIVE: [<QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>, <QuantitativeDataType.DISCRETE: [<class 'int'>, <class 'numpy.int64'>]>]>, <DataType.CATEGORICAL: [<CategoricalDataType.BOOLEAN: [<class 'bool'>]>, <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>, <CategoricalDataType.CHARACTER: [<class 'str'>, <class 'object'>]>]>, <DataType.DATETIME: 'DATETIME'>, <DataType.UNKNOWN: 'UNKNOWN'>]


In [32]:
## CLI to use when dataset is available
for tabular_data_file in datasets.keys():
    data_format_file = {}
    
    print(f'++++++++ Now parsing : {tabular_data_file} ++++++++++++++++')
    dataset = datasets[tabular_data_file]
    dataset_columns = dataset.columns
    for n_feature, feature in enumerate(dataset_columns):
        is_column_parsed = False
        is_info_given = False
        is_missing_values_allowed = False
        print(f'displaying first 10 values of feature {feature} (n_feature: {n_feature})')
        pprint.pprint(dataset[feature].head(10))  # print first 10 lines of feature value
        try:
            while not is_column_parsed:
                data_format_id = input(f'specify data type for {feature}:\n' + msg )
                if data_format_id.isdigit() and int(data_format_id) <= n_available_data_type+1:
                    # check if value passed by user is correct (if it is integer,
                    # and whithin range [1, n_available_data_type])
                    is_column_parsed = True
                
                else:
                    print(f'error ! {data_format_id} value not understood')
                    
        except KeyboardInterrupt as e:
            print('stopping now' + str(e))
        if not is_column_parsed:
            break
        if int(data_format_id) < ignoring_key:
            
            data_format = available_data_type[int(data_format_id)-1]
        
            data_type = dataset[feature].dtype
            print(DataType, data_format, data_type)
            data_type, types = get_data_type(DataType, data_format, data_type)
            print(data_type, types)

            is_missing_values = check_missing_data(dataset[feature])
            if is_missing_values:
                if data_type is DataType.KEY:
                    raise ValueError('KEY should not contain any missing points')
            if data_format is DataType.KEY or data_format is DataType.DATETIME:  
                is_missing_values_allowed = False
            else:
                while not is_info_given:
                    # set info
                    missing_values_selection = input(f'Allow {feature} to have missing values:\n'+msg_yes_or_no_question)
                    if missing_values_selection.isdigit() and int(missing_values_selection) < 3:
                        is_info_given = True

                    else:
                        print(f'error ! {missing_values_selection} value not understood')
                is_missing_values_allowed = yes_or_no_question_key[missing_values_selection]

            data_format_file[feature] = {'data_type': data_type,
                                         'values': types,
                                         'is_missing_values': is_missing_values_allowed}
        else:
            print(f'Ignoring {feature}')
    data_format_files[tabular_data_file] = data_format_file

++++++++ Now parsing : file1 ++++++++++++++++
displaying first 10 values of feature a (n_feature: 0)
0    48
1    87
2    46
3    84
4    94
5    18
6    15
7    30
8    54
9    46
Name: a, dtype: int64
specify data type for a:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
6
Ignoring a
displaying first 10 values of feature e (n_feature: 1)
0    98
1    83
2    73
3    45
4    84
5     5
6    44
7    55
8    37
9     8
Name: e, dtype: int64
specify data type for e:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
6
Ignoring e
displaying first 10 values of feature i (n_feature: 2)
0    65
1    13
2    81
3    81
4     0
5    57
6    14
7    98
8    13
9    89
Name: i, dtype: int64
specify data type for i:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
2
<enum 'DataType'> DataType.QUANTITATIVE int64
QuantitativeDataType.DISCRETE int64
QuantitativeDataType.DISCRETE [dty

#### data_format_file

In [29]:
data_format_files

{'file1': {'a': {'data_type': <QuantitativeDataType.DISCRETE: [<class 'int'>, <class 'numpy.int64'>]>,
   'values': [dtype('int64')],
   'is_missing_values': True},
  'e': {'data_type': <QuantitativeDataType.DISCRETE: [<class 'int'>, <class 'numpy.int64'>]>,
   'values': [dtype('int64')],
   'is_missing_values': True},
  '0': {'data_type': <CategoricalDataType.BOOLEAN: [<class 'bool'>]>,
   'values': [dtype('bool')],
   'is_missing_values': True},
  '1': {'data_type': <CategoricalDataType.BOOLEAN: [<class 'bool'>]>,
   'values': [dtype('bool')],
   'is_missing_values': False},
  'time': {'data_type': 'DATETIME',
   'values': ['DATETIME'],
   'is_missing_values': False},
  'pressure': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
   'values': [dtype('float64')],
   'is_missing_values': False},
  'sp02': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
   'values': [dtype('float64')],
   'is_missin

In [33]:
msg_yes_or_no_question

'1) YES\n2) NO\n'

In [40]:
is_views_finished = False


views_format_file = {}

while not is_views_finished:
    is_features_finished = False
    resp = input('do you want to add a new view (file)?\n' + msg_yes_or_no_question)
    resp = yes_or_no_question_key.get(resp)
    if not resp:
        is_views_finished = True
        print('process done')
        continue
    new_view = input('please add new view name:\n')
    while not is_features_finished:
        feature_format_file = {}
        new_feature = input('please add new feature name:\n')
        feature_format_file[new_feature] = {}
        is_column_parsed = False
        try:
            while not is_column_parsed:
                data_format_id = input(f'specify data type for {feature}:\n' + msg )
                if data_format_id.isdigit() and int(data_format_id) <= n_available_data_type+1:
                    # check if value passed by user is correct (if it is integer,
                    # and whithin range [1, n_available_data_type])
                    is_column_parsed = True
                
                else:
                    print(f'error ! {data_format_id} value not understood')
                    
        except KeyboardInterrupt as e:
            print('stopping now' + str(e))
        resp = input('do you want to add a new variable (feature) ?' + msg_yes_or_no_question)
        resp = yes_or_no_question_key.get(resp)
        if not resp:
            is_features_finished = True
            print('process done')
            continue
    views_format_file[new_view] = feature_format_file

do you want to add a new view (file)?
1) YES
2) NO
1
please add new view name:
ll
please add new feature name:
ll
specify data type for 0:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
4
do you want to add a new variable (feature) ?1) YES
2) NO
2
process done
do you want to add a new view (file)?
1) YES
2) NO
1
please add new view name:
kk
please add new feature name:
vkeof
specify data type for 0:
1) KEY 
2) QUANTITATIVE 
3) CATEGORICAL 
4) DATETIME 
5) UNKNOWN 
6) ignore this column
3
do you want to add a new variable (feature) ?1) YES
2) NO
2
process done
do you want to add a new view (file)?
1) YES
2) NO
2
process done


In [41]:
views_format_file

{'ll': {'ll': {}}, 'kk': {'vkeof': {}}}

In [26]:
# hardcoded data format file

data_format_file = {
    'CDRSB.bl': {
                 'data_type': CategoricalDataType.NUMERICAL,
                'values': int,
                'is_missing_values': False},
    'ADAS11.bl':{'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    
    'MMSE.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    'RAVLT.immediate.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'RAVLT.learning.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': float,
                'is_missing_values': False},
    'RAVLT.forgetting.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'FAQ.bl': {'data_type': CategoricalDataType.NUMERICAL, 
                'values': int,
                'is_missing_values': False},
    'WholeBrain.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Ventricles.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Hippocampus.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'MidTemp.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'Entorhinal.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'ABETA.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'PTAU.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'TAU.MEDIAN.bl': {'data_type': QuantitativeDataType.CONTINUOUS, 
                'values': float,
                'is_missing_values': False},
    'AGE': {'data_type': QuantitativeDataType.DISCRETE, 
                'values': int,
                'is_missing_values': False}
}

In [27]:
dataset

Unnamed: 0,CDRSB.bl,ADAS11.bl,MMSE.bl,RAVLT.immediate.bl,RAVLT.learning.bl,RAVLT.forgetting.bl,FAQ.bl,WholeBrain.bl,Ventricles.bl,Hippocampus.bl,MidTemp.bl,Entorhinal.bl,ABETA.MEDIAN.bl,PTAU.MEDIAN.bl,TAU.MEDIAN.bl,AGE
0,1,8,27.0,23.739439,4.0,5.821573,3,0.684331,0.012699,0.003786,0.012678,0.002214,154.016065,67.970509,132.571916,75.0
1,0,0,30.0,64.933800,9.0,4.001653,0,0.735892,0.012803,0.004866,0.015071,0.003041,211.573206,5.451168,33.787719,67.0
2,0,8,24.0,36.987722,3.0,6.876316,0,0.738731,0.030492,0.004300,0.012419,0.002316,163.637668,66.704378,110.049924,63.0
3,0,3,29.0,50.314425,5.0,4.733481,3,0.696179,0.032797,0.004720,0.012312,0.002593,182.256297,47.091893,138.690457,75.0
4,0,0,30.0,57.217830,9.0,7.225401,0,0.841806,0.004030,0.006820,0.016948,0.002896,247.997479,-5.997140,-61.573234,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,2,29.0,61.896022,8.0,1.663102,0,0.767153,0.011417,0.005209,0.012879,0.002208,231.706787,24.632786,87.065806,76.0
996,0,1,29.0,62.083170,8.0,5.241477,1,0.695168,0.011908,0.004641,0.012534,0.002197,146.949187,57.588115,121.985248,77.0
997,3,14,24.0,22.289059,2.0,5.437600,7,0.628691,0.041537,0.003478,0.010870,0.001939,181.805672,55.052669,157.229102,74.0
998,0,13,26.0,31.650504,2.0,1.669603,4,0.714763,0.020461,0.004713,0.013989,0.001981,178.824412,69.412821,103.238647,64.0


In [28]:
data_format_file

{'CDRSB.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'ADAS11.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'MMSE.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': int,
  'is_missing_values': False},
 'RAVLT.immediate.bl': {'data_type': <QuantitativeDataType.CONTINUOUS: [<class 'float'>, <class 'numpy.float64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.learning.bl': {'data_type': <CategoricalDataType.NUMERICAL: [<class 'float'>, <class 'int'>, <class 'numpy.float64'>, <class 'numpy.int64'>]>,
  'values': float,
  'is_missing_values': False},
 'RAVLT.forgetting.bl': {'data_type': <QuantitativeDataType.CON

In [29]:

type(dataset[feature].dtype)

numpy.dtype[float64]

In [28]:
dir(dataset[feature])

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__r