# data sanity check


assumption when parsing:


- name of private key is the same for each dataet file

In [1]:

import math 

import pandas as pd
import pprint
import csv
import numpy as np
from typing import List, Tuple, Union, Dict, Any, Iterator, Optional, Callable
import os
from tabulate import tabulate

import json

import utils
from data_type import DataType

In [2]:
    

# utility functions for multi view dataframe
def rename_variables_before_joining(multi_view_datasets: Dict[str, pd.DataFrame],
                                    views_name: List[Union[str, int]],
                                    primary_key:Union[str, int]=None) -> Tuple[Dict[str, pd.DataFrame],
                                                                              Dict[str, str]]:
    """
    Renames variables that have same name but different views using the following naming convention:
    if `a` is the name of a feature of `view1` and `a` is the name of a feature of `view2`,
    features names will be updated into `view1.a` and `view2.a`
    """
    _features_names = {}
    _views_length = len(views_name)
    
    # check for each variable name existing in one view, that it doesnot exist in another
    # view. if it is, rename both variables
    # for this purpose, parse every combination once
    for i_left in range(0, _views_length-1):
        _left_view = views_name[i_left]
        _left_features_name = multi_view_datasets[_left_view].columns.tolist()
        for i_right in range(i_left+1, _views_length):
        
            _right_view = views_name[i_right]
            _right_features_name = multi_view_datasets[_right_view].columns.tolist()
            
            for _f in _left_features_name:
                if primary_key and _f == primary_key:
                    # do not affect primary key (if any)
                    continue
                if _f  in _right_features_name:
                    
                    if _left_view  not in _features_names:
                        _features_names[_left_view] = {}
                        
                    if _right_view not in _features_names:
                        _features_names[_right_view] = {}
                        
                    _features_names[_left_view].update({_f: _left_view + '.' + str(_f)})
                    _features_names[_right_view].update({_f: _right_view + '.' + str(_f)})
    
    for i in range(_views_length):
        _view = views_name[i]
        _new_features = _features_names.get(_view)
        if _new_features:
            multi_view_datasets[_view] = multi_view_datasets[_view].rename(columns=_new_features)
        
    
    return multi_view_datasets, _features_names


def create_multi_view_dataframe_from_dictionary(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    # WARNING: DOESNOT CONTAIN FACILITY FOR KEEPING PRIMARY KEY
    _header_labels = ['views', 'feature_name']
    # 1. create multiindex header

    _feature_name_array = np.array([])  # store all feature names
    _view_name_array = []  # store all views (ie modalities) names

    _concatenated_datasets = np.array([])  # store dataframe values

    for key in datasets.keys():
        _feature_name_array = np.concatenate([_feature_name_array,
                                              datasets[key].columns.values])
        if len(_concatenated_datasets) <= 0:
            # first pass 
            _concatenated_datasets = datasets[key].values
        else:
            # next passes
            try:
                _concatenated_datasets = np.concatenate(
                                        [_concatenated_datasets,
                                         datasets[key].to_numpy()
                                         ], axis=1)
            except ValueError as val_err:
                # catching case where nb_samples are differents
                raise ValueError(
                    'Cannot create multi view dataset: different number of samples for each modality have been detected'\
                        + 'Details: ' + str(val_err)
                    )
        for _ in datasets[key].columns.values:
            _view_name_array.append(key)

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)


    # 2. create multi index dataframe

    multi_view_df = pd.DataFrame(_concatenated_datasets,
                                  columns = _header)
    return multi_view_df

def create_multi_view_dataframe_from_dataframe(dataframe: pd.DataFrame,
                                               views_features: Dict[str, List[str]],
                                               primary_key: str = None):
    # convert plain dataframe into multi index dataframe
    # primary key will have its own view
    _header_labels = ['views', 'feature_name']
    _primary_key_label = 'primary_key'
    _n_features = 0
    
    _multi_index = dataframe.columns
    if primary_key is not None:
        _key_values = dataframe[primary_key].values  # storing primary key values

    _all_features_names = []
    _new_views_names = []
    for view_name in views_features.keys():
        # get all columns name for each view, and remove primary keymulti_view_dataset[view_name] = pd.concat[]
        _features_names = list(views_features[view_name])
        
        if primary_key is not None:
            _features_names.remove(primary_key)
        
        for feature_name in _features_names:
            #if feature_name not in _all_features_names:
            _new_views_names.append(view_name)
            # appending as much as there are feature within each view
        _n_features += len(_features_names)
        _all_features_names.extend(_features_names)
        
        #_all_features_names = list(set(_all_features_names))  # remove duplicates
    
    print('length', _all_features_names, _new_views_names)
    _header = pd.MultiIndex.from_arrays([ _new_views_names, _all_features_names],
                                        names=_header_labels)
    
    print('BUG',_all_features_names, dataframe[_all_features_names].values.shape, dataframe.shape)
    print(_header)
    multi_view_dataframe = pd.DataFrame(dataframe[_all_features_names].values, columns=_header)
    
    if primary_key is not None:
        
        multi_view_dataframe[_primary_key_label, primary_key] = _key_values  # creating a specific value for
    # private key
    return multi_view_dataframe


def join_multi_view_dataset(multi_view_dataset: Union[pd.DataFrame, Dict[str, pd.DataFrame]],
                           #multi_view_dataframe: pd.DataFrame=None,
                           #multi_view_dictionary_dataset: Dict[str, pd.DataFrame] = None, 
                           primary_key: str=None,
                          as_multi_index: bool = True) -> pd.DataFrame:
    """Concatenates a multi view dataset into a plain pandas dataframe,
    by doing a join operation along specified primary_key"""
    
    if isinstance(multi_view_dataset, pd.DataFrame):
        _views_names = sorted(set(multi_view_dataset.columns.get_level_values(0)))  # get views name

        
    elif isinstance(multi_view_dataset, dict):
        _views_names = sorted(list(multi_view_dataset.keys()))
        
    else:
        raise ValueError('method can only accept as input multi view pandas dataframe or dictionary of pandas dataframes')
        
    joined_dataframe = multi_view_dataset[_views_names[0]]  # retrieve the first view
    # (as a result of join operation)
    for x in range(1, len(_views_names)):
        joined_dataframe = joined_dataframe.merge(multi_view_dataset[_views_names[x]],
                                                    on=primary_key,
                                                    suffixes=('', '.'+_views_names[x]))
    
    if as_multi_index:
        # convert plain dataframe into multi index dataframe
        # primary key will have its own view
        _header_labels = ['views', 'feature_name']
        _primary_key_label = 'primary_key'
        
        _multi_index = multi_view_dataset.columns
        
        _key_values = joined_dataframe[primary_key].values  # storing primary key

        _all_features_names = []
        _new_views_names = []
        for view_name in _views_names:
            # get all columns name for each view, and remove primary key
            _features_names = list(multi_view_dataset[view_name].columns)
            if primary_key is not None:
                _features_names.remove(primary_key)
            _all_features_names.extend(_features_names)

            for feature_name in _features_names:
                _new_views_names.append(view_name)
                # appending as much as there are feature within each view
            #features_name[name].remove(primary_key)

        _header = pd.MultiIndex.from_arrays([ _new_views_names, _all_features_names],
                                            names=_header_labels)
        print(_header)
        joined_dataframe  = pd.DataFrame(joined_dataframe[_all_features_names].values, columns=_header)
        joined_dataframe[_primary_key_label, primary_key] = _key_values
        
    return joined_dataframe



def search_primary_key(format_file_ref: Dict[str, Dict[str, Any]]) -> Optional[str]: 
    """"""
    _views_names = list(format_file_ref.keys())
    primary_key = None
    _c_view = None
    for view_name in _views_names:
        file_content = format_file_ref[view_name]
        _features_names = list(file_content.keys())
        for feature_name in _features_names:
            feature_content  = file_content[feature_name]
            _d_format = feature_content.get('data_format')
            
            if _d_format == DataType.KEY.name:
                if _c_view is None:
                    primary_key = feature_name
                    _c_view = view_name
                    print(f'found primary key {primary_key}')
                else:
                    print(f'error: found 2 primary keys is same view {view_name}')
        _c_view = None
    return primary_key



def select_data_from_format_file_ref(datasets: Dict[str, Dict[str, Any]],
                                     format_file: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """returns an updated dataset containing only the features detailed in format_file"""
    # variables initialisation
    
    updated_dataset = {}
    _views_format_file = list(format_file.keys())
    
    for view in _views_format_file:
        if view in datasets.keys():
            # only extract features from format_file
            _format_file_features = list(format_file[view].keys())
            _current_dataset_feature = datasets[view].columns.tolist()
            try:
                updated_dataset[view] = datasets[view][_format_file_features]
            except KeyError as ke:
                # catch error if a column is specified in data format file
                # but not found in dataset
                _missing_feature = []
                for feature in _format_file_features:
                    if feature not in _current_dataset_feature:
                        _missing_feature.append(feature)
                print('Error: th following features', *_missing_feature, f'are not found in view: {view}')
        else:
            # trigger error
            print(f'error!: missing view {view} in dataset')
            
    return updated_dataset

In [3]:
import logging

class CustomWarning():
    
        def __init__(self, disclosure: int,level: str):
            #super().__init__('l')
            self.disclosure = disclosure
            self.level = level
        
        def display(self,message, columns = ' ' ):  
            
            
            logger = logging.getLogger('mylogger')
            #logger.setLevel(logging.DEBUG)

            #handler = logging.FileHandler('mylog.log')
            #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            #handler.setFormatter(formatter)
            #logger.addHandler(handler)
            
            if self.disclosure == 1:
                if self.level == 'CRITICAL':
                    logger.critical(message)
                elif self.level == 'WARNING':
                    logger.warning(message)
            elif self.disclosure == 2:
                if self.level == 'CRITICAL':
                    message = 'Critical Warning.' + message
                    logger.critical(message)
                elif self.level == 'WARNING':
                    message = 'Regular Warning.' + message
                    logger.warning(message)
            elif self.disclosure == 3:
                if self.level == 'CRITICAL':
                    message = 'Critical Warning. ' + message + 'Columns affected :' + columns
                    logger.critical(message)
                elif self.level == 'WARNING':
                    message = 'Regular Warning. ' + message  + 'Columns affected :' + columns
                    logger.warning(message) 
            print (message) 
            return message
        
        

In [4]:
def create_dictionary_multi_view_dataset(dataframe: pd.DataFrame,
                                         views_features_mapping: Dict[str, List[str]],
                                         primary_key: str=None) -> Dict[str, pd.DataFrame]:
    _primary_key_label = 'primary_key'
    
    multi_view_dataset = {}
    
    if primary_key is not None:
        _key_values = dataframe[primary_key].values  # storing primary key values

    _all_features_names = []
    _new_views_names = []
    for view_name in views_features_mapping.keys():
        # get all columns name for each view, and remove primary key
        _features_names = list(views_features_mapping[view_name])
        
        if primary_key is not None:
            _features_names.remove(primary_key)
        _tmp_dataframe = dataframe[_features_names[0]].values
        _tmp_dataframe = _tmp_dataframe.reshape(-1, 1)  # need to reshape,
        #(otherwise concatenation wont work)
        for feature in _features_names[1:]:
            # iterate over the remaining items in _feature_name
            # need to do it that way because indexing dataframe is somehow broken
            
            _new_feature = dataframe[feature].to_numpy()
            _new_feature = _new_feature.reshape(-1, 1)
            _tmp_dataframe = np.concatenate([_tmp_dataframe, _new_feature], axis=1)
            
        multi_view_dataset[view_name] =pd.DataFrame( _tmp_dataframe, columns=_features_names)
    
    if primary_key is not None:
        multi_view_dataset[primary_key] = dataframe[primary_key]
    return multi_view_dataset



In [5]:
new_feature_name = { v: list(pre_parsed_dataset_to_check[v].columns) for v in views_names}

print(new_feature_name)

create_multi_view_dataframe_from_dataframe(df_joined,new_feature_name, primary_key=primary_key)


NameError: name 'views_names' is not defined

In [140]:
np.concatenate([[[1],[2],[3]], [[4],[4],[5]]], axis=1)

array([[1, 4],
       [2, 4],
       [3, 5]])

In [106]:
df_to_check[['2', 'city', 'pkey', 'discrete', 'pH', 'pressure', 'e.1', 'file1.1', 'file2.1', 'file1.time', 'e', 'gender', 'file2.time', 'blood type']]

feature_name,2,city,pkey,pkey.1,pkey.2,discrete,pH,pressure,e.1,file1.1,file2.1,file1.time,e,gender,file2.time,blood type
0,False,Lille,zmixzrgvxrjqxoe sluk,qpqorfhylu gmfjy bdj,kkmjozalfyirgsire ui,64.0,0.023107,0.088082,63,True,True,2018-01-01 00:00:00,98,MAN,2018-01-01 00:00:00,A
1,True,Lille,vrzahnpfluspdcbfnaqt,kkmjozalfyirgsire ui,xkdawggpnuulcewuoyzz,26.0,,0.774788,20,False,False,2018-01-01 01:00:00,83,MAN,2018-01-01 01:00:00,O
2,False,Paris,pnrepvmrxqabdlvisclv,ezfasuuycdda foisjte,khuulhwgwnjggrfoefce,61.0,0.407279,0.514092,2,False,True,2018-01-01 02:00:00,73,WOMAN,2018-01-01 02:00:00,A
3,True,Paris,gwj luzejwdxzsiljxzd,faxiqkt xggzmwzoidbg,xxysdmwwmjsmyhaswfdb,29.0,0.536301,0.832881,70,True,True,2018-01-01 03:00:00,45,WOMAN,2018-01-01 03:00:00,AB
4,False,Lille,jjdvcnofivbqhirxzdyo,znwhlj rwzdutnagwasy,ldejfuij mnbnf wwmms,99.0,0.749443,0.696152,90,True,True,2018-01-01 04:00:00,84,MAN,2018-01-01 04:00:00,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,True,Paris,hrvepmqjn llgbzplshv,zeqhcikzdodus jn qjf,wrmdecb s pohtmrcdj,9.0,,0.295578,41,True,True,2018-01-04 23:00:00,66,WOMAN,2018-01-04 23:00:00,A
96,True,Marseille,wroevwyuamxibzshlxxh,iicthcvfmkajbvr gzir,whmwrpvqmerdpwwzxasf,98.0,0.388389,0.474322,41,False,False,2018-01-05 00:00:00,81,WOMAN,2018-01-05 00:00:00,B
97,True,Lille,ywadcykylymkdtzfctpg,ztjakcsk bhjoksdz lm,pnrepvmrxqabdlvisclv,21.0,0.889067,0.927511,7,True,True,2018-01-05 01:00:00,82,MAN,2018-01-05 01:00:00,B
98,True,Marseille,ruchbfa zwgenxslegrl,sabunaa opt vpulnxj,iicthcvfmkajbvr gzir,42.0,0.402979,0.494798,11,False,True,2018-01-05 02:00:00,18,MAN,2018-01-05 02:00:00,O


In [58]:
df_to_check

feature_name,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,pkey,discrete,city,pkey.1,file2.1,file2.time,pH,pkey.2
0,98,True,False,2018-01-01 00:00:00,0.088082,63,MAN,A,zmixzrgvxrjqxoe sluk,64.0,Lille,qpqorfhylu gmfjy bdj,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
1,83,False,True,2018-01-01 01:00:00,0.774788,20,MAN,O,vrzahnpfluspdcbfnaqt,26.0,Lille,kkmjozalfyirgsire ui,False,2018-01-01 01:00:00,,xkdawggpnuulcewuoyzz
2,73,False,False,2018-01-01 02:00:00,0.514092,2,WOMAN,A,pnrepvmrxqabdlvisclv,61.0,Paris,ezfasuuycdda foisjte,True,2018-01-01 02:00:00,0.407279,khuulhwgwnjggrfoefce
3,45,True,True,2018-01-01 03:00:00,0.832881,70,WOMAN,AB,gwj luzejwdxzsiljxzd,29.0,Paris,faxiqkt xggzmwzoidbg,True,2018-01-01 03:00:00,0.536301,xxysdmwwmjsmyhaswfdb
4,84,True,False,2018-01-01 04:00:00,0.696152,90,MAN,B,jjdvcnofivbqhirxzdyo,99.0,Lille,znwhlj rwzdutnagwasy,True,2018-01-01 04:00:00,0.749443,ldejfuij mnbnf wwmms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,66,True,True,2018-01-04 23:00:00,0.295578,41,WOMAN,A,hrvepmqjn llgbzplshv,9.0,Paris,zeqhcikzdodus jn qjf,True,2018-01-04 23:00:00,,wrmdecb s pohtmrcdj
96,81,False,True,2018-01-05 00:00:00,0.474322,41,WOMAN,B,wroevwyuamxibzshlxxh,98.0,Marseille,iicthcvfmkajbvr gzir,False,2018-01-05 00:00:00,0.388389,whmwrpvqmerdpwwzxasf
97,82,True,True,2018-01-05 01:00:00,0.927511,7,MAN,B,ywadcykylymkdtzfctpg,21.0,Lille,ztjakcsk bhjoksdz lm,True,2018-01-05 01:00:00,0.889067,pnrepvmrxqabdlvisclv
98,18,False,True,2018-01-05 02:00:00,0.494798,11,MAN,O,ruchbfa zwgenxslegrl,42.0,Marseille,sabunaa opt vpulnxj,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir


existing tests

- test keys (should be done before joining) 
 |-> unicity of value
- test datetime
 |-> are they datetime parsable (for that i am using `dateutil` python package)
 
- test variables including
 |-> test if data have missing values and missing values are not allowed
 | -> test correct categories / sub categories
 | -> test lower bound
 | -> test upper bound
 | -> check if defined values are contained in categorical variables
 
- data transformation

|-> interpolate missing values (if allowed) using specific method

In [3]:
inst = CustomWarning(2, 'WARNING')
inst.display('hsskks', 'kl')







In [7]:
pre_parsed_dataset_to_check


{'file1':      e  file1.1      2           file1.time  pressure  e.1 gender blood type  \
 0   98     True  False  2018-01-01 00:00:00  0.088082   63    MAN          A   
 1   83    False   True  2018-01-01 01:00:00  0.774788   20    MAN          O   
 2   73    False  False  2018-01-01 02:00:00  0.514092    2  WOMAN          A   
 3   45     True   True  2018-01-01 03:00:00  0.832881   70  WOMAN         AB   
 4   84     True  False  2018-01-01 04:00:00  0.696152   90    MAN          B   
 ..  ..      ...    ...                  ...       ...  ...    ...        ...   
 95  66     True   True  2018-01-04 23:00:00  0.295578   41  WOMAN          A   
 96  81    False   True  2018-01-05 00:00:00  0.474322   41  WOMAN          B   
 97  82     True   True  2018-01-05 01:00:00  0.927511    7    MAN          B   
 98  18    False   True  2018-01-05 02:00:00  0.494798   11    MAN          O   
 99  70    False   True  2018-01-05 03:00:00  0.316395   74  WOMAN         AB   
 
                 

In [9]:
multi_format_file_ref

{'file1': {'e': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  '1': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': False},
  '2': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': True},
  'time': {'data_format': 'DATETIME',
   'data_type': 'DATETIME',
   'values': 'object',
   'is_missing_values': False},
  'pressure': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'e.1': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'gender': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': False},
  'blood type': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': True},
  '

{'file1': ['e',
  'file1.1',
  '2',
  'file1.time',
  'pressure',
  'e.1',
  'gender',
  'blood type',
  'pkey'],
 'contatct': ['discrete', 'city', 'pkey'],
 'file2': ['file2.1', 'file2.time', 'pH', 'pkey']}

In [18]:
multi_df_joined

views,contatct,contatct,file1,file1,file1,file1,file1,file1,file1,file1,file2,file2,file2,primary_key
feature_name,discrete,city,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,file2.1,file2.time,pH,pkey
0,64.0,Lille,16,False,False,2018-01-03 04:00:00,0.98667,98,WOMAN,A,False,2018-01-02 06:00:00,,qpqorfhylu gmfjy bdj
1,26.0,Lille,96,True,True,2018-01-02 04:00:00,0.996889,35,MAN,AB,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
2,61.0,Paris,8,True,True,2018-01-01 09:00:00,0.777026,65,MAN,A,False,2018-01-02 10:00:00,0.587685,ezfasuuycdda foisjte
3,29.0,Paris,6,True,False,2018-01-04 20:00:00,0.877527,81,MAN,AB,True,2018-01-03 12:00:00,0.894073,faxiqkt xggzmwzoidbg
4,99.0,Lille,79,True,True,2018-01-04 09:00:00,0.447389,88,WOMAN,O,True,2018-01-01 10:00:00,0.026831,znwhlj rwzdutnagwasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,62,True,True,2018-01-02 13:00:00,0.953184,53,MAN,AB,False,2018-01-02 05:00:00,0.78856,zeqhcikzdodus jn qjf
96,98.0,Marseille,49,False,True,2018-01-02 21:00:00,0.442283,35,MAN,,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir
97,21.0,Lille,14,False,False,2018-01-02 06:00:00,0.988543,67,MAN,B,False,2018-01-01 12:00:00,,ztjakcsk bhjoksdz lm
98,42.0,Marseille,10,True,False,2018-01-02 01:00:00,0.059791,48,MAN,B,True,2018-01-02 09:00:00,0.651801,sabunaa opt vpulnxj


# for simple datasets

In [121]:
# extract views names
views_names = list(format_file.keys())



# look for primary key
primary_key = search_primary_key(format_file)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(dataset_to_check, format_file)
# rename columns names before join operation
pre_parsed_dataset_to_check = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names)
pre_parsed_dataset_to_check

multi_df_to_check = create_multi_view_dataframe_from_dictionary(pre_parsed_dataset_to_check)
multi_df_to_check

#if primary_key is not None:
# jointure operation (takesplace only if primary key has been specfied in foramt_file)
df_joined = join_muti_view_dataset(multi_df_to_check)
    
df_joined

NameError: name 'format_file' is not defined

# for multiple datasets

In [6]:
multi_format_file_ref = utils.load_format_file_ref('multi_format_file')
multi_dataset_to_check = utils.load_tabular_datasets(r'test7')

directory found


In [6]:
multi_format_file_ref

{'file1': {'e': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  '1': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': False},
  '2': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': True},
  'time': {'data_format': 'DATETIME',
   'data_type': 'DATETIME',
   'values': 'object',
   'is_missing_values': False},
  'pressure': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'e.1': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'gender': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': False},
  'blood type': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': True},
  '

In [7]:
# extract views names
views_names = list(multi_format_file_ref.keys())



# look for primary key
primary_key = search_primary_key(multi_format_file_ref)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(multi_dataset_to_check, multi_format_file_ref)
# rename columns names before join operation
pre_parsed_dataset_to_check, new_views_name = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names,
                                                             primary_key)
pre_parsed_dataset_to_check

#multi_df_to_check = create_multi_view_dataframe(pre_parsed_dataset_to_check)  # remove that
#multi_df_to_check

#if primary_key is not None:
# jointure operation (takesplace only if primary key has been specfied in foramt_file)
df_joined = join_multi_view_dataset(pre_parsed_dataset_to_check, primary_key, False) # should accept DIct[pd.DataFrame]
    
#df_to_check = multi_df_joined.droplevel(0, axis=1)  # remove views from dataset
#df_to_check

df_joined

found primary key pkey
found primary key pkey
found primary key pkey
primary key pkey


Unnamed: 0,discrete,city,pkey,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,file2.1,file2.time,pH
0,64.0,Lille,qpqorfhylu gmfjy bdj,16,False,False,2018-01-03 04:00:00,0.986670,98,WOMAN,A,False,2018-01-02 06:00:00,
1,26.0,Lille,kkmjozalfyirgsire ui,96,True,True,2018-01-02 04:00:00,0.996889,35,MAN,AB,True,2018-01-01 00:00:00,0.023107
2,61.0,Paris,ezfasuuycdda foisjte,8,True,True,2018-01-01 09:00:00,0.777026,65,MAN,A,False,2018-01-02 10:00:00,0.587685
3,29.0,Paris,faxiqkt xggzmwzoidbg,6,True,False,2018-01-04 20:00:00,0.877527,81,MAN,AB,True,2018-01-03 12:00:00,0.894073
4,99.0,Lille,znwhlj rwzdutnagwasy,79,True,True,2018-01-04 09:00:00,0.447389,88,WOMAN,O,True,2018-01-01 10:00:00,0.026831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,zeqhcikzdodus jn qjf,62,True,True,2018-01-02 13:00:00,0.953184,53,MAN,AB,False,2018-01-02 05:00:00,0.788560
96,98.0,Marseille,iicthcvfmkajbvr gzir,49,False,True,2018-01-02 21:00:00,0.442283,35,MAN,,True,2018-01-05 02:00:00,0.402979
97,21.0,Lille,ztjakcsk bhjoksdz lm,14,False,False,2018-01-02 06:00:00,0.988543,67,MAN,B,False,2018-01-01 12:00:00,
98,42.0,Marseille,sabunaa opt vpulnxj,10,True,False,2018-01-02 01:00:00,0.059791,48,MAN,B,True,2018-01-02 09:00:00,0.651801


In [12]:
new_views_name

{'file1': {'1': 'file1.1', 'time': 'file1.time'},
 'file2': {'1': 'file2.1', 'time': 'file2.time'}}

In [13]:
# convert joined data frame into muti view (multi index dataframe)

new_feature_name = { v: list(pre_parsed_dataset_to_check[v].columns) for v in views_names}
new_feature_name


create_dictionary_multi_view_dataset(df_joined, new_feature_name, primary_key=primary_key)


{'file1':      e file1.1  2           file1.time  pressure e.1 gender blood type
 0   16       0  0  2018-01-03 04:00:00   0.98667  98  WOMAN          A
 1   96       1  1  2018-01-02 04:00:00  0.996889  35    MAN         AB
 2    8       1  1  2018-01-01 09:00:00  0.777026  65    MAN          A
 3    6       1  0  2018-01-04 20:00:00  0.877527  81    MAN         AB
 4   79       1  1  2018-01-04 09:00:00  0.447389  88  WOMAN          O
 ..  ..     ... ..                  ...       ...  ..    ...        ...
 95  62       1  1  2018-01-02 13:00:00  0.953184  53    MAN         AB
 96  49       0  1  2018-01-02 21:00:00  0.442283  35    MAN        NaN
 97  14       0  0  2018-01-02 06:00:00  0.988543  67    MAN          B
 98  10       1  0  2018-01-02 01:00:00  0.059791  48    MAN          B
 99  89       1  1  2018-01-03 22:00:00  0.939352  13    MAN          B
 
 [100 rows x 8 columns],
 'contatct':    discrete       city
 0      64.0      Lille
 1      26.0      Lille
 2      61.0    

In [33]:
new_feature_name

{'file1': ['e',
  'file1.1',
  '2',
  'file1.time',
  'pressure',
  'e.1',
  'gender',
  'blood type',
  'pkey'],
 'contatct': ['discrete', 'city', 'pkey'],
 'file2': ['file2.1', 'file2.time', 'pH', 'pkey']}

In [89]:
exception_collector = []  #collects Exception

# Data sanity check

warning_report = WarningReportLogger(disclosure=3)

for view in views_names:
    print(view)
    
    feature_names = list(multi_format_file_ref[view].keys())
    for n_feature_name, feature_name in zip(new_feature_name[view], feature_names):
        # new_feature_ame is the result of the join operation
        check_variable_compliance(df_joined[n_feature_name], multi_format_file_ref[view][feature_name])
        data_format = multi_format_file_ref[view][feature_name].get('data_format')
        
        
        check_missing_entry_format_file_ref(multi_format_file_ref[view][feature_name],
                                            warning_report, 'lol', feature_name)
        
        check_missing_values(multi_format_file_ref[view][feature_name], 
                             df_joined[n_feature_name],
                            warning_report, exception_collector)
        if data_format == DataType.DATETIME.name:
            # addtional check for DATETIME data format
            check_datetime_variable_compliance( df_joined[n_feature_name], warning_report)
            
        if data_format == DataType.KEY.name:
            check_key_variable_compliance(df_joined[n_feature_name])
            
        print(warning_report.get_report())
        
if exception_collector:
    # case where exception collector is not empty
    raise DataSanityCheckException(exception_collector)
    

file1


NameError: name 'new_feature_name' is not defined

In [27]:
warning_report = WarningReportLogger(disclosure=3)

checker = PreProcessingChecker(multi_format_file_ref, df_joined,'jdl' , warning_report)

checker.update_views_features_name(new_views_name)
checker.check_all()

checker.get_warning_logger()

features names updated
N_SAMPLES_BELOW_THRESHOLD
file1 e
file1 e
INCORRECT_FORMAT_FILE
file1 e
DATA_TYPE_MISMATCH
INCORRECT_DATA_TYPE
file1 e
MISSING_DATA_ALLOWED
file1 e
OUTLIER_DETECTION_LOWER_BOUND
file1 e
OUTLIER_DETECTION_UPPER_BOUND
file1 e
INCORRECT_VALUES_CATEGORICAL_DATA
file1 1
file1 1
file1 1
file1 1
file1 1
file1 1
file1 1
file1 2
file1 2
file1 2
file1 2
file1 2
file1 2
file1 2
file1 time
file1 time
file1 time
file1 time
file1 time
file1 time
file1 time
file1 time
INCORRECT_DATETIME_DATA
file1 pressure
file1 pressure
file1 pressure
file1 pressure
file1 pressure
file1 pressure
file1 pressure
file1 e.1
file1 e.1
file1 e.1
file1 e.1
file1 e.1
file1 e.1
file1 e.1
file1 gender
file1 gender
file1 gender
file1 gender
file1 gender
file1 gender
file1 gender
file1 blood type
file1 blood type
file1 blood type
file1 blood type
file1 blood type
file1 blood type
file1 blood type
file1 pkey
file1 pkey
file1 pkey
file1 pkey
file1 pkey
file1 pkey
file1 pkey
contatct discrete
contatct discre

{'N_SAMPLES_BELOW_THRESHOLD': [{'feature': 'ALL',
   'msg': 'Test passed',
   'view': 'file1',
   'success': True},
  {'feature': 'ALL',
   'msg': 'Test passed',
   'view': 'contatct',
   'success': True},
  {'feature': 'ALL', 'msg': 'Test passed', 'view': 'file2', 'success': True}],
 'INCORRECT_FORMAT_FILE': [{'feature': 'e',
   'msg': 'Test passed',
   'view': '',
   'success': True},
  {'feature': '1', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': '2', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'time', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'pressure', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'e.1', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'gender', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'blood type', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature': 'pkey', 'msg': 'Test passed', 'view': '', 'success': True},
  {'feature':

In [104]:
checker._warning_logger.raise_exception()

DataSanityCheckException: MissingDataException: Variable pH must not have missing data, but some were found

In [117]:
pd.DataFrame(checker.get_warning_logger()['INCORRECT_FORMAT_FILE'])

Number of error: 1


Unnamed: 0,feature,msg,view,success
0,e,Test passed,,True
1,1,Test passed,,True
2,2,Test passed,,True
3,time,Test passed,,True
4,pressure,Test passed,,True
5,e.1,Test passed,,True
6,gender,Test passed,,True
7,blood type,Test passed,,True
8,pkey,Test passed,,True
9,discrete,Test passed,,True


In [93]:
exception_collector

[__main__.MissingDataException('Variable pH must not have missing data, but some were found')]

In [54]:
check_correct_variable_sub_type(multi_format_file_ref[view][feature_name],
                                df_joined[n_feature_name],
                                            warning_report,)

DATA_TYPE_MISMATCH
INCORRECT_DATA_TYPE


True

In [76]:
warning_report.get_report()

{}

In [11]:
class WarningReportLogger:
    def __init__(self, disclosure:int):
        self._disclosure = disclosure
        
        
        self._report = {}
        self._current_entry = None
        self._n_warnings = 1
        self._n_exception = 1
        self._n_feature = 1
        self._saved_msg = None
        self._exception_collector = []

        
    def write_new_entry(self, check: PreProcessingChecks):
        self._current_entry = check.name
        if check.name not in self._report:
            class WarningReportLogger:
    def __init__(self, disclosure:int):
        self._disclosure = disclosure
        
        
        self._report = {}
        self._current_entry = None
        self._n_warnings = 1
        self._n_exception = 1
        self._n_feature = 1
        self._saved_msg = None
        self._exception_collector = []

        
    def write_new_entry(self, check: PreProcessingChecks):
        self._current_entry = check.name
        if check.name not in self._report:
            
            #self._current_entry = checkand
            
            if self._disclosure < 2:
                if isinstance(check, PreProcessingChecks):
                    self._current_entry = 'Warning_' + str(self._n_warnings)
                    self._n_warnings += 1
                
                elif issubclass(check.warning_type, Exception):
                    self._current_entry = 'Error_' + str(self._n_exception)
                    self._n_exception += 1
                else:
                    print("input not understood")
            
            print(self._current_entry)
            self._report[self._current_entry] = []
        
    def write_checking_result(self, success: bool=None, msg:str='', feature_name:str='', view_name: str=''):
        
        _new_entry = {}
        
        if success :
            msg = "Test passed"
        elif success is None:
            msg = 'Test skipped'
        if self._disclosure > 2:
            _new_entry['feature'] = feature_name
            _new_entry['msg'] = msg
        else:
            _new_entry['feature'] = 'feature_' + str(self._n_feature)
            _new_entry['msg']= ''
            self._n_feature += 1
        _new_entry['view'] = view_name
        _new_entry['success'] = success
        self._report[self._current_entry].append(_new_entry)
        
    

    def get_report(self):
        print(f'number of warnings: {self._n_warnings}\nNumber of error: {self._n_exception}')
        return self._report
    
    def clean_report(self):
        self._report = {}
    def add_exception(self, exception: Exception):
        self._exception_collector.append(exception)
        
    def raise_exception(self):
        if self._exception_collector:
            # case where exception collector is not empty
            raise DataSanityCheckException(self._exception_collector)
            #self._current_entry = checkand
            
            if self._disclosure < 2:
                if isinstance(check, PreProcessingChecks):
                    self._current_entry = 'Warning_' + str(self._n_warnings)
                    self._n_warnings += 1
                
                elif issubclass(check.warning_type, Exception):
                    self._current_entry = 'Error_' + str(self._n_exception)
                    self._n_exception += 1
                else:
                    print("input not understood")
            
            print(self._current_entry)
            self._report[self._current_entry] = []
        
    def write_checking_result(self, success: bool=None, msg:str='', feature_name:str='', view_name: str=''):
        
        _new_entry = {}
        
        if success :
            msg = "Test passed"
        elif success is None:
            msg = 'Test skipped'
        if self._disclosure > 2:
            _new_entry['feature'] = feature_name
            _new_entry['msg'] = msg
        else:
            _new_entry['feature'] = 'feature_' + str(self._n_feature)
            _new_entry['msg']= ''
            self._n_feature += 1
        _new_entry['view'] = view_name
        _new_entry['success'] = success
        self._report[self._current_entry].append(_new_entry)
        
    

    def get_report(self):
        print(f'number of warnings: {self._n_warnings}\nNumber of error: {self._n_exception}')
        return self._report
    
    def clean_report(self):
        self._report = {}
    def add_exception(self, exception: Exception):
        self._exception_collector.append(exception)
        
    def raise_exception(self):
        if self._exception_collector:
            # case where exception collector is not empty
            raise DataSanityCheckException(self._exception_collector)

In [51]:
if False is None:
    print('l')

In [45]:
warning_report = WarningReportLogger(disclosure=3)

warning_report.write_new_entry(PreProcessingChecks.INCORRECT_DATA_TYPE)
warning_report.write_checking_result(False, PreProcessingChecks.INCORRECT_DATA_TYPE.error_message)
warning_report.write_checking_result(False, PreProcessingChecks.INCORRECT_DATA_TYPE.error_message)
return self
warning_report.write_new_entry(PreProcessingChecks.INCORRECT_DATETIME_DATA)
warning_report.write_checking_result(False, PreProcessingChecks.INCORRECT_DATETIME_DATA.error_message)
warning_report.get_report()

INCORRECT_DATA_TYPE
INCORRECT_DATETIME_DATA


{'INCORRECT_DATA_TYPE': [{'feature': '',
   'msg': 'Incorrect Data Type for variable %s: Excpected %s but found %s',
   'view': '',
   'success': False},
  {'feature': '',
   'msg': 'Incorrect Data Type for variable %s: Excpected %s but found %s',
   'view': '',
   'success': False}],
 'INCORRECT_DATETIME_DATA': [{'feature': '',
   'msg': 'Variable %s has been defined as a DATETIME variable, but samples are not parsable as date',
   'view': '',
   'success': False}]}

In [97]:
" ".join(['q', 'e', 'f'])

'q e f'

In [10]:
from enum import Enum

class MissingDataException(Exception):
    def __init__(self,message:str=""):
        self._message = message
        super().__init__(message)
        
    def __str__(self):
        return 'MissingDataException: ' + self._message

class MinimumSamplesViolatedException(Exception):
    def __init__(self, message: str=""):
        self._message = message
        super().__init__(message)
        
    def __str__(self):
        return 'MinimumSamplesViolatedException: ' + self._message
        
class MissingFeatureException(Exception):
    def __init__(self, message: str = ""):
        self._message = message
        super().__init__(message)
        
    def __str__(self):
        return 'MissingFeatureException: ' + self._message
    
class MissingViewException(Exception):
    def __init__(self, message: str=""):
        self._message = message
        super().__init__(message)
        
    def __str__(self):
        return 'MissingViewException' + self._message

class DataSanityCheckException(Exception):
    def __init__(self, exceptions: List[Exception]):
        message  = "\n".join([str(exception) for exception in exceptions])
        super().__init__(message)
    
class WarningType(Enum):
    REGULAR_WARNING = 1
    CRITICAL_WARNING = 2

class PreProcessingChecks(Enum):
    INCORRECT_FORMAT_FILE = ("Format File %s is incorrect: cannot parse variable %s", WarningType.CRITICAL_WARNING,
                            )
    KEY_UNICITY_VIOLATED = ("Key Variable %s violated unicity of data", WarningType.CRITICAL_WARNING, 
                           )
    MISSING_DATA_NOT_ALLOWED = ("Variable %s must not have missing data, but some were found",
                                MissingDataException, 
                               )
    MISSING_DATA_ALLOWED = ("Missing data found in variable %s", WarningType.REGULAR_WARNING 
                           )
    INCORRECT_STRUCTURE_DATA_TYPE = ("Data Type %s has an incorrect structure: %s", WarningType.CRITICAL_WARNING)
    DATA_TYPE_MISMATCH = ("Data Type  %s mismatch: %s is not a subtype of %s",
                           WarningType.REGULAR_WARNING)
    
    INCORRECT_DATA_TYPE = ('Variable named %s should be a %s variable, but it contains %s type',
                          WarningType.REGULAR_WARNING)
    INCORRECT_DATETIME_DATA = ("Variable %s has been defined as a DATETIME variable, but samples are not parsable as date",
                               WarningType.CRITICAL_WARNING)
    
    OUTLIER_DETECTION_LOWER_BOUND = ("Detected outliers for Variable %s: samples violate lower bound %s",
                                   WarningType.CRITICAL_WARNING)
    
    OUTLIER_DETECTION_UPPER_BOUND = ("Detected outliers for Varaiable %s: samples violate upper bound %s",
                                   WarningType.CRITICAL_WARNING)
    
    INCORRECT_VALUES_CATEGORICAL_DATA = ("Found at least one sample with incorrect label in Categorical Vraiable %s. Expected data are %s, but found %s",
                                        WarningType.CRITICAL_WARNING)
    
    N_MISSING_DATA_ABOVE_THRESHOLD = ("Found too many missing samples in variable %s, threshold is set at %s",
                                     WarningType.CRITICAL_WARNING)
    
    N_SAMPLES_BELOW_THRESHOLD = ("Number of samples contained in dataset %s is below threshold (expected at least %s samples, found %s samples)",
                                MinimumSamplesViolatedException)
    MISSING_FEATURE = ("Feature %s has not been found in dataset, but is needed for experiment",
                       MissingFeatureException)
    
    #MISSING_VIEW = ("View %s not found in dataset, but needed for experiment")
    
    def __init__(self, message: str,  warning_type: Union[WarningType, Exception]):
        self._message = message
        #self._additional_message = additional_message
        self._warning_type = warning_type
        #self._is_exception = is_exception
        
    @property
    def error_message(self):
        return self._message

    @property
    def warning_type(self):
        return self._warning_type
    
    @property
    def additional_message(self):
        return self._additional_message
    
    def __call__(self, *kwargs) -> Union[str, Exception]:
        
        msg = self.error_message % kwargs
        if isinstance(self.warning_type, WarningType):
            
            return msg
        elif issubclass(self.warning_type, Exception):
            
            return self.warning_type(message=msg)

    
def raise_warning(warning: PreProcessingChecks, *kwargs) -> str:
    if isinstance(warning.warning_type, WarningType):
        #warning.warning_type.value(warning_disclosure)
        
        return warning.error_message % kwargs
    elif issubclass(warning.warning_type, Exception):
        
        raise warning(*kwargs)



In [29]:
PreProcessingChecks.N_SAMPLES_BELOW_THRESHOLD('2', '5', '55')

('2', '5', '55')


__main__.MinimumSamplesViolatedException('Number of samples contained in dataset 2 is below threshold (expected at least 5 samples, found 55 samples)')

In [88]:
str(MissingDataException('djkfff'))

'MissingDataException: djkfff'

In [55]:
PreProcessingChecks.MISSING_DATA_ALLOWED.warning_type



In [31]:
raise_warning(PreProcessingChecks.MISSING_DATA_ALLOWED, 'kdld')

'Missing data found in variable kdld'

In [46]:
PreProcessiINCORRECT_DATETIME_DATngChecks.MISSING_DATA_ALLOWED.name

'MISSING_DATA_ALLOWED'

In [45]:
hasattr(PreProcessingChecks.MISSING_DATA_ALLOWED.name, 'WarningType')

False

In [30]:
def func(*kwargs:str):
    print(*kwargs)
    
func('jfkfl', 'djk')

jfkfl djk


class PreProcessingChecker:
    def check_...
    def check_all(view=None, feature=None)
    

In [26]:
MIN_NB_SAMPLES = 30  # TODO: specify it in data formt file (clinician should define it)

class PreProcessingChecker:
    def __init__(self, file_format_ref:dict,
                data_frame: pd.DataFrame,
                file_format_ref_name:str, 
                warning_logger: WarningReportLogger):
        self._file_format_ref = file_format_ref
        self._data_frame = data_frame
        self._file_format_ref_name = file_format_ref_name
        self._warning_logger = warning_logger
        self._new_features_name = None
        
        self._view_feature_names = {v: [f for f in file_format_ref[v].keys()] for v in file_format_ref.keys()}
        self._features = None
        
        self._warning_logger.clean_report()
    
    def _get_all_features(self, view:str) -> List[str]:
        return self._view_feature_names.get(view)
    
    def _get_feature_defined_in_format_file(self, view:str, feature:str)-> str:
        #feature_name = self._file_format_ref[view][feature]
        print(view, feature)
        if self._new_features_name is not None:
            
            _new_features_name = self._new_features_name.get(view)
            
            if _new_features_name is not None and feature in _new_features_name.keys():
                feature = _new_features_name.get(feature)
                
        return feature
    
    def get_warning_logger(self):
        return self._warning_logger.get_report()
    
    def update_views_features_name(self, new_features_name: Dict[str, Dict[str, str]]):
        self._new_features_name = new_features_name
        for view in self._view_feature_names:
            if view in new_features_name:
                for former_feature_name, new_feature_name in new_features_name[view].items():
                    self._view_feature_names[view].append(new_feature_name)
                    self._view_feature_names[view].remove(former_feature_name)
        print('features names updated')
    
    def check_all(self, view:str=None, feature:str=None):
        
        if view is not None:
            _views = [view]
        
        else:
            _views = self._view_feature_names
        
        
        for _view in _views:
            # define here test that happens on whole dataset
            
            ###
            self.check_number_of_samples(MIN_NB_SAMPLES, _view)
            
            
            ####
            _features = self._file_format_ref[_view].keys()
            
            for _feature in _features:
                
                # check fi feature does exist
                _is_feature_exist = self.check_feature_exists_in_dataset(_view,
                                                                         _feature)
                if not _is_feature_exist:
                    continue
                
                #
                _is_format_file_correct = self.check_missing_entry_format_file_ref(_view,
                                                                                  _feature)
                if not _is_format_file_correct:
                    continue
                self.check_correct_variable_sub_type(_view, _feature)
                self.check_missing_values(_view, _feature)
                self.check_lower_bound(_view, _feature)
                self.check_upper_bound(_view, _feature)
                self.check_values_in_categorical_variable(_view, _feature)
                
                _feature_data_type = self._file_format_ref[_view][_feature]['data_type']
                
                if _feature_data_type == DataType.DATETIME.name:
                    self.check_datetime_variable_compliance(_view,
                                                           _feature)
                    
                if _feature_data_type == DataType.KEY.name:
                    self.check_key_variable_compliance(_view,
                                                      _feature)
    
    def check_number_of_samples(self, min_nb_samples: int, view_name: str='') -> bool:
        #Checking samples limit

        feature_name ='ALL' 
        sample_count = self._data_frame.shape[0]
           

        self._warning_logger.write_new_entry(PreProcessingChecks.N_SAMPLES_BELOW_THRESHOLD)
        if sample_count < min_nb_samples:
            success = False
            try:
                warning_msg = raise_warning(PreProcessingChecks.N_SAMPLES_BELOW_THRESHOLD,
                                            view_name, min_nb_samples, sample_count)
            except MinimumSamplesViolatedException as err:
                print(err)
                self._warning_logger.add_exception(err)
                warning_msg = str(err)
            #message = critical_warning.display(f'Samples count exceeds the threshold limit {MIN_NB_SAMPLES}')
        else:
            success = True
            warning_msg='Test passed'

        self._warning_logger.write_checking_result(success,
                                                   warning_msg,
                                                   feature_name,
                                                   view_name)

        return success
    
    def check_feature_exists_in_dataset(self,
                                        view:str,
                                     feature_name: str) -> bool:
        renamed_feature_name = self._get_feature_defined_in_format_file(view, feature_name)
        if renamed_feature_name in self._data_frame.columns:
            success = True
        else:
            success = False
            self._warning_logger.write_new_entry(PreProcessingChecks.MISSING_FEATURE)
            try:
                raise_warning(PreProcessingChecks.MISSING_FEATURE,
                                        feature_name)
            except MissingFeatureException as exc:
                warning_msg = str(exc)
                self._warning_logger.add_exception(exc)
            self._warning_logger.write_checking_result(success, warning_msg, feature_name)
            
        return success
    
    def check_missing_entry_format_file_ref(self, view:str,
                                          feature_name:str) -> bool:
        """Tests if format file ref is parsable"""
        
        success = True
        warning_msg = 'Test passed'
        
        renamed_feature_name = self._get_feature_defined_in_format_file(view, feature_name)
        
        _view_format_file = self._file_format_ref[view]
        _feature_format_file = _view_format_file.get(feature_name)
        if _feature_format_file is not None:
            _data_format_name = _feature_format_file.get('data_format')
            _data_type_name = _feature_format_file.get('data_type')
        else:
            _data_format_name, _data_type_name = None, None

        self._warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_FORMAT_FILE)
        if _data_format_name is None or _data_type_name is None:
            #success: bool, msg:str='', feature_name:str='
            warning_msg = raise_warning(PreProcessingChecks.INCORRECT_FORMAT_FILE,
                                    self._file_format_ref_name,
                                    renamed_feature_name)

            success = False
        self._warning_logger.write_checking_result(success,
                                             warning_msg,
                                             feature_name)  
        return success
    
    def check_correct_variable_sub_type(self, 
                                        view_name:str,
                                    feature_name:str,
                                    ) -> bool:
        """checks consistancy between general data type and subtype"""
        
        renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        
        column = self._data_frame[renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        success = True
        warning_msg = 'test passed'
        data_format_name = _feature_format_ref.get('data_format')
        data_type_name = _feature_format_ref.get('data_type')

        #feature_name = column.name

        # first test
        self._warning_logger.write_new_entry(PreProcessingChecks.DATA_TYPE_MISMATCH)
        if data_format_name is None or data_type_name is None:

            warning_msg = 'test skipped'
        else:
            try:
                data_type = utils.find_data_type(data_format_name, data_type_name)
                warning_msg = 'test passed'
            except ValueError as err:
                warning_msg = raise_warning(PreProcessingChecks.DATA_TYPE_MISMATCH, 
                                           data_format_name, data_type_name)
                success = False

        self._warning_logger.write_checking_result(success, warning_msg, feature_name)

        # second test 
        self._warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_DATA_TYPE)
        if data_format_name is None or data_type_name is None:
            warning_msg = 'test skipped'
        else:
            actual_dtype = column.dtype


            _does_column_have_correct_data_type = any(t == actual_dtype for t in data_type.value)
            if not _does_column_have_correct_data_type:
                warning_msg = raise_warning(PreProcessingChecks.INCORRECT_DATA_TYPE, 
                                           feature_name, data_type_name, str(actual_dtype))
                success = False
            else:
                warning_msg = 'test passed'
            self._warning_logger.write_checking_result(success, warning_msg, feature_name)

        return success
    
    
    def check_missing_values(self, 
                             view_name: str,
                         feature_name: str) -> bool:
        """checks if missing data are present in column, and triggers error depending
        of the fact that missing data are whether allowed or not in the format_ref_file"""
        
        renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        _is_missing_data = utils.check_missing_data(_column)
        _is_missing_values_authorized = _feature_format_ref.get('is_missing_values', 'test_skipped')
        success = True

        
        self._warning_logger.write_new_entry(PreProcessingChecks.MISSING_DATA_ALLOWED)

        if _is_missing_values_authorized == 'test_skipped':
            warning_msg = 'Test skipped'
            success = None
        elif _is_missing_data:
            success = False
            # test fails: 
            if _is_missing_values_authorized:
                # case where missing values are present BUT allowed
                warning_msg = raise_warning(PreProcessingChecks.MISSING_DATA_ALLOWED,
                                           feature_name)
            else:
                # case where missing values are present AND NOT allowed
                try:
                    warning_msg = raise_warning(PreProcessingChecks.MISSING_DATA_NOT_ALLOWED,
                                               feature_name)
                except MissingDataException as err:
                    print(err)
                    self._warning_logger.add_exception(err)
                    warning_msg = str(err)
        else:
            # test passed
            warning_msg = 'Test passed'

        self._warning_logger.write_checking_result(success, warning_msg, feature_name)

        return success
    
    
    def check_lower_bound(self,
                          view_name:str,
                          feature_name:str) -> bool:
    
        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[_renamed_feature_name]
        
        # remove nan (missing values) from vriable 
        _column_without_nan = _column.dropna()
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        lower_bound = _feature_format_ref.get('lower_bound')

        self._warning_logger.write_new_entry(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND)
        if lower_bound is not None:

            # should work for both numerical and datetime data types

            is_lower_bound_correct = np.all(_column_without_nan >= lower_bound)


            if not is_lower_bound_correct:
                warning_msg = raise_warning(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND,
                                               feature_name, lower_bound)
            else:
                warning_msg = 'Test passed'
        else:
            warning_msg = 'Test skipped'
            is_lower_bound_correct = None
        self._warning_logger.write_checking_result(is_lower_bound_correct, warning_msg, feature_name)

        return is_lower_bound_correct
    
    
    def check_upper_bound(self, view_name: str, feature_name:str) -> bool:
        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        
        _column = self._data_frame[_renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        
        # remove nan (missing values) from vriable 
        _column_without_nan = _column.dropna()
        upper_bound = _feature_format_ref.get('upper_bound')

        self._warning_logger.write_new_entry(PreProcessingChecks.OUTLIER_DETECTION_UPPER_BOUND)
        if upper_bound is not None:
             # should work for both numerical and datetime data sets
            is_upper_bound_correct = np.all(_column_without_nan <= lower_bound)

            if not is_upper_bound_correct:
                warning_msg = raise_warning(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND,
                                            feature_name, upper_bound)
            else:
                warning_nsg = 'Test passed'

        else:
            warning_msg = 'Test skipped'
            is_upper_bound_correct = None

        self._warning_logger.write_checking_result(is_upper_bound_correct, warning_msg, feature_name)
        return is_upper_bound_correct
    
    
    def check_values_in_categorical_variable(self, 
                                             view_name:str,
                                             feature_name:str)-> bool:
        """Checks if values are contained in categorical variables"""

        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[_renamed_feature_name]
        
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        categorical_values = _feature_format_ref.get('categorical_values')

        self._warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_VALUES_CATEGORICAL_DATA)
        if categorical_values is None:
            warning_msg = 'test skipped'
            success = None
        else:
            unique_values = utils.unique(column)
            success = True
            for val in unique_values:
                if val not in categorical_values and not np.isnan(val):
                    warning_msg = raise_warning(PreProcessingChecks.INCORRECT_VALUES_CATEGORICAL_DATA,
                                               feature_name, val, *categorical_values)
                    success = False
            if success:
                warning_msg = 'test passed'
        self._warning_logger.write_checking_result(success, warning_msg, feature_name)
        return success

    def check_missing_values_threshold(self,
                                       view_name: str,
                                   feature_name: str,
                                  threshold: int = 50) -> bool:
        #Checking if missing values exceed threshold limit(50%)
        
        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[_renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        
        min_nb_missing_data = math.ceil((threshold/100)*_column.shape[0])

        self._warning_logger.write_new_entry(PreProcessingChecks.N_MISSING_DATA_ABOVE_THRESHOLD)
        n_missing_data = _column.isnull().sum()
        if (n_missing_data>min_nb_missing_data):
            success = False
            #message = critical_warning.display(f'Missing value exceeds threshold limit {MIN_NB_MISSING_DATA}',col) 
            warning_msg = raise_warning(PreProcessingChecks.N_MISSING_DATA_ABOVE_THRESHOLD,
                                        feature_name, n_missing_data,
                                        min_nb_missing_data)
        else:
            success = True
            warning_msg ='Test passed'

        #report['check_missing_values_limit'] = report_details
        self._warning_logger.write_checking_result(success, warning_msg, feature_name)
        return success
    
    
    def check_key_variable_compliance(self, 
                                      view_name:str,
                                      feature_name:str) -> bool:
        """Performs data sanity check over variable of type `KEY`
        warning should be Critical warnings
        """
        # variables initialisation

        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[_renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        
        # 1. check unicity of values in column

        n_unique_samples = utils.unique(_column, number=True)
        n_samples = _column.shape[0]

        

        self._warning_logger.write_new_entry(PreProcessingChecks.KEY_UNICITY_VIOLATED)
        if n_unique_samples != n_samples:
            success = False
            warning_msg = raise_warning(PreProcessingChecks.KEY_UNICITY_VIOLATED,
                                       feature_name,)
        else:
            warning_msg = 'Test passed'
            success = True
        self._warning_logger.write_checking_result(success, warning_msg, feature_name)

        return success


    def check_datetime_variable_compliance(self,
                                           view_name:str,
                                           feature_name:str) -> bool:
        """additional data sanity checks for datetime variable"""
        # test 1. check if datetime is parsable
        
        _renamed_feature_name = self._get_feature_defined_in_format_file(view_name,
                                                                        feature_name)
        _column = self._data_frame[_renamed_feature_name]
        _feature_format_ref = self._file_format_ref[view_name][feature_name]
        
        # remove missing values (nan) from column
        _column_without_nan = _column.dropna()
        are_datetime_parsables =  np.all(_column_without_nan.apply(utils.is_datetime))
        
        self._warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_DATETIME_DATA)

        if not are_datetime_parsables:
            
            print('Warning: at least one variable is not a datetime')
            warning_msg = raise_warning(PreProcessingChecks.INCORRECT_DATETIME_DATA,
                                        feature_name)

        else:
            
            warning_msg = 'Test passed'
        self._warning_logger.write_checking_result(are_datetime_parsables,
                                                     warning_msg,
                                                     feature_name) 

        return are_datetime_parsables

In [19]:


def run_data_sanity_checks(data_format_ref: Dict[str, Dict[str, Any]],
                           data_frame: pd.DataFrame,
                           new_features_names : Dict[str, str],
                           view_name: str, 
                           data_format_ref_name: str,
                            warning_logger: WarningReportLogger):
    """
    Runs all data sanity checks
    Args:
    
     - data_frame: 
     
    """
    
    # run tests related to the whole dataset
    
    # data anity check : check if number of samples is above threshold
    check_number_of_samples(data_frame, MIN_NB_SAMPLES, view_name,print(sel)
                            warning_logger)
    # iterates over all features within a view
    for feature in data_format_ref:
        if feature in new_feature_name:
            # rename feature ( it has been renamed after join operation,
            # because 2 or more views have the same feature name)
            feature = new_feature_name.get(feature)
        # data sanity check : check if format file is parsable
        _is_format_file_correct = check_missing_entry_format_file_ref(data_format_ref[feature],
                                           feature,
                                            data_format_ref_name,
                                           warning_logger)
        
        if _is_format_file_correct:
            # can not parse the format_file_ref, so skipping next data sanity check
            continue
        # data sanity check: check if feature defined in format_file_ref, 
        # is also present in data_frame
        check_feature_exists_in_dataset(data_frame, 
                                       feature,
                                       warning_logger)
        
        # data sanity check: check if type and sub type are consistants
        check_correct_variable_sub_type(data_format_ref[feature],
                                        data_frame[feature],
                                    warning_logger)
        #check_missing_entry_format_file_ref()
    
        # data sanity check: check if for the specified feature, missing data
        # are allowed
        check_missing_values(data_format_ref[feature],
                             data_frame[feature],
                             warning_logger)
        
        # data sanity check: check if number of missing value in
        # feature is below threshold
def check_feature_exists_in_dataset(data_frame: pd.DataFrame,
                                     feature_name: str,
                                    warning_logger: WarningReportLogger) -> bool:
    
    if feature_name in data_frame.column:
        success = True
    else:
        success = False
        warning_logger.write_new_entry(PreProcessingChecks.MISSING_FEATURE)
        try:
            raise_warning(PreProcessingChecks.MISSING_FEATURE,
                                    feature_name)
        except MissingDataException as exc:
            warning_msg = str(exc)
        warning_logger.write_checking_result(success, warning_msg, feature_name)
        warning_logger.add_exception(exc)
        
    return success

def check_key_variable_compliance(column: pd.Series,
                                  warning_logger: WarningReportLogger=None) -> bool:
    """performs data sanity check over variable of type `KEY`
    warning should be Critical warnings
    """
    # variables initialisation
    
    
    # 1. check unicity of values in column
    
    n_unique_samples = utils.unique(column, number=True)
    n_samples = column.shape[0]_new_features_name.keys()
    
    feature_name = column.name
    
    warning_logger.write_new_entry(PreProcessingChecks.KEY_UNICITY_VIOLATED)
    if n_unique_samples != n_samples:
        success = False
        warning_msg = raise_warning(PreProcessingChecks.KEY_UNICITY_VIOLATED,
                                   feature_name,)
    else:
        warning_msg = 'Test passed'
        success = True
    warning_logger.write_checking_result(success, warning_msg, feature_name)
                
    return success


def check_datetime_variable_compliance(column: pd.Series, 
                                      warning_logger: WarningReportLogger):
    """additional data sanity checks for datetime variable"""
    # test 1. check if datetime is parsable
    
    # remove nan
    column_without_nan = column.dropna()
    are_datetime_parsables =  np.all(column.apply(utils.is_datetime))
    feature_name = column.namenew_features_name
    warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_DATETIME_DATA)
    
    if not are_datetime_parsables:
        success = False
        print('Warning: at least one variable is not a datetime')
        warning_msg = raise_warning(PreProcessingChecks.INCORRECT_DATETIME_DATA,
                                    feature_name)
        
    else:
        success = True
        warning_msg = 'Test passed'
    warning_logger.write_checking_result(success,
                                     warning_msg,
                                     feature_name) 
    
    return success

def check_missing_entry_format_file_ref(format_file_ref: Dict[str, Any],
                                          feature_name:str='',
                                        format_file_ref_name:str='',
                                       warning_logger: WarningReportLogger=None) -> bool:
    """Tests if format file ref is parsable"""
    data_format_name = format_file_ref.get('data_format')
    data_type_name = format_file_ref.get('data_type')
    success = True
    warning_msg = 'Test passed'
    
    warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_FORMAT_FILE)
    if data_format_name is None or data_type_name is None:
        #success: bool, msg:str='', feature_name:str='
        warning_msg = raise_warning(PreProcessingChecks.INCORRECT_FORMAT_FILE,
                                format_file_ref_name,
                                feature_name)
        
        success = False
    warning_logger.write_checking_result(success,
                                         warning_msg,
                                         feature_name)  
    return success
    
def check_correct_variable_sub_type(format_file_ref: Dict[str, Any],
                                    column: pd.Series,
                                    warning_logger: WarningReportLogger,
                                    ) -> bool:
    """checks consistancy between general data type and subtype"""
    success = True
    warning_msg = 'test passed'
    data_format_name = format_file_ref.get('data_format')
    data_type_name = format_file_ref.get('data_type')
    
    feature_name = column.name
    
    # first test
    warning_logger.write_new_entry(PreProcessingChecks.DATA_TYPE_MISMATCH)
    if data_format_name is None or data_type_name is None:
        
        warning_msg = 'test skipped'
    else:
        try:
            data_type = utils.find_data_type(data_format_name, data_type_name)
            warning_msg = 'test passed'
        except ValueError as err:
            warning_msg = raise_warning(PreProcessingChecks.DATA_TYPE_MISMATCH, 
                                       data_format_name, data_type_name)
            success = False
    
    self._warning_logger.write_checking_result(success, warning_msg, feature_name)
    
    # second test 
    warning_logger.write_new_entry(PreProcessingChecks.INCORRECT_DATA_TYPE)
    if data_format_name is None or data_type_name is None:
        warning_msg = 'test skipped'
    else:
        actual_dtype = column.dtype
    
    
        _does_column_have_correct_data_type = any(t == actual_dtype for t in data_type.value)
        if not _does_column_have_correct_data_type:
            warning_msg = raise_warning(PreProcessingChecks.INCORRECT_DATA_TYPE, 
                                       feature_name, data_type_name, str(actual_dtype))
            success = False
        else:
            warning_msg = 'test passed'
        warning_logger.write_checking_result(success, warning_msg, feature_name)
            
    return success


def check_missing_values(format_file_ref: Dict[str, Any],
                         column: pd.Series,
                         warning_logger:WarningReportLogger) -> bool:
    """checks if missing data are present in column, and triggers error depending
    of the fact that missing data are whether allowed or not in the format_ref_file"""
    warning_logger: WarningReportLogger
    _is_missing_data = utils.check_missing_data(column)
    _is_missing_values_authorized = format_file_ref.get('is_missing_values', 'test_skipped')
    success = True
    
    feature_name = column.name
    warning_report.write_new_entry(PreProcessingChecks.MISSING_DATA_ALLOWED)
    
    if _is_missing_values_authorized == 'test_skipped':
        warning_msg = 'Test skipped'
        success = None
    elif _is_missing_data:
        success = False
        # test fails: 
        if _is_missing_values_authorized:
            # case where missing values are present BUT allowed
            warning_msg = raise_warning(PreProcessingChecks.MISSING_DATA_ALLOWED,
                                       feature_name)
        else:
            # case where missing values are present AND NOT allowed
            try:
                warning_msg = raise_warning(PreProcessingChecks.MISSING_DATA_NOT_ALLOWED,
                                           feature_name)
            except MissingDataException as err:
                print(err)
                warning_logger.add_exception(err)
                warning_msg = str(err)
    else:
        # test passed
        warning_msg = 'Test passed'
    
    warning_report.write_checking_result(success, warning_msg, feature_name)
        
    return success


def check_lower_bound(format_file_ref: Dict[str, Any],
                      column: pd.Series,
                      warning_logger:WarningReportLogger) -> bool:
    
    feature_name = column.name
    # remove nan (missing values) from vriable 
    column_without_nan = column.dropna()
    
    lower_bound = format_file_ref.get('lower_bound')
    
    warning_report.write_new_entry(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND)
    if lower_bound is not None:
        
        # should work for both numerical and datetime data types
        
        is_lower_bound_correct = np.all(column_without_nan >= lower_bound)
        
            
        if not is_lower_bound_correct:
            warning_msg = raise_warning(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND,
                                           feature_name, lower_bound)
        else:
            warning_msg = 'Test passed'
    else:
        warning_msg = 'Test skipped'
        is_lower_bound_correct = None
    warning_logger.write_checking_result(is_lower_bound_correct, warning_msg, feature_name)
    
    return is_lower_bound_correct

def check_upper_bound(format_file_ref: Dict[str, Any],
                      column: pd.Series,
                      warning_logger:WarningReportLogger) -> bool:
    feature_name = column.name
    # remove nan (missing values) from vriable 
    column_without_nan = column.dropna()
    upper_bound = format_file_ref.get('upper_bound')
    
    warning_report.write_new_entry(PreProcessingChecks.OUTLIER_DETECTION_UPPER_BOUND)
    if upper_bound is not None:
         # should work for both numerical and datetime data sets
        is_upper_bound_correct = np.all(column_without_nan <= lower_bound)
        
        if not is_upper_bound_correct:
            warning_msg = raise_warning(PreProcessingChecks.OUTLIER_DETECTION_LOWER_BOUND,
                                        feature_name, upper_bound)
        else:
            warning_nsg = 'Test passed'
            
    else:
        warning_msg = 'Test skipped'
        is_upper_bound_correct = None
        
    warning_logger.write_checking_result(is_upper_bound_correct, warning_msg, feature_name)
    return is_upper_bound_correct


def check_values_in_categorical_variable(format_file_ref: Dict[str, Any],
                                        column: pd.Series,
                                        warning_logger:WarningReportLogger)-> bool:
    """Checks if values are contained in categorical variables"""
    
    
    feature_name = column.name
    categorical_values = format_file_ref.get('categorical_values')
    
    warning_report.write_new_entry(PreProcessingChecks.INCORRECT_VALUES_CATEGORICAL_DATA)
    if categorical_values is None:
        warning_msg = 'test skipped'
        success = None
    else:
        unique_values = utils.unique(column)
        success = True
        for val in unique_values:
            if val not in categorical_values and not np.isnan(val):
                warning_msg = raise_warning(PreProcessingChecks.INCORRECT_VALUES_CATEGORICAL_DATA,
                                           feature_name, val, *categorical_values)
                success = False
        if success:
            warning_msg = 'test passed'
    warning_logger.write_checking_result(success, warning_msg, feature_name)
    return success





def check_missing_values_threshold(column: pd.Series,
                                   warning_logger :WarningReportLogger,
                                  threshold: int = 50) -> bool:
    #Checking if missing values exceed threshold limit(50%)
    feature_name = column.name
    min_nb_missing_data = math.ceil((threshold/100)*column.shape[0])

    warning_logger.write_new_entry(PreProcessingChecks.N_MISSING_DATA_ABOVE_THRESHOLD)
    n_missing_data = data[col].isnull().sum()
    if (n_missing_data>min_nb_missing_data):
        success = False
        #message = critical_warning.display(f'Missing value exceeds threshold limit {MIN_NB_MISSING_DATA}',col) 
        warning_msg = raise_warning(PreProcessingChecks.N_MISSING_DATA_ABOVE_THRESHOLD,
                                    feature_name, n_missing_data,
                                    min_nb_missing_data)
    else:
        success = True
        warning_msg ='Test passed'
   
    #report['check_missing_values_limit'] = report_details
    warning_logger.write_checking_result(success, warning_msg, feature_name)
    return success


def check_number_of_samples(data: Union[pd.DataFrame, pd.Series],
                            min_nb_samples: int,
                            view_name: str='',
                            warning_logger:CustomWarning=None) -> bool:
    #Checking samples limit
    
    sample_count = data.shape[0]
    feature_name ='ALL'    
    
    warning_logger.write_new_entry(PreProcessingChecks.N_SAMPLES_BELOW_THRESHOLD)
    if sample_count> min_nb_samples:
        success = False
        try:
            warning_msg = raise_warning(PreProcessingChecks.N_SAMPLES_BELOW_THRESHOLD,
                                        view_name, min_nb_samples, sample_count)
        except MinimumSamplesViolatedException as err:
            print(err)
            
        #message = critical_warning.display(f'Samples count exceeds the threshold limit {MIN_NB_SAMPLES}')
    else:
        success = True
        warning_msg='Test passed'
        
    warning_logger.write_checking_result(success, warning_msg, feature_name, view_name)

    return success

def check_variable_compliance(column: pd.Series,
                               format_file_ref: Dict[str, Any],
                               col_name:str=None,
                               warning=None) -> Tuple[bool, bool]:
    """performs a data sanity check on variable `col_name` given instruction in 
    data_file_ref
    """
    is_test_passed = True
    
    
    data_format_name = format_file_ref.get('data_format')
    data_type_name = format_file_ref.get('data_type')
    # remove nan (missing values) from 
    column_without_nan = column.dropna()
    
    
    if data_format_name is None:
        print(f'critical wraning: data fromat {data_format_name} not understood')
    # 1. check data sub type
    try:
        data_type = utils.find_data_type(data_format_name, data_type_name)
    except ValueError as err:
        data_type = None
        print('Critical warning: data format and data type mismatch')
    does_column_have_correct_data_type = any(t for t in data_type.value)
    if not does_column_have_correct_data_type:
        print(f'error: data type {column.dtype} doesnot have the data type specified in format reference file')
    else:
        print('test 1 passed')

    # 2. check if missing values are allowed
    is_missing_data = utils.check_missing_data(column)
    is_missing_values_authorized = format_file_ref.get('is_missing_values', 'test_skipped')
    print('is_missing_values', is_missing_values_authorized, is_missing_data)
    if is_missing_values_authorized == 'test_skipped':
        print('missing_value test skipped')
    elif not is_missing_values_authorized and is_missing_data:

        print('Error found missing data but missing data are not authorized')
    else:
        print('test 2 passed')
    
    
    # 3. check lower bound
    print(format_file_ref)
    lower_bound = format_file_ref.get('lower_bound')
    
    if lower_bound is not None:
        
        # should work for both numerical and datetime data sets
        
        is_lower_bound_correct = np.all(column_without_nan >= lower_bound)
        
            
        if not is_lower_bound_correct:
            print('Warning: found some data below lower bound')
        else:
            print('test 3 passed')
    else:
        print('test 3 skipped ')
    # 4. check upper bound
    upper_bound = format_file_ref.get('upper_bound')
    if upper_bound is not None:
         # should work for both numerical and datetime data sets
        is_upper_bound_correct = np.all(column_without_nan <= lower_bound)
        
            
        if not is_upper_bound_correct:
            print('Warning: found some data  above upper bound')
        else:
            print('test 4 passed')
            
    else:
        print('test 4 skipped')
    # 5. check if possible_values are contained in variable
    categorical_values = format_file_ref.get('categorical_values')    
    if categorical_values is None:
        print('categorical value check test skipped')
    else:
        unique_values = utils.unique(column)
        _is_error_found = False
        for val in unique_values:
            if val not in categorical_values and not np.isnan(val):
                print(f'critical warning: {val} not in possible values')
                _is_error_found = True
        if not _is_error_found:
            print('test 5: passed')

In [None]:
def check_missing_data(column: pd.Series)->bool:
    is_missing_data = column.isna().any()
    return is_missing_data

check_missing_data()