# data sanity check

In [145]:
import pandas as pd
import pprint
import csv
import numpy as np
from typing import List, Tuple, Union, Dict, Any, Iterator, Optional, Callable
import os
from tabulate import tabulate
import json

import utils
from data_type import DataType

In [146]:
    

# utility functions for multi view dataframe
def rename_variables_before_joining(multi_view_datasets: Dict[str, pd.DataFrame],
                                    views_name: List[Union[str, int]],
                                    primary_key: Optional[Union[str, int]] = None) -> Dict[str, pd.DataFrame]:
    """
    Renames features that share the same name across different views.

    Naming convention: if `a` is the name of a feature of `view1` and `a` is also the
    name of a feature of `view2`, both are renamed into `view1.a` and `view2.a`.

    Args:
        multi_view_datasets: mapping view name -> dataframe of that view. The mapping is
            updated in place: renamed dataframes replace the original entries.
        views_name: names of the views to compare; each compared view must be a key
            of `multi_view_datasets`.
        primary_key: label of the primary key column, which is never renamed.
            `None` (default) means there is no primary key.

    Returns:
        The same `multi_view_datasets` mapping, with colliding features renamed.
    """
    # view name -> {old feature name: new feature name}
    _features_names = {}

    # parse every (left, right) combination of views exactly once
    for i_left, _left_view in enumerate(views_name[:-1]):
        _left_features_name = multi_view_datasets[_left_view].columns.tolist()

        for _right_view in views_name[i_left + 1:]:
            _right_features_name = set(multi_view_datasets[_right_view].columns)

            for _f in _left_features_name:
                # compare against None explicitly: a falsy label such as 0 is a valid key
                if primary_key is not None and _f == primary_key:
                    # do not affect primary key (if any)
                    continue
                if _f in _right_features_name:
                    # f-strings so that non-string view names / labels are supported
                    _features_names.setdefault(_left_view, {})[_f] = f'{_left_view}.{_f}'
                    _features_names.setdefault(_right_view, {})[_f] = f'{_right_view}.{_f}'

    for _view, _new_features in _features_names.items():
        multi_view_datasets[_view] = multi_view_datasets[_view].rename(columns=_new_features)

    return multi_view_datasets


def create_multi_view_dataframe_from_dictionary(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Builds a multi view dataframe (two-level column header) from a dictionary of views.

    The resulting columns are a `pd.MultiIndex` with levels ('views', 'feature_name').
    Values of all views are stacked side by side, in the iteration order of `datasets`.

    Args:
        datasets: mapping view name -> dataframe of that view. All views must have
            the same number of samples (rows).

    Returns:
        A new dataframe with a default index and the multi index header described above.

    Raises:
        ValueError: if the views do not all have the same number of samples.
    """
    # WARNING: DOES NOT CONTAIN FACILITY FOR KEEPING PRIMARY KEY
    _header_labels = ['views', 'feature_name']

    # 1. create multiindex header (and collect the values of each view)
    _view_name_array = []     # one entry per feature: the view it belongs to
    _feature_name_array = []  # all feature names, kept as plain labels (no dtype cast)
    _views_values = []        # numpy values of each view

    for _view_name, _view_df in datasets.items():
        _columns = _view_df.columns.tolist()
        _view_name_array.extend([_view_name] * len(_columns))
        _feature_name_array.extend(_columns)
        _views_values.append(_view_df.to_numpy())

    _header = pd.MultiIndex.from_arrays([_view_name_array,
                                         _feature_name_array],
                                        names=_header_labels)

    if not _views_values:
        # no view at all: nothing to concatenate
        return pd.DataFrame(columns=_header)

    # 2. create multi index dataframe
    try:
        # single concatenation: works even when views have zero rows
        _concatenated_datasets = np.concatenate(_views_values, axis=1)
    except ValueError as val_err:
        # catching case where nb_samples are different
        raise ValueError(
            'Cannot create multi view dataset: different number of samples '
            'for each modality have been detected. Details: ' + str(val_err)
        ) from val_err

    return pd.DataFrame(_concatenated_datasets, columns=_header)

def create_multi_view_dataframe_from_dataframe(dataframe: pd.DataFrame,
                                               views_features: Dict[str, List[str]],
                                               primary_key: str = None) -> pd.DataFrame:
    """
    Converts a plain dataframe into a multi view (multi index header) dataframe.

    The resulting columns are a `pd.MultiIndex` with levels ('views', 'feature_name').
    When a primary key is given, it is removed from every view and stored once,
    in its own view named 'primary_key'.

    Args:
        dataframe: plain dataframe holding every feature listed in `views_features`.
        views_features: mapping view name -> names of the features of that view.
            A view may or may not list the primary key.
        primary_key: name of the primary key column in `dataframe`, or `None`
            (default) if there is no primary key.

    Returns:
        A new multi view dataframe with a default index.
    """
    _header_labels = ['views', 'feature_name']
    _primary_key_label = 'primary_key'

    if primary_key is not None:
        _key_values = dataframe[primary_key].values  # storing primary key values

    _all_features_names = []
    _new_views_names = []
    for view_name, view_features in views_features.items():
        # all columns of the view, without the primary key (which may be absent
        # from the view: filtering never raises, unlike `list.remove`)
        _features_names = [f for f in view_features
                           if primary_key is None or f != primary_key]

        # one view label per feature of the view
        _new_views_names.extend([view_name] * len(_features_names))
        _all_features_names.extend(_features_names)

    _header = pd.MultiIndex.from_arrays([_new_views_names, _all_features_names],
                                        names=_header_labels)
    multi_view_dataframe = pd.DataFrame(dataframe[_all_features_names].values, columns=_header)

    if primary_key is not None:
        # primary key gets its own view
        multi_view_dataframe[_primary_key_label, primary_key] = _key_values
    return multi_view_dataframe


def join_muti_view_dataset(multi_view_dataset: Union[pd.DataFrame, Dict[str, pd.DataFrame]],
                           primary_key: str = None,
                           as_multi_index: bool = True) -> pd.DataFrame:
    """
    Concatenates a multi view dataset into a single dataframe, by doing a join
    operation (pandas `merge`) of the views, taken in sorted order of their names.

    Args:
        multi_view_dataset: either a multi view dataframe (views are read from the
            first level of its column header) or a dictionary view name -> dataframe.
        primary_key: column used to join the views (passed as `on` to `merge`).
            If `None`, `merge` joins on the columns the views have in common.
        as_multi_index: if True (default), the result has a two-level column header
            ('views', 'feature_name') and the primary key (if any) is stored in its
            own view named 'primary_key'. If False, the plain merged dataframe is
            returned; features colliding with an already joined one are suffixed
            with '.<view name>'.

    Returns:
        The joined dataframe.

    Raises:
        ValueError: if `multi_view_dataset` is neither a dataframe nor a dictionary.
    """
    if isinstance(multi_view_dataset, pd.DataFrame):
        _views_names = sorted(set(multi_view_dataset.columns.get_level_values(0)))  # get views name
    elif isinstance(multi_view_dataset, dict):
        _views_names = sorted(multi_view_dataset.keys())
    else:
        raise ValueError('method can only accept multi view pandas dataframe or dictionary of pandas dataframes')

    joined_dataframe = multi_view_dataset[_views_names[0]]  # retrieve the first view

    # for every non-key feature: (view, name in its view, name in the joined dataframe)
    _features = [(_views_names[0], _f, _f) for _f in joined_dataframe.columns
                 if primary_key is None or _f != primary_key]

    for _view_name in _views_names[1:]:
        _view_df = multi_view_dataset[_view_name]
        _suffix = f'.{_view_name}'
        _already_joined = set(joined_dataframe.columns)

        for _f in _view_df.columns:
            if primary_key is not None and _f == primary_key:
                continue
            # `merge` suffixes right-hand columns that collide with a non-key column of
            # the left-hand side; without explicit key, common columns are the join
            # keys themselves and are never suffixed
            _is_suffixed = primary_key is not None and _f in _already_joined
            _features.append((_view_name, _f, f'{_f}{_suffix}' if _is_suffixed else _f))

        joined_dataframe = joined_dataframe.merge(_view_df,
                                                  on=primary_key,
                                                  suffixes=('', _suffix))

    if not as_multi_index:
        return joined_dataframe

    # convert plain dataframe into multi index dataframe
    # primary key will have its own view
    _header_labels = ['views', 'feature_name']
    _primary_key_label = 'primary_key'

    _header = pd.MultiIndex.from_arrays([[_view for _view, _, _ in _features],
                                         [_name for _, _name, _ in _features]],
                                        names=_header_labels)
    _joined_columns = [_joined_name for _, _, _joined_name in _features]
    multi_index_dataframe = pd.DataFrame(joined_dataframe[_joined_columns].values,
                                         columns=_header)
    if primary_key is not None:
        multi_index_dataframe[_primary_key_label, primary_key] = joined_dataframe[primary_key].values

    return multi_index_dataframe



def search_primary_key(format_file_ref: Dict[str, Dict[str, Any]]) -> Optional[str]: 
    """
    Looks for the primary key declared in a format file reference.

    A feature is considered a primary key when its 'data_format' entry equals
    `DataType.KEY.name`. Each discovery is reported with `print`, as well as the
    case where a view declares more than one key.

    Args:
        format_file_ref: mapping view name -> {feature name -> feature description}.

    Returns:
        The name of the first key of the last view declaring one, or `None` if
        no view declares a key.
    """
    primary_key = None
    for view_name, file_content in format_file_ref.items():
        # view in which a key has been found during the current pass (None: not yet)
        _view_with_key = None
        for feature_name, feature_content in file_content.items():
            if feature_content.get('data_format') != DataType.KEY.name:
                continue
            if _view_with_key is not None:
                # a key was already found in this view
                print(f'error: found 2 primary keys is same view {view_name}')
                continue
            primary_key = feature_name
            _view_with_key = view_name
            print(f'found primary key {primary_key}')
    return primary_key



def select_data_from_format_file_ref(datasets: Dict[str, pd.DataFrame],
                                     format_file: Dict[str, Dict[str, Any]]) -> Dict[str, pd.DataFrame]:
    """
    Returns an updated dataset containing only the features detailed in format_file.

    Views are processed in the order of `format_file`. A view is left out of the
    result (and an error is printed) when it is missing from `datasets`, or when
    at least one of its features is missing from the corresponding dataframe.

    Args:
        datasets: mapping view name -> dataframe of that view.
        format_file: mapping view name -> {feature name -> feature description}.

    Returns:
        Mapping view name -> dataframe restricted to the features of `format_file`,
        in the order in which `format_file` lists them.
    """
    updated_dataset = {}

    for view, view_format in format_file.items():
        if view not in datasets:
            print(f'error!: missing view {view} in dataset')
            continue

        # only extract features from format_file
        _format_file_features = list(view_format.keys())
        _current_dataset_feature = set(datasets[view].columns)

        # features specified in data format file but not found in dataset
        _missing_feature = [feature for feature in _format_file_features
                            if feature not in _current_dataset_feature]
        if _missing_feature:
            print('Error: the following features', *_missing_feature, f'are not found in view: {view}')
            continue

        updated_dataset[view] = datasets[view][_format_file_features]

    return updated_dataset

In [147]:
import logging

class CustomWarning():
    """
    Formats, logs (logger 'mylogger'), prints and returns warning messages.

    `disclosure` controls how much detail is added to the message:
        1: the message alone,
        2: the message prefixed with a severity label,
        3: severity label, message and the affected columns.
    `level` is either 'CRITICAL' or 'WARNING'. With any other level or
    disclosure value, the message is only printed and returned, unchanged.
    """

    # severity label prepended to the message when disclosure is 2 or 3
    _LABELS = {'CRITICAL': 'Critical Warning.', 'WARNING': 'Regular Warning.'}

    def __init__(self, disclosure: int, level: str):
        self.disclosure = disclosure
        self.level = level

    def display(self, message, columns=' '):
        """
        Builds the final message, logs it, prints it and returns it.

        Args:
            message: text of the warning.
            columns: affected columns, only used when disclosure is 3. Either a
                string, or an iterable of column labels (joined with ', ').

        Returns:
            The message as it has been printed.
        """
        logger = logging.getLogger('mylogger')

        _label = self._LABELS.get(self.level)
        if _label is not None and self.disclosure in (1, 2, 3):
            if self.disclosure >= 2:
                message = f'{_label} {message}'
            if self.disclosure == 3:
                if not isinstance(columns, str):
                    columns = ', '.join(str(_column) for _column in columns)
                message = f'{message} Columns affected: {columns}'

            if self.level == 'CRITICAL':
                logger.critical(message)
            else:
                logger.warning(message)

        print(message)
        return message
        
        

In [148]:
def create_dictionary_multi_view_dataset(dataframe: pd.DataFrame,
                                         views_features_mapping: Dict[str, List[str]],
                                         primary_key: str = None) -> Dict[str, pd.DataFrame]:
    """
    Splits a plain dataframe into a dictionary of views (one dataframe per view).

    When a primary key is given, it is removed from every view and stored once,
    as a one-column dataframe under the 'primary_key' entry of the result (the
    dictionary counterpart of the dedicated 'primary_key' view used by the multi
    index dataframes of this file).

    Args:
        dataframe: plain dataframe holding every feature listed in the mapping.
        views_features_mapping: mapping view name -> names of the features of that
            view. A view may or may not list the primary key.
        primary_key: name of the primary key column in `dataframe`, or `None`
            (default) if there is no primary key.

    Returns:
        Mapping view name -> dataframe of the view (default index; values are
        converted to a common numpy dtype within each view).
    """
    _primary_key_label = 'primary_key'

    multi_view_dataset = {}

    for view_name, view_features in views_features_mapping.items():
        # all columns of the view, without the primary key (which may be absent)
        _features_names = [f for f in view_features
                           if primary_key is None or f != primary_key]

        if _features_names:
            # columns are extracted one by one and reshaped into (n_samples, 1)
            # arrays so that they can be concatenated side by side
            _view_values = np.concatenate(
                [dataframe[feature].to_numpy().reshape(-1, 1) for feature in _features_names],
                axis=1)
        else:
            # view without any feature: keep the number of samples, no column
            _view_values = np.empty((len(dataframe), 0))

        multi_view_dataset[view_name] = pd.DataFrame(_view_values, columns=_features_names)

    if primary_key is not None:
        multi_view_dataset[_primary_key_label] = pd.DataFrame(
            {primary_key: dataframe[primary_key].to_numpy()})
    return multi_view_dataset



In [149]:
# Map every view name to the column names of its dataframe in
# `pre_parsed_dataset_to_check` (notebook global defined in another cell).
new_feature_name = { v: list(pre_parsed_dataset_to_check[v].columns) for v in views_names}

print(new_feature_name)

# Split the plain dataframe `df_to_check` into one dataframe per view.
# No primary key is passed here, so `primary_key` keeps its default (None).
create_dictionary_multi_view_dataset(df_to_check,new_feature_name)


{'file1': ['e', 'file1.1', '2', 'file1.time', 'pressure', 'e.1', 'gender', 'blood type', 'pkey'], 'contatct': ['discrete', 'city', 'pkey'], 'file2': ['file2.1', 'file2.time', 'pH', 'pkey']}


{'file1':      e file1.1      2           file1.time  pressure e.1 gender blood type  \
 0   16   False  False  2018-01-03 04:00:00   0.98667  98  WOMAN          A   
 1   96    True   True  2018-01-02 04:00:00  0.996889  35    MAN         AB   
 2    8    True   True  2018-01-01 09:00:00  0.777026  65    MAN          A   
 3    6    True  False  2018-01-04 20:00:00  0.877527  81    MAN         AB   
 4   79    True   True  2018-01-04 09:00:00  0.447389  88  WOMAN          O   
 ..  ..     ...    ...                  ...       ...  ..    ...        ...   
 95  62    True   True  2018-01-02 13:00:00  0.953184  53    MAN         AB   
 96  49   False   True  2018-01-02 21:00:00  0.442283  35    MAN        NaN   
 97  14   False  False  2018-01-02 06:00:00  0.988543  67    MAN          B   
 98  10    True  False  2018-01-02 01:00:00  0.059791  48    MAN          B   
 99  89    True   True  2018-01-03 22:00:00  0.939352  13    MAN          B   
 
                     pkey  
 0   qpqorfhy

In [140]:
# Quick check: two 3x1 column arrays concatenated along axis=1 give one 3x2 array.
np.concatenate([[[1],[2],[3]], [[4],[4],[5]]], axis=1)

array([[1, 4],
       [2, 4],
       [3, 5]])

In [106]:
# Select columns of `df_to_check` from an explicit list of labels.
# NOTE(review): the output lists 'pkey', 'pkey.1' and 'pkey.2' although only 'pkey' is requested - confirm how the columns are labelled.
df_to_check[['2', 'city', 'pkey', 'discrete', 'pH', 'pressure', 'e.1', 'file1.1', 'file2.1', 'file1.time', 'e', 'gender', 'file2.time', 'blood type']]

feature_name,2,city,pkey,pkey.1,pkey.2,discrete,pH,pressure,e.1,file1.1,file2.1,file1.time,e,gender,file2.time,blood type
0,False,Lille,zmixzrgvxrjqxoe sluk,qpqorfhylu gmfjy bdj,kkmjozalfyirgsire ui,64.0,0.023107,0.088082,63,True,True,2018-01-01 00:00:00,98,MAN,2018-01-01 00:00:00,A
1,True,Lille,vrzahnpfluspdcbfnaqt,kkmjozalfyirgsire ui,xkdawggpnuulcewuoyzz,26.0,,0.774788,20,False,False,2018-01-01 01:00:00,83,MAN,2018-01-01 01:00:00,O
2,False,Paris,pnrepvmrxqabdlvisclv,ezfasuuycdda foisjte,khuulhwgwnjggrfoefce,61.0,0.407279,0.514092,2,False,True,2018-01-01 02:00:00,73,WOMAN,2018-01-01 02:00:00,A
3,True,Paris,gwj luzejwdxzsiljxzd,faxiqkt xggzmwzoidbg,xxysdmwwmjsmyhaswfdb,29.0,0.536301,0.832881,70,True,True,2018-01-01 03:00:00,45,WOMAN,2018-01-01 03:00:00,AB
4,False,Lille,jjdvcnofivbqhirxzdyo,znwhlj rwzdutnagwasy,ldejfuij mnbnf wwmms,99.0,0.749443,0.696152,90,True,True,2018-01-01 04:00:00,84,MAN,2018-01-01 04:00:00,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,True,Paris,hrvepmqjn llgbzplshv,zeqhcikzdodus jn qjf,wrmdecb s pohtmrcdj,9.0,,0.295578,41,True,True,2018-01-04 23:00:00,66,WOMAN,2018-01-04 23:00:00,A
96,True,Marseille,wroevwyuamxibzshlxxh,iicthcvfmkajbvr gzir,whmwrpvqmerdpwwzxasf,98.0,0.388389,0.474322,41,False,False,2018-01-05 00:00:00,81,WOMAN,2018-01-05 00:00:00,B
97,True,Lille,ywadcykylymkdtzfctpg,ztjakcsk bhjoksdz lm,pnrepvmrxqabdlvisclv,21.0,0.889067,0.927511,7,True,True,2018-01-05 01:00:00,82,MAN,2018-01-05 01:00:00,B
98,True,Marseille,ruchbfa zwgenxslegrl,sabunaa opt vpulnxj,iicthcvfmkajbvr gzir,42.0,0.402979,0.494798,11,False,True,2018-01-05 02:00:00,18,MAN,2018-01-05 02:00:00,O


In [58]:
df_to_check

feature_name,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,pkey,discrete,city,pkey.1,file2.1,file2.time,pH,pkey.2
0,98,True,False,2018-01-01 00:00:00,0.088082,63,MAN,A,zmixzrgvxrjqxoe sluk,64.0,Lille,qpqorfhylu gmfjy bdj,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
1,83,False,True,2018-01-01 01:00:00,0.774788,20,MAN,O,vrzahnpfluspdcbfnaqt,26.0,Lille,kkmjozalfyirgsire ui,False,2018-01-01 01:00:00,,xkdawggpnuulcewuoyzz
2,73,False,False,2018-01-01 02:00:00,0.514092,2,WOMAN,A,pnrepvmrxqabdlvisclv,61.0,Paris,ezfasuuycdda foisjte,True,2018-01-01 02:00:00,0.407279,khuulhwgwnjggrfoefce
3,45,True,True,2018-01-01 03:00:00,0.832881,70,WOMAN,AB,gwj luzejwdxzsiljxzd,29.0,Paris,faxiqkt xggzmwzoidbg,True,2018-01-01 03:00:00,0.536301,xxysdmwwmjsmyhaswfdb
4,84,True,False,2018-01-01 04:00:00,0.696152,90,MAN,B,jjdvcnofivbqhirxzdyo,99.0,Lille,znwhlj rwzdutnagwasy,True,2018-01-01 04:00:00,0.749443,ldejfuij mnbnf wwmms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,66,True,True,2018-01-04 23:00:00,0.295578,41,WOMAN,A,hrvepmqjn llgbzplshv,9.0,Paris,zeqhcikzdodus jn qjf,True,2018-01-04 23:00:00,,wrmdecb s pohtmrcdj
96,81,False,True,2018-01-05 00:00:00,0.474322,41,WOMAN,B,wroevwyuamxibzshlxxh,98.0,Marseille,iicthcvfmkajbvr gzir,False,2018-01-05 00:00:00,0.388389,whmwrpvqmerdpwwzxasf
97,82,True,True,2018-01-05 01:00:00,0.927511,7,MAN,B,ywadcykylymkdtzfctpg,21.0,Lille,ztjakcsk bhjoksdz lm,True,2018-01-05 01:00:00,0.889067,pnrepvmrxqabdlvisclv
98,18,False,True,2018-01-05 02:00:00,0.494798,11,MAN,O,ruchbfa zwgenxslegrl,42.0,Marseille,sabunaa opt vpulnxj,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir


existing tests

- test keys (should be done before joining)
 |-> uniqueness of values
- test datetime
 |-> are the values parsable as datetimes (for that I am using the `dateutil` Python package)
 
- test variables, including:
 |-> test whether data have missing values while missing values are not allowed
 |-> test correct categories / sub categories
 |-> test lower bound
 |-> test upper bound
 |-> check that categorical variables only contain the defined values
 
- data transformation

 |-> interpolate missing values (if allowed) using a specific method

In [3]:
# Manual check of CustomWarning: disclosure level 2, severity 'WARNING'.
inst = CustomWarning(2, 'WARNING')
# Second argument is the `columns` parameter, which display() only uses at disclosure level 3.
inst.display('hsskks', 'kl')







In [7]:
pre_parsed_dataset_to_check


{'file1':      e  file1.1      2           file1.time  pressure  e.1 gender blood type  \
 0   98     True  False  2018-01-01 00:00:00  0.088082   63    MAN          A   
 1   83    False   True  2018-01-01 01:00:00  0.774788   20    MAN          O   
 2   73    False  False  2018-01-01 02:00:00  0.514092    2  WOMAN          A   
 3   45     True   True  2018-01-01 03:00:00  0.832881   70  WOMAN         AB   
 4   84     True  False  2018-01-01 04:00:00  0.696152   90    MAN          B   
 ..  ..      ...    ...                  ...       ...  ...    ...        ...   
 95  66     True   True  2018-01-04 23:00:00  0.295578   41  WOMAN          A   
 96  81    False   True  2018-01-05 00:00:00  0.474322   41  WOMAN          B   
 97  82     True   True  2018-01-05 01:00:00  0.927511    7    MAN          B   
 98  18    False   True  2018-01-05 02:00:00  0.494798   11    MAN          O   
 99  70    False   True  2018-01-05 03:00:00  0.316395   74  WOMAN         AB   
 
                 

In [9]:
multi_format_file_ref

{'file1': {'e': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': True},
  '1': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': False},
  '2': {'data_format': 'CATEGORICAL',
   'data_type': 'BOOLEAN',
   'values': 'bool',
   'is_missing_values': True},
  'time': {'data_format': 'DATETIME',
   'data_type': 'DATETIME',
   'values': 'object',
   'is_missing_values': False},
  'pressure': {'data_format': 'QUANTITATIVE',
   'data_type': 'CONTINUOUS',
   'values': 'float64',
   'is_missing_values': False},
  'e.1': {'data_format': 'QUANTITATIVE',
   'data_type': 'DISCRETE',
   'values': 'int64',
   'is_missing_values': False},
  'gender': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': False},
  'blood type': {'data_format': 'CATEGORICAL',
   'data_type': 'CHARACTER',
   'values': 'object',
   'is_missing_values': True},
  '

{'file1': ['e',
  'file1.1',
  '2',
  'file1.time',
  'pressure',
  'e.1',
  'gender',
  'blood type',
  'pkey'],
 'contatct': ['discrete', 'city', 'pkey'],
 'file2': ['file2.1', 'file2.time', 'pH', 'pkey']}

In [18]:
multi_df_joined

views,contatct,contatct,file1,file1,file1,file1,file1,file1,file1,file1,file2,file2,file2,primary_key
feature_name,discrete,city,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,file2.1,file2.time,pH,pkey
0,64.0,Lille,16,False,False,2018-01-03 04:00:00,0.98667,98,WOMAN,A,False,2018-01-02 06:00:00,,qpqorfhylu gmfjy bdj
1,26.0,Lille,96,True,True,2018-01-02 04:00:00,0.996889,35,MAN,AB,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
2,61.0,Paris,8,True,True,2018-01-01 09:00:00,0.777026,65,MAN,A,False,2018-01-02 10:00:00,0.587685,ezfasuuycdda foisjte
3,29.0,Paris,6,True,False,2018-01-04 20:00:00,0.877527,81,MAN,AB,True,2018-01-03 12:00:00,0.894073,faxiqkt xggzmwzoidbg
4,99.0,Lille,79,True,True,2018-01-04 09:00:00,0.447389,88,WOMAN,O,True,2018-01-01 10:00:00,0.026831,znwhlj rwzdutnagwasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,62,True,True,2018-01-02 13:00:00,0.953184,53,MAN,AB,False,2018-01-02 05:00:00,0.78856,zeqhcikzdodus jn qjf
96,98.0,Marseille,49,False,True,2018-01-02 21:00:00,0.442283,35,MAN,,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir
97,21.0,Lille,14,False,False,2018-01-02 06:00:00,0.988543,67,MAN,B,False,2018-01-01 12:00:00,,ztjakcsk bhjoksdz lm
98,42.0,Marseille,10,True,False,2018-01-02 01:00:00,0.059791,48,MAN,B,True,2018-01-02 09:00:00,0.651801,sabunaa opt vpulnxj


# for simple datasets

In [121]:
# NOTE(review): `format_file` and `dataset_to_check` are not defined in any visible
# cell; the recorded run of this cell stops with
# "NameError: name 'format_file' is not defined".

# extract views names
views_names = list(format_file.keys())



# look for primary key
primary_key = search_primary_key(format_file)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(dataset_to_check, format_file)
# rename columns names before join operation
# (primary key is not passed here, so it would be renamed like any other shared feature)
pre_parsed_dataset_to_check = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names)
pre_parsed_dataset_to_check

# build the multi view (multi index header) dataframe from the dictionary of views
multi_df_to_check = create_multi_view_dataframe_from_dictionary(pre_parsed_dataset_to_check)
multi_df_to_check

#if primary_key is not None:
# join operation (takes place only if primary key has been specified in format_file)
# NOTE(review): `primary_key` is not forwarded to the join below - confirm this is intended
df_to_check = join_muti_view_dataset(multi_df_to_check)
    
df_to_check

NameError: name 'format_file' is not defined

# for multiple datasets

In [150]:
# Load the inputs of the multi dataset check through the project `utils` module.
# NOTE(review): presumably 'multi_format_file' names the format file reference and
# 'test7' the directory holding the tabular datasets - confirm against `utils`.
multi_format_file_ref = utils.load_format_file_ref('multi_format_file')
multi_dataset_to_check = utils.load_tabular_datasets(r'test7')

directory found


In [151]:
# extract views names
views_names = list(multi_format_file_ref.keys())



# look for primary key
primary_key = search_primary_key(multi_format_file_ref)
print('primary key', primary_key)

# select only features in dataset that will be checked
pre_parsed_dataset_to_check = select_data_from_format_file_ref(multi_dataset_to_check, multi_format_file_ref)
# rename columns names before join operation (primary key is left untouched)
pre_parsed_dataset_to_check = rename_variables_before_joining(pre_parsed_dataset_to_check, views_names,
                                                             primary_key)
pre_parsed_dataset_to_check

#multi_df_to_check = create_multi_view_dataframe(pre_parsed_dataset_to_check)  # remove that
#multi_df_to_check

#if primary_key is not None:
# join operation (takes place only if primary key has been specified in format_file)
# third argument is `as_multi_index=False`: the result is a plain (single header) dataframe
df_joined = join_muti_view_dataset(pre_parsed_dataset_to_check, primary_key, False) # should accept Dict[str, pd.DataFrame]
    
#df_to_check = multi_df_joined.droplevel(0, axis=1)  # remove views from dataset
#df_to_check

df_joined

found primary key pkey
found primary key pkey
found primary key pkey
primary key pkey


Unnamed: 0,discrete,city,pkey,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,file2.1,file2.time,pH
0,64.0,Lille,qpqorfhylu gmfjy bdj,16,False,False,2018-01-03 04:00:00,0.986670,98,WOMAN,A,False,2018-01-02 06:00:00,
1,26.0,Lille,kkmjozalfyirgsire ui,96,True,True,2018-01-02 04:00:00,0.996889,35,MAN,AB,True,2018-01-01 00:00:00,0.023107
2,61.0,Paris,ezfasuuycdda foisjte,8,True,True,2018-01-01 09:00:00,0.777026,65,MAN,A,False,2018-01-02 10:00:00,0.587685
3,29.0,Paris,faxiqkt xggzmwzoidbg,6,True,False,2018-01-04 20:00:00,0.877527,81,MAN,AB,True,2018-01-03 12:00:00,0.894073
4,99.0,Lille,znwhlj rwzdutnagwasy,79,True,True,2018-01-04 09:00:00,0.447389,88,WOMAN,O,True,2018-01-01 10:00:00,0.026831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9.0,Paris,zeqhcikzdodus jn qjf,62,True,True,2018-01-02 13:00:00,0.953184,53,MAN,AB,False,2018-01-02 05:00:00,0.788560
96,98.0,Marseille,iicthcvfmkajbvr gzir,49,False,True,2018-01-02 21:00:00,0.442283,35,MAN,,True,2018-01-05 02:00:00,0.402979
97,21.0,Lille,ztjakcsk bhjoksdz lm,14,False,False,2018-01-02 06:00:00,0.988543,67,MAN,B,False,2018-01-01 12:00:00,
98,42.0,Marseille,sabunaa opt vpulnxj,10,True,False,2018-01-02 01:00:00,0.059791,48,MAN,B,True,2018-01-02 09:00:00,0.651801


In [152]:
pre_parsed_dataset_to_check

{'file1':      e  file1.1      2           file1.time  pressure  e.1 gender blood type  \
 0   98     True  False  2018-01-01 00:00:00  0.088082   63    MAN          A   
 1   83    False   True  2018-01-01 01:00:00  0.774788   20    MAN          O   
 2   73    False  False  2018-01-01 02:00:00  0.514092    2  WOMAN          A   
 3   45     True   True  2018-01-01 03:00:00  0.832881   70  WOMAN         AB   
 4   84     True  False  2018-01-01 04:00:00  0.696152   90    MAN          B   
 ..  ..      ...    ...                  ...       ...  ...    ...        ...   
 95  66     True   True  2018-01-04 23:00:00  0.295578   41  WOMAN          A   
 96  81    False   True  2018-01-05 00:00:00  0.474322   41  WOMAN          B   
 97  82     True   True  2018-01-05 01:00:00  0.927511    7    MAN          B   
 98  18    False   True  2018-01-05 02:00:00  0.494798   11    MAN          O   
 99  70    False   True  2018-01-05 03:00:00  0.316395   74  WOMAN         AB   
 
                 

In [22]:
# Data sanity check

# Map every view name to the current column names of its dataframe.
new_feature_name = { v: list(pre_parsed_dataset_to_check[v].columns) for v in views_names}
new_feature_name

for view in views_names:
    print(view)
    
    # feature names as declared in the format file reference
    feature_names = list(multi_format_file_ref[view].keys())
    # Pair each current column name with its format file entry by position.
    # NOTE(review): this assumes both lists are in the same order and of the same
    # length (zip silently stops at the shorter one) - confirm.
    for n_feature_name, feature_name in zip(new_feature_name[view], feature_names):
        # NOTE(review): the check_* helpers are defined outside the visible cells
        check_variable_compliance(df_to_check[n_feature_name], multi_format_file_ref[view][feature_name])
        data_format = multi_format_file_ref[view][feature_name].get('data_format')
        if data_format == DataType.DATETIME.name:
            # additional check for DATETIME data format
            check_datetime_variable_compliance(df_to_check[n_feature_name])
            
        if data_format == DataType.KEY.name:
            # additional check for KEY data format
            check_key_variable_compliance(df_to_check[n_feature_name])

file1
test 1 passed
is_missing_values True False
test 2 passed
{'data_format': 'QUANTITATIVE', 'data_type': 'DISCRETE', 'values': 'int64', 'is_missing_values': True}
test 3 skipped 
test 4 skipped
categorical value check test skipped
test 1 passed
is_missing_values False False
test 2 passed
{'data_format': 'CATEGORICAL', 'data_type': 'BOOLEAN', 'values': 'bool', 'is_missing_values': False}
test 3 skipped 
test 4 skipped
categorical value check test skipped
test 1 passed
is_missing_values True False
test 2 passed
{'data_format': 'CATEGORICAL', 'data_type': 'BOOLEAN', 'values': 'bool', 'is_missing_values': True}
test 3 skipped 
test 4 skipped
categorical value check test skipped
test 1 passed
is_missing_values False False
test 2 passed
{'data_format': 'DATETIME', 'data_type': 'DATETIME', 'values': 'object', 'is_missing_values': False}
test 3 skipped 
test 4 skipped
categorical value check test skipped
datetime parsed
test 1 passed
is_missing_values False False
test 2 passed
{'data_format

In [17]:
df_to_check

feature_name,e,file1.1,2,file1.time,pressure,e.1,gender,blood type,pkey,discrete,city,pkey.1,file2.1,file2.time,pH,pkey.2
0,98,True,False,2018-01-01 00:00:00,0.088082,63,MAN,A,zmixzrgvxrjqxoe sluk,64.0,Lille,qpqorfhylu gmfjy bdj,True,2018-01-01 00:00:00,0.023107,kkmjozalfyirgsire ui
1,83,False,True,2018-01-01 01:00:00,0.774788,20,MAN,O,vrzahnpfluspdcbfnaqt,26.0,Lille,kkmjozalfyirgsire ui,False,2018-01-01 01:00:00,,xkdawggpnuulcewuoyzz
2,73,False,False,2018-01-01 02:00:00,0.514092,2,WOMAN,A,pnrepvmrxqabdlvisclv,61.0,Paris,ezfasuuycdda foisjte,True,2018-01-01 02:00:00,0.407279,khuulhwgwnjggrfoefce
3,45,True,True,2018-01-01 03:00:00,0.832881,70,WOMAN,AB,gwj luzejwdxzsiljxzd,29.0,Paris,faxiqkt xggzmwzoidbg,True,2018-01-01 03:00:00,0.536301,xxysdmwwmjsmyhaswfdb
4,84,True,False,2018-01-01 04:00:00,0.696152,90,MAN,B,jjdvcnofivbqhirxzdyo,99.0,Lille,znwhlj rwzdutnagwasy,True,2018-01-01 04:00:00,0.749443,ldejfuij mnbnf wwmms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,66,True,True,2018-01-04 23:00:00,0.295578,41,WOMAN,A,hrvepmqjn llgbzplshv,9.0,Paris,zeqhcikzdodus jn qjf,True,2018-01-04 23:00:00,,wrmdecb s pohtmrcdj
96,81,False,True,2018-01-05 00:00:00,0.474322,41,WOMAN,B,wroevwyuamxibzshlxxh,98.0,Marseille,iicthcvfmkajbvr gzir,False,2018-01-05 00:00:00,0.388389,whmwrpvqmerdpwwzxasf
97,82,True,True,2018-01-05 01:00:00,0.927511,7,MAN,B,ywadcykylymkdtzfctpg,21.0,Lille,ztjakcsk bhjoksdz lm,True,2018-01-05 01:00:00,0.889067,pnrepvmrxqabdlvisclv
98,18,False,True,2018-01-05 02:00:00,0.494798,11,MAN,O,ruchbfa zwgenxslegrl,42.0,Marseille,sabunaa opt vpulnxj,True,2018-01-05 02:00:00,0.402979,iicthcvfmkajbvr gzir


In [162]:
from enum import Enum

class MissingDataException(Exception):
    """Exception type attached to the MISSING_DATA_NOT_ALLOWED entry of PreProcessingTests."""
    def __init__(self, message:str):
        # message: description of the error, forwarded unchanged to Exception
        super().__init__(message)

class MinimumSamplesViolatedException(Exception):
    """Error signalling a dataset with fewer samples than the required minimum."""

    def __init__(self, message: str):
        # Forward the message untouched to the built-in exception machinery.
        Exception.__init__(self, message)
        
class WarningType(Enum):
    """Severity levels that a data sanity check warning can carry."""

    # Ordinary warning.
    REGULAR_WARNING = 1
    # Warning flagged as critical.
    CRITICAL_WARNING = 2

class PreProcessingTests(Enum):
    """Catalogue of the issues the data sanity check can report.

    Each member's value is a ``(message, warning_type)`` pair:

    - ``message`` is a %-style template; it is stored and exposed as is
      (this class never fills in the placeholders);
    - ``warning_type`` is either a `WarningType` severity or an exception
      class.

    Calling a member returns an object built from the message template:
    an instance of the member's exception class when it holds one, or a
    built-in `UserWarning` when it holds a `WarningType` severity. Both can
    be passed to ``raise``; the latter can also be given to
    ``warnings.warn``.
    """

    INCORRECT_FORMAT_FILE = (
        "Format File %s is incorrect: cannot parse variable %s",
        WarningType.CRITICAL_WARNING,
    )
    KEY_UNICITY_VIOLATED = (
        "Key Variable %s violated unicity of data",
        WarningType.CRITICAL_WARNING,
    )
    MISSING_DATA_NOT_ALLOWED = (
        "Variable %s must not have missing data, but some were found",
        MissingDataException,
    )
    MISSING_DATA_ALLOWED = (
        "Missing data found in variable %s",
        WarningType.REGULAR_WARNING,
    )
    INCORRECT_STRUCTURE_DATA_TYPE = (
        "Data Type %s has an incorrect structure",
        WarningType.CRITICAL_WARNING,
    )
    INCORRECT_DATA_TYPE = (
        "Incorrect Data Type for variable %s: Excpected %s but found %s",
        WarningType.REGULAR_WARNING,
    )
    INCORRECT_DATETIME_DATA = (
        "Variable %s has been defined as a DATETIME variable, but samples are not parsable as date",
        WarningType.CRITICAL_WARNING,
    )

    OUTLIER_DETECTION_LOWER_BOUND = (
        "Detected outliers for Varaiable %s: samples violate lower bound %s",
        WarningType.CRITICAL_WARNING,
    )

    OUTLIER_DETECTION_UPPER_BOUND = (
        "Detected outliers for Varaiable %s: samples violate upper bound %s",
        WarningType.CRITICAL_WARNING,
    )

    INCORRECT_VALUES_CATEGORICAL_DATA = (
        "Found at least one sample with incorrect label in Categroical Data %s. Expected data are %s",
        WarningType.CRITICAL_WARNING,
    )

    N_MISSING_DATA_ABOVE_THRESHOLD = (
        "Found too many missing data in variable %s, threshold is set at %s",
        WarningType.CRITICAL_WARNING,
    )

    N_SAMPLES_BELOW_THRESHOLD = (
        "Number of samples contained in dataset %s is below threshold (expected at least %d samples, found %d samples)",
        MinimumSamplesViolatedException,
    )

    def __init__(self, message: str, warning_type: Union[WarningType, type]):
        # Enum unpacks each member's tuple value into these two arguments.
        self._message = message
        self._warning_type = warning_type

    @property
    def error_message(self) -> str:
        """Message template of the member (placeholders left unformatted)."""
        return self._message

    @property
    def warning_type(self) -> Union[WarningType, type]:
        """`WarningType` severity or exception class attached to the member."""
        return self._warning_type

    def __call__(self, disclosure: Optional[int] = None) -> Exception:
        """Build the raisable object that carries this member's message.

        Args:
            disclosure: currently unused.

        Returns:
            An instance of the member's exception class when it holds one,
            otherwise a `UserWarning`; in both cases the only argument is
            the unformatted message template.
        """
        kind = self._warning_type
        if isinstance(kind, type) and issubclass(kind, BaseException):
            return kind(self._message)
        # A WarningType member is a plain enum value and cannot be called
        # like a class, so the message is wrapped in a built-in warning.
        return UserWarning(self._message)

def raise_warnings(warning: Enum, warning_disclosure: int) -> None:
    """Report `warning` according to the kind of its `warning_type`.

    - When `warning.warning_type` is an enum member, it is printed.
    - When `warning.warning_type` is an exception class, `warning` is called
      and the object it returns is raised.
    - Any other kind of `warning_type` is ignored.

    Args:
        warning: object exposing a `warning_type` attribute; it must be
            callable when `warning_type` is an exception class.
        warning_disclosure: currently unused.

    Raises:
        Exception: whatever `warning()` returns, when `warning_type` is an
            exception class.
    """
    kind = warning.warning_type
    # `kind` is a member (an instance of an Enum subclass) or an exception
    # subclass, so instance/subclass checks are needed: identity checks
    # against the `Enum` / `Exception` base classes never match.
    if isinstance(kind, Enum):
        print(kind)
    elif isinstance(kind, type) and issubclass(kind, Exception):
        raise warning()
    

In [164]:
# Manual check: calling the member builds its exception instance, which is
# raised here; the traceback below shows the message template unformatted.
raise PreProcessingTests.N_SAMPLES_BELOW_THRESHOLD()

MinimumSamplesViolatedException: Number of samples contained in dataset %s is below threshold (expected at least %d samples, found %d samples)

In [21]:
def run_data_sanity_check(data_frame: pd.DataFrame):
    """Placeholder entry point for the sanity check of a whole data frame.

    Not implemented yet: the argument is ignored and None is returned.

    Args:

     - data_frame: data frame meant to be checked.
    """
    return None

def check_key_variable_compliance(column: pd.Series,
                                  col_name: Optional[str] = None,
                                  warning=None) -> bool:
    """Perform the data sanity checks for a variable of type `KEY`.

    Two checks are run and the outcome of each one is printed:

    1. every value of `column` is unique;
    2. `column` contains no missing data.

    Failures should be treated as critical warnings.

    Args:
        column: values of the key variable.
        col_name: name of the variable (currently unused).
        warning: currently unused.

    Returns:
        True when both checks passed, False otherwise.
    """
    is_test_passed = True

    # 1. check unicity of values in column
    # presumably `number=True` makes utils.unique return the count of
    # distinct values -- confirm against utils
    n_unique_samples = utils.unique(column, number=True)
    n_samples = column.shape[0]

    if n_unique_samples != n_samples:
        is_test_passed = False
        print(f'error: keys not unique ! b of samples= {n_samples} and unique values {n_unique_samples}')
    else:
        print('test 1 passed')

    # 2. check for missing data (a key should not contain any)
    if utils.check_missing_data(column):
        is_test_passed = False
        print('error: missing data found in key')
    else:
        print('test 2 passed')

    return is_test_passed


def check_datetime_variable_compliance(column: pd.Series):
    """Run the additional data sanity check for a datetime variable.

    Checks that every non-missing value of `column` is accepted by
    `utils.is_datetime` and prints the outcome. Nothing is returned.

    Args:
        column: values of the datetime variable.
    """
    # test 1. check if datetime is parsable

    # Missing values are dropped first so that only observed values are
    # tested for parsability.
    column_without_nan = column.dropna()
    are_datetime_parsables = np.all(column_without_nan.apply(utils.is_datetime))
    if not are_datetime_parsables:
        print('Warning: at least one variable is not a datetime')
    else:
        print('datetime parsed')

def test_missing_entries_format_file_ref(format_file) -> bool:
    """Placeholder for the test of whether the format file ref is parsable.

    Not implemented yet: `format_file` is ignored and None is returned.
    """
    return None
    
def test_correct_sub_type() -> bool:
    """Placeholder without implementation yet; returns None."""
    return None
    
def check_variable_compliance(column: pd.Series,
                              format_file_ref: Dict[str, Any],
                              col_name: Optional[str] = None,
                              warning=None) -> Tuple[bool, bool]:
    """Perform a data sanity check on a variable, following the instructions
    given in `format_file_ref`.

    Five checks are run in sequence and the outcome of each one is printed:

    1. the data type named by the `data_format` / `data_type` entries can be
       resolved by `utils.find_data_type`;
    2. missing values are present only if the `is_missing_values` entry
       allows them (skipped when the entry is absent);
    3. non-missing values are >= the `lower_bound` entry (skipped when
       absent);
    4. non-missing values are <= the `upper_bound` entry (skipped when
       absent);
    5. non-missing values all belong to the `categorical_values` entry
       (skipped when absent).

    NOTE(review): the signature advertises a tuple of booleans, but no value
    is returned yet; results are only printed.

    Args:
        column: values of the variable to check.
        format_file_ref: instructions for this variable; the keys read are
            listed above.
        col_name: name of the variable (currently unused).
        warning: currently unused.
    """
    data_format_name = format_file_ref.get('data_format')
    data_type_name = format_file_ref.get('data_type')
    # Bound checks only look at observed (non-missing) values.
    column_without_nan = column.dropna()

    if data_format_name is None:
        print(f'critical wraning: data fromat {data_format_name} not understood')

    # 1. check data sub type
    try:
        data_type = utils.find_data_type(data_format_name, data_type_name)
    except ValueError:
        data_type = None
        print('Critical warning: data format and data type mismatch')

    # The sub type can only be inspected when it was resolved; the mismatch
    # has already been reported otherwise.
    if data_type is not None:
        # NOTE(review): this only tests that `data_type.value` holds at
        # least one truthy entry; `column.dtype` is never compared with
        # it -- confirm the intended check.
        does_column_have_correct_data_type = any(data_type.value)
        if not does_column_have_correct_data_type:
            print(f'error: data type {column.dtype} doesnot have the data type specified in format reference file')
        else:
            print('test 1 passed')

    # 2. check if missing values are allowed
    is_missing_data = utils.check_missing_data(column)
    is_missing_values_authorized = format_file_ref.get('is_missing_values', 'test_skipped')
    print('is_missing_values', is_missing_values_authorized, is_missing_data)
    if is_missing_values_authorized == 'test_skipped':
        print('missing_value test skipped')
    elif not is_missing_values_authorized and is_missing_data:
        print('Error found missing data but missing data are not authorized')
    else:
        print('test 2 passed')

    # 3. check lower bound
    print(format_file_ref)
    lower_bound = format_file_ref.get('lower_bound')
    if lower_bound is not None:
        # should work for both numerical and datetime data sets
        is_lower_bound_correct = np.all(column_without_nan >= lower_bound)
        if not is_lower_bound_correct:
            print('Warning: found some data below lower bound')
        else:
            print('test 3 passed')
    else:
        print('test 3 skipped ')

    # 4. check upper bound
    upper_bound = format_file_ref.get('upper_bound')
    if upper_bound is not None:
        # should work for both numerical and datetime data sets
        is_upper_bound_correct = np.all(column_without_nan <= upper_bound)
        if not is_upper_bound_correct:
            print('Warning: found some data  above upper bound')
        else:
            print('test 4 passed')
    else:
        print('test 4 skipped')

    # 5. check if possible_values are contained in variable
    categorical_values = format_file_ref.get('categorical_values')
    if categorical_values is None:
        print('categorical value check test skipped')
    else:
        _is_error_found = False
        for val in utils.unique(column):
            # pd.isna accepts any scalar (np.isnan raises TypeError on
            # strings); missing values are not reported by this check.
            if val not in categorical_values and not pd.isna(val):
                print(f'critical warning: {val} not in possible values')
                _is_error_found = True
        if not _is_error_found:
            print('test 5: passed')

In [None]:
def check_missing_data(column: pd.Series) -> bool:
    """Tell whether `column` contains at least one missing value.

    Args:
        column: values to inspect.

    Returns:
        True if any entry of `column` is missing (as reported by
        `Series.isna`), False otherwise.
    """
    # `.any()` yields a numpy boolean; convert it so that the function
    # returns the builtin `bool` announced by its signature.
    return bool(column.isna().any())

# NOTE(review): `column` is a required argument of check_missing_data;
# called without it, this line raises TypeError.
check_missing_data()