In [18]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

#Loading data from csv files
#Excel data files
def load_data(data_set):
    try:
        excel = ['.xls','.xlsx']
        if '.csv' in data_set:
            d1 = pd.read_csv(data_set,delimiter=',',header=0)
            print('CSV file loaded')
            print(d1.head())
        elif any(x in data_set for x in excel): 
            d1 = pd.read_excel(data_set,index_col=None)
            print('Excel file loaded')
            print(d1.head())
        return d1
    except Exception as err:
        print(err)
        print('Error encountered in loading data file')

    
#Missing values 

#Impute missing values with mean for Qualitative Data

def impute_missing_values_mean(data,listforimputation):
    try:
        if type(data) == pd.core.frame.DataFrame:
            for col in data.columns:
                if col in listforimputation:
                    if (data[col].isnull().sum()>0):
                        if any(data[col].dtype in x2 for x2 in  [x.value for x in QuantitativeDataType]):
                            data[col].fillna(value=data[col].mean(),inplace=True)
        else:
            data = data.fillna(data.mean())
        return data
    except Exception as err:
        print(err)
        print('Error encountered in loading data file')

 
 #Categorical Data
def impute_missing_values_mode(data,listforimputation):
    try:
        if type(data) == pd.core.frame.DataFrame:
            for col in data.columns:
                if col in listforimputation:
                    if (data[col].isnull().sum()>0):
                        if any(data[col].dtype in x2 for x2 in  [x.value for x in CategoricalDataType]):
                            print(col)
                            data[col].fillna(value=data[col].mode()[0],inplace=True)
                        
        else:
            data = data.fillna(data.value_counts().index[0])
            
        return data
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - mode')
       


    # Impute missing values with KNN            

def impute_missing_values_knn(data,k=2):
    try:
        if type(data) == pd.core.frame.DataFrame:
            missing_cols = data.columns[data.isnull().any()]
            if len(missing_cols)>0:
               
                data = data.apply(lambda series: pd.Series(
                LabelEncoder().fit_transform(series[series.notnull()]),
                index=series[series.notnull()].index
                ))
                
                imputer =KNNImputer(n_neighbors=k)
                data = pd.DataFrame(imputer.fit_transform(data),columns=data.columns) 
        else:
            imputer =KNNImputer(n_neighbors=k)
            data =pd.DataFrame( (imputer.fit_transform(np.array(data).reshape(1,-1))).reshape(-1,1),columns=[data.name])                    

        return data  
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - knn')

#Impute missing values with Interpolate
def impute_missing_values_interpolate(data):
    try:
        data_filled = data.interpolate()
            
        return data_filled
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - interpolate')

In [2]:
import numpy as np
import enum
from enum import Enum, auto
import warnings
import logging

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    CONTINUOUS = [float, np.float64]
    DISCRETE = [int]

class CategoricalDataType(Enum):
    BOOLEAN = [bool]
    NUMERICAL = [float, int, np.float64, np.int64]
    CHARACTER = [str, object]
    
class DataType(Enum):
    """

    """
    QUANTITATIVE = [QuantitativeDataType.CONTINUOUS,
                   QuantitativeDataType.DISCRETE]
    CATEGORICAL = [CategoricalDataType.BOOLEAN,
                  CategoricalDataType.NUMERICAL,
                  CategoricalDataType.CHARACTER]
    #MISSING = 'MISSING'
    DATETIME = 'DATETIME'
    UNKNOWN = 'UNKNOWN'
    
class CustomWarning:
    
        def __init__(self, disclosure,level):
            self.level = level
            self.disclosure = disclosure

            
        def display(self,message, columns = ' ' ):
            
            logger = logging.getLogger('mylogger')
            logger.setLevel(self.level)

            handler = logging.FileHandler('mylog.log')
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
            
            print('Display disclosure:',self.disclosure)
            print('Display level',self.level)
            
            disclosure = self.disclosure
            level = self.level
            
            if disclosure == 1:
                if level == 'CRITICAL':
                    logger.critical(message)
                elif level == 'WARNING':
                    logger.warning(message)
                print("disclosure level 1",message)
            elif disclosure == 2:
                print('Level 2')
                if level == 'CRITICAL':
                    print("disclosure level 2",message)
                    message = 'Critical Warning.' + message
                    logger.critical(message)
                elif level == 'WARNING':
                    message = 'Regular Warning.' + message
                    logger.warning(message)
                print("disclosure level 2",message)
            elif self.disclosure == 3:
                if self.level == 'CRITICAL':
                    message = 'Critical Warning. ' + message + 'Columns affected :' + columns
                    logger.critical(message)
                elif self.level == 'WARNING':
                    message = 'Regular Warning. ' + message  + 'Columns affected :' + columns
                    logger.warning(message)
                print("disclosure level 3", message)
            warnings.warn(message) 
            print(message)
            return message


In [3]:

import math #Checking if missing values exceed threshold limit(50%)
def check_missing_values_limit(data,percent,critical_warning : CustomWarning):
    
    MIN_NB_MISSING_DATA = math.ceil((percent/100)*data.shape[0])
    report = {}
    report_details = []
    for col in data.columns:
        elem_report ={}
       
        elem_report['feature_name']=col
        
        if (data[col].isnull().sum()>MIN_NB_MISSING_DATA):
            elem_report['success']='No'
            message = critical_warning.display(f'Missing value exceeds threshold limit {MIN_NB_MISSING_DATA}',col) 
            elem_report['msg']= message
            
        else:
            elem_report['success']='Yes'
            elem_report['msg']='Test passed'
        
        report_details.append(elem_report)
        print(report_details)
            
        report['check_missing_values_limit'] = report_details

    return report

In [22]:
cwarn = CustomWarning(2,'CRITICAL')
check_missing_values_limit(d1,0.1,cwarn)

In [5]:
#Checking samples limit

def check_number_of_samples(data,MIN_NB_SAMPLES,critical_warning:CustomWarning):
    
    sample_count = data.shape[0]
    report = {}
    elem_report ={}
    report_details = []
    elem_report['feature_name']='ALL'    
    
    if sample_count> MIN_NB_SAMPLES:
        elem_report['success']='No'
        
        message = critical_warning.display(f'Samples count exceeds the threshold limit {MIN_NB_SAMPLES}')
        elem_report['msg']= message
    else:
        elem_report['success']='Yes'
        elem_report['msg']='Test passed'
        
    report_details.append(elem_report)
    print(report_details)
        
    report['check_number_of_samples'] = report_details

    return report

In [6]:
def report_warnings(data):
    
    try:
        report_warnings ={}
        
        disclosure = input('Enter level of disclosure needed for warnings: ')
        critical_warning = CustomWarning(disclosure,'CRITICAL')
        regular_warning = CustomWarning(disclosure,'WARNING')
        
        missing_limit_percent = float(input('Enter percentage for missing value limit :'))   
        report1 = check_missing_values_limit(data,missing_limit_percent,critical_warning)
        print('Test1 done')
        samples_limit = int(input('Enter max samples limit needed :'))
        report2 = check_number_of_samples(data,samples_limit,critical_warning)
        print('Test2 done')
        
        report2.update(report1)
        print(report2)
        
    except Exception as err:
        print(err)
        print('Error encountered in generating report for warnings')
    
    return report2
    

In [7]:
csv_data1 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\pseudo_adni_mod.csv')

[Errno 2] No such file or directory: 'C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\pseudo_adni_mod.csv'
Error encountered in loading data file


In [8]:
d1 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\Exceltest.xlsx')

[Errno 2] No such file or directory: 'C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\Exceltest.xlsx'
Error encountered in loading data file


In [15]:
d1 = load_data('/mnt/c/Users/p.santosha.dasari/Desktop/Feature164/Exceltest.xlsx')

Excel file loaded
   ID   Age Eligibility
0    1   45           Y
1    2   45           Y
2    3   33           N
3    4   54           Y
4    5   45           Y


In [19]:
d1

Unnamed: 0,ID,Age,Eligibility
0,1,45,Y
1,2,45,Y
2,3,33,N
3,4,54,Y
4,5,45,Y
5,6,54,
6,7,34,N
7,8,54,
8,9,45,
9,10,44,Y


In [20]:
imp = ['Eligibility']
d2 = impute_missing_values_knn(d1)

In [21]:
d2

Unnamed: 0,ID,Age,Eligibility
0,0.0,3.0,1.0
1,1.0,3.0,1.0
2,2.0,0.0,0.0
3,3.0,4.0,1.0
4,4.0,3.0,1.0
5,5.0,4.0,1.0
6,6.0,1.0,0.0
7,7.0,4.0,1.0
8,8.0,3.0,0.5
9,9.0,2.0,1.0


In [28]:
rep1 = report_warnings(d1)

Enter percentage for missing value limit :1




Disclosure level : 2
[{'feature_name': 'ID ', 'success': 'Yes', 'msg': 'Test passed'}]
[{'feature_name': 'ID ', 'success': 'Yes', 'msg': 'Test passed'}, {'feature_name': 'Age', 'success': 'Yes', 'msg': 'Test passed'}]
Display disclosure: 2
Display level CRITICAL
Missing value exceeds threshold limit 1
[{'feature_name': 'ID ', 'success': 'Yes', 'msg': 'Test passed'}, {'feature_name': 'Age', 'success': 'Yes', 'msg': 'Test passed'}, {'feature_name': 'Eligibility', 'success': 'No', 'msg': 'Missing value exceeds threshold limit 1'}]
Test1 done
Enter max samples limit needed :2
Display disclosure: 2
Display level CRITICAL
Samples count exceeds the threshold limit 2
[{'feature_name': 'ALL', 'success': 'No', 'msg': 'Samples count exceeds the threshold limit 2'}]
Test2 done
{'check_number_of_samples': [{'feature_name': 'ALL', 'success': 'No', 'msg': 'Samples count exceeds the threshold limit 2'}], 'check_missing_values_limit': [{'feature_name': 'ID ', 'success': 'Yes', 'msg': 'Test passed'}, {'



In [4]:
rep1