In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

#Load data from CSV or Excel files
def load_data(data_set):
    """Load a CSV or Excel file into a pandas DataFrame.

    The format is chosen from the file name (case-insensitive): a name
    containing ``.csv`` is read with ``pd.read_csv`` (comma-delimited, first
    row as header); a name containing ``.xls``/``.xlsx`` is read with
    ``pd.read_excel``.  After a successful load a confirmation line and the
    first rows are printed.

    Parameters
    ----------
    data_set : str or path-like
        Location of the file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file type is not supported or
        reading fails (the error is printed, not raised).
    """
    excel = ['.xls', '.xlsx']
    try:
        # Inspect only the final path component, lower-cased, so directory
        # names such as 'exports.csv/' cannot pick the parser and '.CSV'
        # is still recognised.
        file_name = str(data_set).replace('\\', '/').rsplit('/', 1)[-1].lower()
        if '.csv' in file_name:
            d1 = pd.read_csv(data_set, delimiter=',', header=0)
            print('CSV file loaded')
        elif any(ext in file_name for ext in excel):
            d1 = pd.read_excel(data_set, index_col=None)
            print('Excel file loaded')
        else:
            # Previously this case surfaced as an UnboundLocalError on `d1`.
            raise ValueError(f'Unsupported file type: {data_set!r}')
        print(d1.head())
        return d1
    except Exception as err:
        # Best-effort loader: report the problem and hand back None.
        print(err)
        print('Error encountered in loading data file')
        return None

    
#Missing values 

#Impute missing values with mean for Quantitative Data

def impute_missing_values_mean(data):
    """Fill missing values with the mean.

    For a DataFrame, each column that contains missing values and whose
    dtype appears in the value list of any ``QuantitativeDataType`` member is
    filled with that column's mean; all other columns are left untouched.
    Any other input (e.g. a Series) is filled with ``data.mean()``.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Data to impute.  A DataFrame is updated column by column and the
        same object is returned.

    Returns
    -------
    pandas.DataFrame, pandas.Series or None
        The imputed data, or ``None`` if an error occurred (it is printed).
    """
    try:
        if isinstance(data, pd.DataFrame):
            for col in data.columns:
                column = data[col]
                if not column.isnull().any():
                    continue
                if any(column.dtype in member.value for member in QuantitativeDataType):
                    # Assign the filled column back: `data[col].fillna(...,
                    # inplace=True)` is chained assignment and does not update
                    # the parent frame under pandas Copy-on-Write.
                    data[col] = column.fillna(column.mean())
        else:
            data = data.fillna(data.mean())
        return data
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - mean')
        return None

 
#Impute missing values with mode for Categorical Data
def impute_missing_values_mode(data):
    """Fill missing values with the most frequent value.

    For a DataFrame, each column that contains missing values and whose
    dtype appears in the value list of any ``CategoricalDataType`` member is
    filled with the first entry of ``column.mode()``.  Columns that are
    entirely missing have no mode and are left unchanged.  Any other input
    (e.g. a Series) is filled with its most frequent value according to
    ``value_counts()``.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Data to impute.  A DataFrame is updated column by column and the
        same object is returned.

    Returns
    -------
    pandas.DataFrame, pandas.Series or None
        The imputed data, or ``None`` if an error occurred (it is printed).
    """
    try:
        if isinstance(data, pd.DataFrame):
            for col in data.columns:
                column = data[col]
                if not column.isnull().any():
                    continue
                if not any(column.dtype in member.value for member in CategoricalDataType):
                    continue
                modes = column.mode()
                if modes.empty:
                    # Every value is missing: nothing sensible to fill with.
                    continue
                # Assign back instead of the chained `fillna(inplace=True)`,
                # which does not reach the parent frame under Copy-on-Write.
                data[col] = column.fillna(modes.iloc[0])
        else:
            counts = data.value_counts()
            if len(counts) > 0:
                data = data.fillna(counts.index[0])
        return data
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - mode')
        return None
       


    # Impute missing values with KNN            

def impute_missing_values_knn(data,k=2):
    """Fill missing values with scikit-learn's ``KNNImputer``.

    A DataFrame is imputed only when at least one column has missing values;
    otherwise it is returned unchanged.  Any other input is treated as a
    single column (a Series) and is returned as a one-column DataFrame named
    after ``data.name``.  The original index is preserved in both cases.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Data to impute.  NOTE(review): KNNImputer is documented for numeric
        input, so non-numeric columns presumably end up in the error path --
        confirm against the datasets used.
    k : int, default 2
        Number of neighbours passed to ``KNNImputer(n_neighbors=k)``.

    Returns
    -------
    pandas.DataFrame or None
        The imputed data, or ``None`` if an error occurred (it is printed).
    """
    try:
        if isinstance(data, pd.DataFrame):
            if data.isnull().any().any():
                imputer = KNNImputer(n_neighbors=k)
                # Keep the caller's index; rebuilding the frame without it
                # silently replaced it with a fresh RangeIndex.
                data = pd.DataFrame(
                    imputer.fit_transform(data),
                    columns=data.columns,
                    index=data.index,
                )
        else:
            imputer = KNNImputer(n_neighbors=k)
            # One feature, N samples.  The previous reshape(1, -1) presented
            # the series as a single sample with N features, so the imputer
            # never saw more than one row.
            samples = data.to_numpy().reshape(-1, 1)
            data = pd.DataFrame(
                imputer.fit_transform(samples),
                columns=[data.name],
                index=data.index,
            )
        return data
    except Exception as err:
        print(err)
        print('Error encountered in imputing missing values - knn')
        return None

#Impute missing values with Interpolate
def impute_missing_values_interpolate(data):
    """Return ``data.interpolate()`` using pandas' default settings.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object whose gaps should be interpolated.

    Returns
    -------
    The interpolated object, or ``None`` when the call raises; in that case
    the exception and a short notice are printed instead.
    """
    try:
        return data.interpolate()
    except Exception as failure:
        for line in (failure, 'Error encountered in imputing missing values - interpolate'):
            print(line)

In [1]:
import numpy as np
import enum
from enum import Enum, auto
import logging

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    """Quantitative column kinds.

    Each member's value is a list of types; ``impute_missing_values_mean``
    checks whether a column's dtype is contained in any of these lists.
    """
    # Floating-point types.
    CONTINUOUS = [float, np.float64]
    # Integer type.
    # NOTE(review): np.int64 is listed in CategoricalDataType.NUMERICAL but
    # not here -- confirm whether that difference is intentional.
    DISCRETE = [int]

class CategoricalDataType(Enum):
    """Categorical column kinds.

    Each member's value is a list of types; ``impute_missing_values_mode``
    checks whether a column's dtype is contained in any of these lists.
    """
    # Boolean type.
    BOOLEAN = [bool]
    # Numeric types that may also be treated as categories.
    NUMERICAL = [float, int, np.float64, np.int64]
    # String / generic object types.
    CHARACTER = [str, object]
    
class DataType(Enum):
    """Top-level grouping of data kinds.

    QUANTITATIVE and CATEGORICAL hold lists of members from
    ``QuantitativeDataType`` and ``CategoricalDataType`` respectively;
    DATETIME and UNKNOWN are plain string markers.
    """
    QUANTITATIVE = [QuantitativeDataType.CONTINUOUS,
                   QuantitativeDataType.DISCRETE]
    CATEGORICAL = [CategoricalDataType.BOOLEAN,
                  CategoricalDataType.NUMERICAL,
                  CategoricalDataType.CHARACTER]
    #MISSING = 'MISSING'
    DATETIME = 'DATETIME'
    UNKNOWN = 'UNKNOWN'
    
class CustomWarning:
    """Write warning messages to a log file at a chosen level of detail.

    Parameters
    ----------
    disclosure : int
        Amount of detail in the logged text:
        1 -> the message only;
        2 -> the message prefixed with a severity label;
        3 -> label, message and the affected column(s).
        Any other value logs nothing.
    level : str
        ``'CRITICAL'`` or ``'WARNING'``.  It is also passed to
        ``Logger.setLevel``; other values log nothing.
    log_file : str, default 'mylog.log'
        File the messages are appended to.
    logger_name : str, default 'mylogger'
        Name of the ``logging`` logger to use.
    """

    LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    def __init__(self, disclosure, level, log_file='mylog.log', logger_name='mylogger'):
        self.level = level
        self.disclosure = disclosure
        self.log_file = log_file
        self.logger_name = logger_name

    def _get_logger(self):
        """Return the configured logger, attaching the file handler only once."""
        import os

        logger = logging.getLogger(self.logger_name)
        logger.setLevel(self.level)

        # Loggers are process-wide singletons: adding a new FileHandler on
        # every call made each message appear once per earlier call and left
        # a file handle open each time.
        target = os.path.abspath(self.log_file)
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(self.log_file)
            handler.setFormatter(logging.Formatter(self.LOG_FORMAT))
            logger.addHandler(handler)
        return logger

    def display(self, message, columns=' '):
        """Log ``message`` according to the instance's level and disclosure.

        Parameters
        ----------
        message : str
            Text to log.
        columns : object, default ' '
            Affected column label(s); only used when ``disclosure == 3``.
            Converted with ``str()`` so non-string labels are accepted.
        """
        logger = self._get_logger()

        # Level name -> (logging method, label).  The label strings are kept
        # verbatim so existing log lines stay identical.
        emitters = {
            'CRITICAL': (logger.critical, 'Critical Warning.'),
            'WARNING': (logger.warning, 'Regular Warning.'),
        }
        if self.level not in emitters:
            return
        emit, label = emitters[self.level]

        if self.disclosure == 1:
            emit(message)
        elif self.disclosure == 2:
            emit(label + message)
        elif self.disclosure == 3:
            emit(label + ' ' + message + 'Columns affected :' + str(columns))
  

In [None]:
# Load the pseudo-ADNI CSV into csv_data1 with whichever load_data definition
# was executed last.
# NOTE(review): absolute, user-specific Windows path -- presumably only
# resolves on the author's machine; consider a configurable data directory.
csv_data1 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\pseudo_adni_mod.csv')

In [None]:
import pandas as pd
def load_data(data_set):
    """Load a semicolon-delimited CSV or an Excel file into a DataFrame.

    The format is chosen from the file name (case-insensitive): a name
    containing ``.csv`` is read with ``pd.read_csv(delimiter=';')``; a name
    containing ``.xls``/``.xlsx`` is read with ``pd.read_excel`` using the
    first column as the index.  After a successful load a confirmation line
    and the first rows are printed.

    NOTE(review): the notebook defines ``load_data`` twice; once this cell
    has run it replaces the comma-delimited version defined earlier.

    Parameters
    ----------
    data_set : str or path-like
        Location of the file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file type is not supported or
        reading fails (the error is printed, not raised).
    """
    excel = ['.xls', '.xlsx']
    try:
        # Inspect only the final path component, lower-cased, so directory
        # names cannot pick the parser and '.CSV' is still recognised.
        file_name = str(data_set).replace('\\', '/').rsplit('/', 1)[-1].lower()
        if '.csv' in file_name:
            d1 = pd.read_csv(data_set, delimiter=';', header=0)
            print('CSV file loaded')
        elif any(ext in file_name for ext in excel):
            d1 = pd.read_excel(data_set, index_col=0)
            print('Excel file loaded')
        else:
            # Previously this case surfaced as an UnboundLocalError on `d1`.
            raise ValueError(f'Unsupported file type: {data_set!r}')
        print(d1.head())
        return d1
    except Exception as err:
        # A bare `except:` also trapped KeyboardInterrupt/SystemExit and hid
        # the cause; catch ordinary errors only and show what went wrong.
        print(err)
        print('Exception encountered in loading data file')
        return None

In [None]:
# Count the missing values in each column of csv_data1.
csv_data1.isnull().sum()

In [None]:
# Impute the gaps in csv_data1 with the KNN imputer.  The helper is named
# impute_missing_values_knn; the previous spelling raised NameError.
data_filled = impute_missing_values_knn(csv_data1)

In [None]:
# Count the missing values left in each column of data_filled.
data_filled.isnull().sum()

In [None]:
import math #Checking if missing values exceed threshold limit(50%)
def check_missing_values_limit(data,percent,disclosure):
    """Log a critical warning for every column with too many missing values.

    The limit is ``ceil(percent / 100 * number_of_rows)``; a column is
    reported when its count of missing values is strictly greater than that
    limit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    percent : int or float
        Allowed share of missing values per column, in percent.
    disclosure : int
        Detail level forwarded to ``CustomWarning``.

    Returns
    -------
    list
        Labels of the columns that exceeded the limit (empty if none).
    """
    max_missing = math.ceil((percent / 100) * data.shape[0])
    null_counts = data.isnull().sum()
    offending = [col for col, missing in null_counts.items() if missing > max_missing]

    if offending:
        # One warning object is enough for all columns.
        critical_warning = CustomWarning(disclosure, 'CRITICAL')
        for col in offending:
            # Column labels may be non-strings (e.g. integers); the warning
            # text is built by string concatenation, so pass text.
            critical_warning.display(
                f'Missing value exceeds threshold limit {max_missing}', str(col)
            )
    return offending
            
          

In [None]:
#Checking samples limit

def check_number_of_samples(data,MIN_NB_SAMPLES,disclosure):
    """Log a critical warning when ``data`` has more rows than ``MIN_NB_SAMPLES``.

    Parameters
    ----------
    data : object exposing ``shape``
        Only ``data.shape[0]`` (the row count) is read.
    MIN_NB_SAMPLES : int
        Threshold the row count is compared against.
    disclosure : int
        Detail level forwarded to ``CustomWarning``.

    NOTE(review): the parameter name suggests a minimum, yet the warning
    fires when the count is *above* it -- confirm the intended direction.
    """
    row_total = data.shape[0]
    if not row_total > MIN_NB_SAMPLES:
        return
    text = f'Samples count exceeds the threshold limit {MIN_NB_SAMPLES}'
    CustomWarning(disclosure, 'CRITICAL').display(text)

In [None]:
# Load the Excel test workbook into data1.
# NOTE(review): absolute, user-specific Windows path -- presumably only
# resolves on the author's machine; consider a configurable data directory.
data1 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\Exceltest.xlsx')

In [None]:
# Check the Excel frame loaded in the previous cell against a 1% limit.
# `d1` was only assigned in a later cell, so a fresh Run All raised NameError.
check_missing_values_limit(data1,1,3)

In [None]:
# Load file1.csv into d10.
# NOTE(review): absolute, user-specific Windows path -- presumably only
# resolves on the author's machine; consider a configurable data directory.
d10 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\file1.csv')

In [None]:
# Load the Excel test workbook into d1 through a /mnt/c/... path.
# NOTE(review): this looks like the same workbook as data1, addressed via a
# WSL-style mount -- confirm whether both loads are needed.
d1 = load_data('/mnt/c/Users/p.santosha.dasari/Desktop/Feature164/Exceltest.xlsx')