In [3]:
import logging
import os
from math import ceil

import pandas as pd
from sklearn.impute import KNNImputer

# Load data from CSV files (semicolon-delimited)
# or from Excel files (.xls / .xlsx)
def load_data(data_set):
    """Load a tabular data file into a pandas DataFrame.

    The reader is chosen from the file name (case-insensitive): names
    containing ``.csv`` are read as semicolon-delimited text with the first
    row as header; names containing ``.xls`` / ``.xlsx`` are read as Excel
    with the first column as index. The first rows of the loaded frame are
    printed.

    Parameters
    ----------
    data_set : str or os.PathLike
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file type is not supported or
        the file cannot be read (the reason is printed).
    """
    # Lower-case so that 'DATA.CSV' is recognised; str() also accepts Path objects.
    file_name = str(data_set).lower()
    try:
        if '.csv' in file_name:
            d1 = pd.read_csv(data_set, delimiter=';', header=0)
            print('CSV file loaded')
        elif any(ext in file_name for ext in ('.xls', '.xlsx')):
            d1 = pd.read_excel(data_set, index_col=0)
            print('Excel file loaded')
        else:
            # Report the real cause instead of relying on an unbound variable.
            print('Error encountered in loading data file: unsupported file type')
            return None
        print(d1.head())
        return d1
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f'Error encountered in loading data file: {exc}')
        return None
    
#Missing values 

# Impute missing values with the mean for quantitative data

def impute_missing_value_mean(data):
    """Fill missing values of quantitative columns with the column mean.

    A column is treated as quantitative when its dtype matches one of the
    types listed in the members of ``QuantitativeDataType``. Columns without
    missing values, and columns of any other dtype, are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        ``data`` itself after imputation, or ``None`` if an error occurred
        (the error is printed).
    """
    try:
        # Flatten the per-member type lists once instead of once per column.
        quantitative_types = [t for member in QuantitativeDataType for t in member.value]
        for col in data.columns:
            column = data[col]
            if column.isnull().sum() > 0 and column.dtype in quantitative_types:
                # Assign the filled column back: an in-place fillna on the
                # data[col] selection is chained assignment and does not
                # update the frame under pandas copy-on-write.
                data[col] = column.fillna(column.mean())
        return data
    except Exception as exc:
        print(f'Error encountered in imputing missing values - mean: {exc}')
        return None
    
# Impute missing values with the mode for categorical data
def impute_missing_values_mode(data):
    """Fill missing values of categorical columns with the column mode.

    A column is treated as categorical when its dtype matches one of the
    types listed in the members of ``CategoricalDataType``. When several
    values tie for the mode, the first one returned by ``Series.mode`` is
    used. Columns that contain no non-missing value are skipped because they
    have no mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        ``data`` itself after imputation, or ``None`` if an error occurred
        (the error is printed).
    """
    try:
        categorical_types = [t for member in CategoricalDataType for t in member.value]
        for col in data.columns:
            column = data[col]
            if column.isnull().sum() == 0 or column.dtype not in categorical_types:
                continue
            modes = column.mode()
            if modes.empty:
                # All values missing: indexing the empty result would raise
                # and abort the imputation of every other column.
                continue
            # Assign back rather than calling fillna(inplace=True) on the
            # data[col] selection, which is chained assignment.
            data[col] = column.fillna(modes.iloc[0])
        return data
    except Exception as exc:
        print(f'Error encountered in imputing missing values - mode: {exc}')
        return None
       


# Impute missing values with KNN

def impute_missing_value_knn(data, n_neighbors=2):
    """Fill missing values with scikit-learn's ``KNNImputer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. The imputer is fitted on the whole frame.
        NOTE(review): presumably every column must be numeric — confirm
        against the callers.
    n_neighbors : int, optional
        Number of neighbours passed to ``KNNImputer``. Defaults to 2, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame or None
        ``data`` itself when nothing is missing; otherwise a new frame with
        the same columns and row index holding the imputed values. ``None``
        if an error occurred (the error is printed).
    """
    try:
        if not data.isnull().any().any():
            return data
        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed = imputer.fit_transform(data)
        # Keep the original row labels; rebuilding without `index` silently
        # replaced them with 0..n-1.
        return pd.DataFrame(imputed, columns=data.columns, index=data.index)
    except Exception as exc:
        print(f'Error encountered in imputing missing values - knn: {exc}')
        return None

#Impute missing values with Interpolate
def impute_missing_values_interpolate(data):
    """Fill missing values by interpolation.

    Calls ``data.interpolate()`` with its default arguments and returns the
    result; ``data`` itself is not reassigned here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the values to interpolate.

    Returns
    -------
    pandas.DataFrame or None
        The interpolated frame, or ``None`` if an error occurred (the error
        is printed).
    """
    try:
        return data.interpolate()
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f'Error encountered in imputing missing values - interpolate: {exc}')
        return None

In [2]:
import numpy as np
import enum
from enum import Enum, auto

# the use of Enum classes will prevent incorrect combination of values
class QuantitativeDataType(Enum):
    """Python/NumPy types that are treated as quantitative, grouped by kind.

    Each member's value is a list of types; ``impute_missing_value_mean``
    checks a column's dtype for membership in these lists.
    """
    # Real-valued measurements.
    CONTINUOUS = [float, np.float64]
    # Integer counts. np.int64 is listed explicitly (as it is in
    # CategoricalDataType.NUMERICAL) because NumPy does not map the builtin
    # `int` to a 64-bit integer on every platform.
    DISCRETE = [int, np.int64]

class CategoricalDataType(Enum):
    """Python/NumPy types that are treated as categorical, grouped by kind.

    Each member's value is a list of types; ``impute_missing_values_mode``
    checks a column's dtype for membership in these lists.
    """
    # Boolean values.
    BOOLEAN = [bool]
    # Numeric types; float, int and np.float64 are also listed in
    # QuantitativeDataType, so a numeric dtype can match both enums.
    NUMERICAL = [float, int, np.float64, np.int64]
    # Strings and generic Python objects.
    CHARACTER = [str, object]
    
class DataType(Enum):
    """
    Top-level data type categories.

    QUANTITATIVE and CATEGORICAL hold lists of the corresponding
    QuantitativeDataType / CategoricalDataType members; DATETIME and UNKNOWN
    are plain string tags.
    """
    QUANTITATIVE = [QuantitativeDataType.CONTINUOUS,
                   QuantitativeDataType.DISCRETE]
    CATEGORICAL = [CategoricalDataType.BOOLEAN,
                  CategoricalDataType.NUMERICAL,
                  CategoricalDataType.CHARACTER]
    #MISSING = 'MISSING'
    DATETIME = 'DATETIME'
    UNKNOWN = 'UNKNOWN'
    
def CustomWarning(level, message, log_file='mylog.log'):
    """Write ``message`` to a log file through the 'mylogger' logger.

    The logger's level is set to the severity of the message, so the message
    is always emitted. A file handler for ``log_file`` is attached the first
    time that file is used and reused on later calls. Handlers stay attached,
    so if several different files are used, each message is written to all
    of them.

    Parameters
    ----------
    level : str
        Severity name: 'CRITICAL', 'ERROR', 'WARNING' or 'DEBUG'
        (case-insensitive). Any other value is logged as INFO.
    message : str
        Text to record.
    log_file : str, optional
        Path of the log file. Defaults to 'mylog.log', the value that was
        previously hard-coded.
    """
    severities = {
        'CRITICAL': logging.CRITICAL,
        'ERROR': logging.ERROR,
        'WARNING': logging.WARNING,
        'DEBUG': logging.DEBUG,
    }
    # Unknown names fall back to INFO instead of making setLevel() raise.
    severity = severities.get(str(level).upper(), logging.INFO)

    logger = logging.getLogger('mylogger')
    logger.setLevel(severity)

    # Attach the file handler only once per file: adding a new handler on
    # every call duplicated each record and leaked an open file per call.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(log_file)
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)

    logger.log(severity, message)
    


In [20]:
# Load pseudo_adni_mod.csv through load_data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_data1 = load_data('C:\\Users\\p.santosha.dasari\\Desktop\\Feature164\\pseudo_adni_mod.csv')

In [7]:
import pandas as pd
def load_data(data_set):
    """Load a CSV or Excel file into a pandas DataFrame.

    NOTE(review): this cell redefines ``load_data`` from an earlier cell with
    the same logic; whichever cell ran last is the one in effect.

    The reader is chosen from the file name (case-insensitive): names
    containing ``.csv`` are read as semicolon-delimited text with the first
    row as header; names containing ``.xls`` / ``.xlsx`` are read as Excel
    with the first column as index. The first rows of the loaded frame are
    printed.

    Parameters
    ----------
    data_set : str or os.PathLike
        Path of the file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file type is not supported or
        the file cannot be read (the reason is printed).
    """
    excel = ('.xls', '.xlsx')
    # Lower-case so that 'DATA.CSV' is recognised; str() also accepts Path objects.
    lowered = str(data_set).lower()
    try:
        if '.csv' in lowered:
            d1 = pd.read_csv(data_set, delimiter=';', header=0)
            print('CSV file loaded')
        elif any(ext in lowered for ext in excel):
            d1 = pd.read_excel(data_set, index_col=0)
            print('Excel file loaded')
        else:
            # Report the real cause instead of relying on an unbound variable.
            print('Exception encountered in loading data file: unsupported file type')
            return None
        print(d1.head())
        return d1
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print(f'Exception encountered in loading data file: {exc}')
        return None

In [17]:
# Count the missing values in each column of the loaded frame.
csv_data1.isnull().sum()

CDRSB.bl               0
ADAS11.bl              1
MMSE.bl                0
RAVLT.immediate.bl     1
RAVLT.learning.bl      0
RAVLT.forgetting.bl    0
FAQ.bl                 0
WholeBrain.bl          0
Ventricles.bl          0
Hippocampus.bl         0
MidTemp.bl             0
Entorhinal.bl          0
ABETA.MEDIAN.bl        0
PTAU.MEDIAN.bl         0
TAU.MEDIAN.bl          0
AGE                    0
dtype: int64

In [16]:
# Fill the missing values with the KNN-based imputer; the result is bound to a new name.
data_filled = impute_missing_value_knn(csv_data1)

In [19]:
# Re-count the missing values per column on the imputed frame.
data_filled.isnull().sum()

CDRSB.bl               0
ADAS11.bl              0
MMSE.bl                0
RAVLT.immediate.bl     0
RAVLT.learning.bl      0
RAVLT.forgetting.bl    0
FAQ.bl                 0
WholeBrain.bl          0
Ventricles.bl          0
Hippocampus.bl         0
MidTemp.bl             0
Entorhinal.bl          0
ABETA.MEDIAN.bl        0
PTAU.MEDIAN.bl         0
TAU.MEDIAN.bl          0
AGE                    0
dtype: int64

In [3]:
# Check whether the missing values in any column exceed a percentage threshold (e.g. 50%)
def check_missing_value_limit(data, percent):
    """Log a CRITICAL message for every column with too many missing values.

    The limit is ``ceil(percent / 100 * number_of_rows)``; a column is
    reported when its count of missing values is strictly greater than that
    limit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    percent : int or float
        Allowed share of missing values per column, as a percentage.

    Returns
    -------
    list
        Labels of the columns that exceeded the limit, in column order
        (empty when none did).
    """
    # This is an upper bound on missing entries per column.
    max_missing = ceil((percent / 100) * data.shape[0])
    # Count the nulls for all columns in one pass.
    missing_counts = data.isnull().sum()
    flagged = [col for col, n_missing in missing_counts.items() if n_missing > max_missing]
    for col in flagged:
        CustomWarning('CRITICAL',f'Missing value for column {col} exceeds threshold limit {max_missing}')
    return flagged

In [None]:
#Checking samples limit

def check_number_of_samples(data,MIN_NB_SAMPLES):
    """Log a CRITICAL message when the row count is above ``MIN_NB_SAMPLES``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose number of rows (``data.shape[0]``) is checked.
    MIN_NB_SAMPLES : int
        Threshold the row count is compared against.

    Returns
    -------
    None
    """
    # NOTE(review): the parameter name suggests a minimum, yet the warning
    # fires when the count is *above* it — confirm the intended direction.
    within_limit = not (data.shape[0] > MIN_NB_SAMPLES)
    if within_limit:
        return
    CustomWarning('CRITICAL',f'Samples count exceeds the threshold limit {MIN_NB_SAMPLES}')