In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
def DataSummary(Data, sve=0):

    """
        This function gives a summary for columns of given data.

        One summary row is produced per column of Data, in column order.
        Columns selected by np.number get every statistic. For all other
        columns only DataType, TotalCount, DistinctCount, MinValue, MaxValue,
        NullCount and NullRate are computed; the remaining fields are NaN.
        MinValue/MaxValue of a non-numeric column are taken over its non-null
        values and are NaN when those values cannot be ordered.

        Parameters:
            Data : The name of dataset (should be a dataframe).
            sve  : Single Value Elimination. When equal to 1, columns that
                   have no non-null value, or exactly one distinct value and
                   no nulls, are dropped from Data and their names printed.

        Returns:
            DataFrame with the summary, or the tuple
            (summary DataFrame, reduced Data) when sve == 1.

        Example:
            Summary = koco.DataSummary(CebimPos)
            Summary, Reduced = koco.DataSummary(CebimPos, sve=1)
    """

    # Output columns, in order, with the dtypes used for an empty summary.
    Schema = [
        ('ColumnName', 'str'),
        ('DataType', 'str'),
        ('TotalCount', 'int'),
        ('DistinctCount', 'int'),
        ('MinValue', 'float'),
        ('MaxValue', 'float'),
        ('AvgValue', 'float'),
        ('ModValue', 'float'),
        ('MedValue', 'float'),
        ('StdValue', 'float'),
        ('VarValue', 'float'),
        ('NullCount', 'int'),
        ('NullRate', 'float'),
        ('ZeroCount', 'int'),
        ('ZeroRate', 'float'),
        ('PositiveCount', 'int'),
        ('PositiveRate', 'float'),
        ('NegativeCount', 'int'),
        ('NegativeRate', 'float'),
    ]
    FieldNames = [Name for Name, _ in Schema]
    NaN = float('nan')

    # Loop invariants: computed once instead of once per column.
    TotalCount = len(Data)
    NumericColumns = set(Data.select_dtypes(include=np.number).columns)

    Rows = []
    for Column in Data.columns:
        Values = Data[Column]
        NullCount = Values.isnull().sum()

        # Every field starts as NaN; the fields shared by both kinds of
        # column are filled here, the rest depends on the column type.
        Row = dict.fromkeys(FieldNames, NaN)
        Row.update({
            'ColumnName': Column,
            'DataType': Values.dtypes,
            'TotalCount': TotalCount,
            'DistinctCount': Values.nunique(),
            'NullCount': NullCount,
            'NullRate': _Rate(NullCount, TotalCount),
        })

        if Column in NumericColumns:
            MinValue = Values.min()
            MaxValue = Values.max()
            ZeroCount = (Values == 0).sum()
            PositiveCount = (Values > 0).sum()
            NegativeCount = (Values < 0).sum()
            # mode() is empty for an all-null column, so indexing it would fail.
            AllNull = pd.isna(MinValue) and pd.isna(MaxValue)
            Row.update({
                'MinValue': MinValue,
                'MaxValue': MaxValue,
                'AvgValue': Values.mean(),
                'ModValue': NaN if AllNull else Values.mode()[0],
                'MedValue': Values.median(),
                'StdValue': Values.std(),
                'VarValue': Values.var(),
                'ZeroCount': ZeroCount,
                'ZeroRate': _Rate(ZeroCount, TotalCount),
                'PositiveCount': PositiveCount,
                'PositiveRate': _Rate(PositiveCount, TotalCount),
                'NegativeCount': NegativeCount,
                'NegativeRate': _Rate(NegativeCount, TotalCount),
            })
        else:
            Row['MinValue'] = _SafeExtreme(Values, 'min')
            Row['MaxValue'] = _SafeExtreme(Values, 'max')

        Rows.append(Row)

    if Rows:
        # Build the frame in one go rather than enlarging it row by row.
        Summary = pd.DataFrame(Rows, columns=FieldNames)
    else:
        # No columns to describe: return an empty, typed summary.
        Summary = pd.DataFrame({Name: pd.Series(dtype=Kind) for Name, Kind in Schema})

    if sve == 1:
        DeletedColumns = [
            Row['ColumnName']
            for Row in Rows
            if Row['DistinctCount'] == 0
            or (Row['DistinctCount'] == 1 and Row['NullCount'] == 0)
        ]
        if DeletedColumns:
            Data = Data.drop(columns=DeletedColumns)
        print('SVE column list: ' + str(DeletedColumns) + '\n')

        return Summary, Data

    return Summary


def _Rate(Count, Total):
    """Return Count / Total, or NaN when Total is zero (zero-row data)."""
    return Count / Total if Total else float('nan')


def _SafeExtreme(Values, How):
    """Return the 'min' or 'max' of the non-null entries of Values, or NaN
    when there are none or when they cannot be ordered."""
    NonNull = Values.dropna()
    if NonNull.empty:
        return float('nan')
    try:
        return getattr(NonNull, How)()
    except TypeError:
        # Mixed, unorderable objects or unordered categoricals.
        return float('nan')

In [None]:
def Rep(Data, Train=None, CalcParameter=None, TestSize=0.30, Seed=1234):

    """
        This function calculates the representation power of a train set.

        Data and Train are both summarised with DataSummary and compared on
        every column that is int64 or float64 in both of them. For each
        parameter in CalcParameter the relative deviation of the train value
        t from the population value p is r = (t - p) / p, with both values
        rounded to 6 decimals. When p is 0 the squared deviation is taken as
        0 if t is 0 as well and as 1 otherwise. The score of a column is

            Rep = 1 - sqrt(sum of squared deviations / len(CalcParameter))

        A NaN statistic gives a NaN score for that column. The mean score
        times 100 is printed.

        Parameters:
            Data          : The name of dataset (should be a dataframe).
            Train         : Optional, train dataset (should be a dataframe).
                            When omitted, Data is split with train_test_split.
            CalcParameter : DataSummary column names to compare. Defaults to
                            AvgValue, ModValue, MedValue, NullRate, ZeroRate,
                            PositiveRate and NegativeRate.
            TestSize      : Test size for train and test
            Seed          : Random State value

        Returns:
            DataFrame indexed by ColumnName with a single 'Rep' column,
            sorted by 'Rep' in descending order. It is empty when Data and
            Train share no int64/float64 column.

        Example:
            Representation = koco.Rep(Data=OtherPOS)
            Representation = koco.Rep(Data=OtherPOS, Train=OP_Test_x)
    """

    if Train is None:
        # Only the train part of the split is used.
        Train, _ = train_test_split(Data, test_size=TestSize, random_state=Seed)

    if CalcParameter is None:
        CalcParameter = ['AvgValue', 'ModValue', 'MedValue', 'NullRate', 'ZeroRate', 'PositiveRate', 'NegativeRate']

    POPSummary = _RepSummary(Data)
    TrainSummary = _RepSummary(Train)

    ColumnList = [Col for Col in POPSummary.index if Col in TrainSummary.index]

    Scores = {}
    for Col in ColumnList:
        SquaredSum = 0
        for Parameter in CalcParameter:
            p = POPSummary.loc[Col, Parameter]
            t = TrainSummary.loc[Col, Parameter]
            if p == 0 and t == 0:
                continue  # identical zero statistics contribute nothing
            if p == 0:
                # A relative deviation is undefined for p == 0; count it as 1.
                SquaredSum = SquaredSum + 1
            else:
                r = (round(t, 6) - round(p, 6)) / round(p, 6)
                SquaredSum = SquaredSum + r**2

        Scores[Col] = 1 - ((SquaredSum / len(CalcParameter))**(1/2))

    # Building the frame from the scores keeps the no-shared-column case
    # valid: the result is simply empty instead of raising KeyError.
    Representation = pd.DataFrame({'Rep': pd.Series(Scores, dtype='float64')})
    Representation.index.name = 'ColumnName'

    print('Representaion power of train sample: ' + str(round((Representation['Rep'].mean())*100,2)))
    Representation.sort_values(by='Rep', ascending=False, inplace=True)

    return Representation


def _RepSummary(Frame):
    """Return DataSummary(Frame) restricted to int64/float64 columns and
    indexed by ColumnName."""
    Summary = DataSummary(Frame)
    # Element-wise == is required here: the DataType cells hold dtype
    # objects, which compare equal to their string names.
    Keep = (Summary.DataType == 'int64') | (Summary.DataType == 'float64')
    return Summary[Keep].set_index('ColumnName')