# Fairness-Aware Instrumentation of ML-Pipelines

## Preparations

In [1]:
from collections import defaultdict
import inspect
import pandas as pd
import numpy as np
from scipy import stats
import re
from graphviz import Digraph

from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from utils import *

import warnings

# Suppress every warning for the rest of the session so that the printed
# reports below are not interleaved with library notices.
warnings.filterwarnings(action='ignore')

# pandas display settings (full option keys spelled out).
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', 1000)

# numpy prints floats with four digits of precision as well.
np.set_printoptions(precision=4)

## Logs Part

In [2]:
# Current Version
def describe_ver(pipeline_to_test, cat_col, numerical_col):
    """Replay a pipeline function line by line and print metric changes.

    The source of ``pipeline_to_test`` is fetched with ``inspect.getsource``
    and split by ``func_aggregation`` into three sequences (presumably the
    default-argument assignments, the body statements and the returned
    names -- confirm against ``utils``).  Every statement is run with
    ``exec``.  Whenever the name left of the first ``=`` is a pandas
    DataFrame, per-column metrics are recomputed with ``cal_numerical`` /
    ``cal_categorical``, compared with the metrics stored after the
    previously inspected statement, and any difference is shown with
    ``display``.

    Afterwards the object named by ``outputs[0]`` is converted with
    ``pipeline_to_dataflow_graph``; for every graph item whose ``name`` is a
    monitored column, ``item.operation.fit_transform`` is applied to that
    column of the last inspected DataFrame (overwriting the column) and the
    metrics are compared again.

    Parameters
    ----------
    pipeline_to_test : function
        Pipeline-building function whose source is replayed.
    cat_col : list of str
        Names of the categorical columns to monitor.
    numerical_col : list of str
        Names of the numerical columns to monitor.

    Returns
    -------
    None
        All results are printed / displayed.

    Notes
    -----
    NOTE(review): ``exec``/``eval`` are called without explicit namespaces,
    so the replayed statements share this function's implicit local
    namespace.  Local names here must not clash with names used by the
    pipeline, and this relies on executed assignments staying visible to
    later ``eval`` calls -- confirm on the target Python version.
    NOTE(review): the pipeline source is executed verbatim; only use this
    with trusted pipeline functions.
    """
    
    raw_func = inspect.getsource(pipeline_to_test)


    input_args, executable_list, outputs = func_aggregation(raw_func)
    
    # Bring the entries of input_args into scope before replaying the body.
    for line in input_args:
        exec(line)
    
    print()
    print('####################### Start Pandas Opeation #######################')
    print()
    
    ######################################
    # Initialization
    ######################################
    # Metrics recorded after the previously inspected statement
    # (keys: 'numerical', 'categorical'); empty until the first inspection.
    prev = {}
    
    # One row per monitored column, one column per metric.  Cells start as
    # np.inf until they are filled in by cal_numerical / cal_categorical.
    numerical_metric_list = ['count', 'missing_count', 'median', 'mad', 'range']
    numerical_df = pd.DataFrame(np.inf, index = numerical_col, columns = numerical_metric_list)
    
    cat_metric_list = ['missing_count', 'num_class', 'class_count', 'class_percent']
    cat_df = pd.DataFrame(np.inf, index = cat_col, columns = cat_metric_list)

    
    ######################################
    # Execution
    ######################################     
    for cur_line in executable_list:
        print_bool = False
        # The statement is executed before any inspection; an error raised
        # here is not caught by the try block below.
        exec(cur_line)
        # Lines containing '#' are executed but never inspected.
        if '#' in cur_line:
            continue
        try: 
            # Inspect the object named by the text left of the first '='.
            # If that text is not a valid expression, eval raises and control
            # moves to the except branch below.
            if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":

                target_df = cur_line.split('=')[0].strip()
                
                # Restrict the monitored columns to those present in this frame.
                col_list = eval(target_df).columns.tolist()
                numerical_col_sub = [i for i in numerical_col if i in col_list]
                cat_col_sub = [j for j in cat_col if j in col_list]
                
                if len(numerical_col_sub) != 0:
                    ######################################################################################
                    # numerical features & metrics
                    # counts, missing values, median and MAD, range/scaling
                    ######################################################################################
                    for numeric_feature in numerical_col_sub:

                        numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)
                
                if len(cat_col_sub) != 0:
                    ######################################################################################
                    # categorical features & metrics
                    # missing values, number of classes, counts for each group, percentage for each group
                    ######################################################################################
                    for cat_feature in cat_col_sub:

                        cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)
                    
                ######################################################################################
                # Comparison with the metrics of the previously inspected statement
                # (skipped for the very first inspected statement, when prev is empty).
                ######################################################################################

                if len(prev) != 0:
                    if len(numerical_col_sub) != 0:
                        numerical_dif = numerical_df - prev['numerical']
                        if (numerical_dif.values != 0).any():
                            print('*'*10)
                            print('Changes in numerical features!')
                            display(numerical_dif)
                            print('*'*10)
                            print()
                    
                    ##################################
                    # numerical comparison above
                    # categorical comparison below
                    ##################################
                    if len(cat_col_sub) != 0:
                        cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
                        cat_dif_flag = check_cat_dif(cat_dif)
                        if cat_dif_flag:
                            print('*'*10)
                            print('Changes in categorical features!')
                            display(cat_dif)
                            print('*'*10)
              
                # NOTE(review): print_bool is set to True immediately before it
                # is tested, so the banner below is always printed on this path.
                print_bool = True
                
                if print_bool:
                    print('-------------------------------------------------------')
                    print(f'Inpected {cur_line}')
                    print('-------------------------------------------------------')
                    print() 

                # save the output for next round comparison
                prev['numerical'] = numerical_df.copy()
                prev['categorical'] = cat_df.copy()

            elif str(eval(f"type({cur_line.split('=')[0].strip()})")).startswith("<class 'sklearn"):
                # sklearn objects are not inspected in this phase.
                pass
            else:
                pass

        except:
            # Reached when anything inside the try block raises, e.g. when the
            # text left of the first '=' is not an expression (as for a call
            # such as df.drop(columns=..., inplace=True)).
            # NOTE(review): this branch reuses numerical_col_sub, cat_col_sub and
            # target_df from an earlier iteration; they look undefined if no
            # DataFrame statement has been inspected yet -- confirm.
            # NOTE(review): the loops below iterate over the full numerical_col /
            # cat_col lists rather than the *_sub subsets used above -- confirm
            # that this is intended.
            if len(numerical_col_sub) != 0:
                ######################################################################################
                # numerical features & metrics
                # counts, missing values, median and MAD, range/scaling
                ######################################################################################
                for numeric_feature in numerical_col:

                    numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)
            if len(cat_col_sub) != 0:
                ######################################################################################
                # categorical features & metrics
                # missing values, number of classes, counts for each group, percentage for each group
                ######################################################################################
                for cat_feature in cat_col:

                    cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)

            ######################################################################################
            # Comparison with the metrics of the previously inspected statement
            ######################################################################################
            if len(prev) != 0:
                if len(numerical_col_sub) != 0:
                    numerical_dif = numerical_df - prev['numerical']
                    if (numerical_dif.values != 0).any():
                        print('*'*10)
                        print('Changes in numerical features!')
                        display(numerical_dif)
                        print('*'*10)
                        print()

                ##################################
                # numerical comparison above
                # categorical comparison below
                ##################################
                if len(cat_col_sub) != 0:
                    cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
                    cat_dif_flag = check_cat_dif(cat_dif)
                    if cat_dif_flag:
                        print('*'*10)
                        print('Changes in categorical features!')
                        display(cat_dif)
                        print('*'*10)

            print_bool = True

            if print_bool:
                print('-------------------------------------------------------')
                print(f'Inpected {cur_line}')
                print('-------------------------------------------------------')
                print() 

            # save the output for next round comparison
            prev['numerical'] = numerical_df.copy()
            prev['categorical'] = cat_df.copy()            
            

    # Convert the object named by the first entry of `outputs` into graph
    # items; the loop below reads .name and .operation from each item.
    nested_graph = pipeline_to_dataflow_graph(eval(f'{outputs[0]}'))

    print()
    print('####################### Start Sklearn Pipeline #######################')
    print()
        
    for item in nested_graph:
        ######################################################################################
        # numerical features & metrics
        # counts, missing values, median and MAD, range/scaling
        ######################################################################################
        if item.name in numerical_col: 
            numeric_feature = item.name
            
            # Apply the operation to this single column (reshaped to one
            # column, n rows) and write the result back into the last
            # inspected DataFrame.
            eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
            print('-------------------------------------------------------')
            print(f"Operations {str(item.operation).split('(')[0]} on {item.name}")
            print('-------------------------------------------------------')
            print()
            
            ##############################
            # Metrics calculation
            ##############################
            numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)
            
            ##############################
            # Comparison
            ##############################
            numerical_dif = numerical_df - prev['numerical']
            
            # Only the row of the column that was just transformed is checked and shown.
            if (numerical_dif.loc[numeric_feature,:].values != 0).any():
                print('*'*10)
                print('Changes in numerical features!')
                display(numerical_dif.loc[numeric_feature,:].to_frame())
                print('*'*10)
                print()
                
        ######################################################################################
        # categorical features & metrics
        # missing values, number of classes, counts for each group, percentage for each group
        ######################################################################################               
        if item.name in cat_col:
            cat_feature = item.name
            # First try to densify the result with .toarray(); if anything in
            # that statement raises, redo the transform and store the result
            # as returned.
            try:
                eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1)).toarray()
            except:
                eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
            print('-------------------------------------------------------')
            print(f"Operations {str(item.operation).split('(')[0]} on {item.name}")
            print('-------------------------------------------------------')
            print()
            
            ##############################
            # Metrics calculation
            ##############################            
            cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)
            
            ##############################
            # Comparison
            ##############################            
            cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
            cat_dif_flag = check_cat_dif(cat_dif)
            if cat_dif_flag:
                print('*'*10)
                print('Changes in categorical features!')
                display(cat_dif.loc[cat_feature,:].to_frame())
                print('*'*10)
                
        # Remember the metrics so the next graph item is compared against them.
        prev['numerical'] = numerical_df.copy()
        prev['categorical'] = cat_df.copy()

## DAGs Part

In [31]:
def find_pd_lines(pipeline_func):
    """Return the statements of ``pipeline_func`` that touch pandas objects.

    The function source is fetched with ``inspect.getsource`` and split by
    ``func_aggregation``.  Each statement is executed; a statement is kept
    when it contains the text ``inplace`` or when the name left of its first
    ``=`` evaluates to an object whose type string starts with
    ``<class 'pandas``.

    Parameters
    ----------
    pipeline_func : function
        Pipeline-building function to analyse.

    Returns
    -------
    list of str
        The selected statements, in execution order.

    Notes
    -----
    SECURITY: the source is run with ``exec``; only pass trusted functions.
    """
    source = inspect.getsource(pipeline_func)
    pd_lines = []
    input_args, executable_list, _ = func_aggregation(source)

    # Run the replayed statements in an explicit namespace instead of this
    # function's implicit locals.  Assignments made by one statement then
    # stay visible to the next regardless of how the interpreter implements
    # locals(), and pipeline variables cannot clash with the names used here.
    namespace = {}
    for line in input_args:
        exec(line, globals(), namespace)

    for cur_line in executable_list:
        exec(cur_line, globals(), namespace)
        if 'inplace' in cur_line:
            pd_lines.append(cur_line)
            continue
        target = cur_line.split('=')[0].strip()
        try:
            type_text = str(eval(f"type({target})", globals(), namespace))
        except Exception:
            # The text left of '=' is not an inspectable expression (e.g. a
            # comment or a bare call): such a line is simply not selected.
            continue
        if type_text.startswith("<class 'pandas"):
            pd_lines.append(cur_line)
    return pd_lines

def pd_to_dataflow_graph(pipeline_func, parent_vertices=None):
    """Build data-flow vertices for the pandas statements of a pipeline.

    The statements returned by ``find_pd_lines`` are parsed textually.  Each
    recognised statement yields one ``DataFlowVertex`` whose parents are the
    current frontier (``previous``), except for ``read_csv`` (parents are
    ``parent_vertices``) and ``join``/``merge``/``concat`` (parents are looked
    up by name with ``search_vertex_by_names``).

    Parameters
    ----------
    pipeline_func : function
        Pipeline-building function to analyse.
    parent_vertices : list, optional
        Parents for the ``read_csv`` vertices.  Defaults to a new empty list.

    Returns
    -------
    tuple
        ``(graph, previous)``: every vertex created, in order, and the
        frontier vertices left after the last statement.
    """
    # A fresh list per call; a mutable default would be shared by all calls
    # and by every vertex that stores it.
    if parent_vertices is None:
        parent_vertices = []

    executable_list = find_pd_lines(pipeline_func)
    graph = []
    previous = []

    for line in executable_list:
        # Stays None when the line is not recognised, so that a vertex from
        # an earlier line is never appended to the graph a second time.
        vertex = None

        if 'inplace' in line and '#' not in line:
            df_name = line.split('.')[0]
            func_name = line.split('.')[1].split('(')[0].strip()
            col_effect = line.split('[')[1].split(']')[0].strip()
            if len(previous) > 1:
                # Iterate over a snapshot: the frontier is edited in the loop.
                for node in list(previous):
                    if node.name == df_name:
                        vertex = DataFlowVertex([node], df_name+'_drop', func_name+' '+col_effect, col_effect)
                        previous.append(vertex)
                        previous.remove(node)
            else:
                vertex = DataFlowVertex(previous, df_name+'_drop', func_name+' '+col_effect, col_effect)
                previous = [vertex]
        else:
            var_name = line.split('=')[0].strip()

            # match ".func_name(...)"
            pd_func = re.search(r'\.\s*([_a-z]*)\s*\(', line)
            if pd_func:
                func_name = pd_func.group(1)
                params = re.search(r'\(([^\)]*)\)', line)  # "(...)"

                if params:
                    params = params.group(1).strip()

                    if func_name == 'read_csv':  # df = pd.read_csv(path)
                        vertex = DataFlowVertex(parent_vertices, var_name, func_name, params)
                        previous.append(vertex)

                    elif func_name in ['join', 'merge', 'concat']:
                        if func_name == 'concat':  # df_new = pd.concat([df1,df2],keys=[])
                            df_names = [item.strip() for item in params.split(']')[0].strip().strip('[]').split(',')]

                        else:  # df_new = df1.join/merge(df2,on='...',how='...')
                            df_names = [line.split('=')[1].strip().split('.')[0], params.split(',')[0].strip()]
                        parent_vertices = search_vertex_by_names(df_names, graph)  # search in graph by df_names
                        vertex = DataFlowVertex(parent_vertices, var_name, func_name, params)  # TODO vertex name?
                        previous = [vertex] + list(set(previous) - set(parent_vertices))
                    elif 'lambda' in params:
                        cols = var_name.split('[')[1].split(']')[0].strip()
                        vertex = DataFlowVertex(previous, func_name+' '+cols, func_name, params)
                        previous = [vertex]
                    elif '[' in var_name:
                        cols = var_name.split('[')[1].split(']')[0].strip()
                        vertex = DataFlowVertex(previous, func_name+' '+cols+' '+params, func_name, params)
                        previous = [vertex]
                    else:
                        vertex = DataFlowVertex(previous, func_name+' '+params, func_name, params)
                        previous = [vertex]

            # filter operation: "df[[cols]]", "df[condition]", "df.loc[]", "df.iloc[]"
            else:
                if '[[' in line:
                    is_filter = re.search(r'\[([^\]]*)\]', line)  # "[...]"
                else:
                    is_filter = re.search(r'\(([^\)]*)\)', line)  # "(...)"
                if is_filter:
                    filter_cond = is_filter.group(1).strip('[').strip(']')
                    vertex = DataFlowVertex(previous, 'select '+filter_cond, 'filter', filter_cond)
                    previous = [vertex]

        if vertex is not None:
            graph.append(vertex)

    return graph, previous


def sklearn_to_dataflow_graph(pipeline, parent_vertices=None):
    """Convert a sklearn pipeline into graph nodes attached to given parents.

    The graph is produced by ``pipeline_to_dataflow_graph_full``; every node
    whose ``parent_vertices`` equals ``[]`` gets ``parent_vertices`` as its
    parents.

    Parameters
    ----------
    pipeline : object
        Pipeline passed on to ``pipeline_to_dataflow_graph_full``.
    parent_vertices : list, optional
        Vertices to attach the parentless nodes to.  Defaults to a new empty
        list.

    Returns
    -------
    list
        The nodes returned by ``pipeline_to_dataflow_graph_full``.
    """
    # A fresh list per call: the list is stored on the nodes, so a mutable
    # default would be shared between the graphs of separate calls.
    if parent_vertices is None:
        parent_vertices = []

    graph = pipeline_to_dataflow_graph_full(pipeline, name_prefix='', parent_vertices=[])
    for node in graph:
        if node.parent_vertices == []:
            node.parent_vertices = parent_vertices
    return graph

def visualize(nested_graph, save_file_path ='./pipeline.gv', view=True, file_format='pdf'):
    """Render a list of graph nodes with graphviz.

    Parameters
    ----------
    nested_graph : iterable
        Nodes exposing ``name``, ``operation`` and ``parent_vertices``.
    save_file_path : str
        Path handed to ``Digraph.render``.
    view : bool
        Forwarded to ``Digraph.render``.  Defaults to ``True``, the value
        that used to be hard-coded.
    file_format : str
        Output format assigned to the graph.  Defaults to ``'pdf'``, the
        value that used to be hard-coded.

    Returns
    -------
    graphviz.Digraph
        The rendered graph object.
    """
    dot = Digraph(comment='preprocessing_pipeline')
    dot.format = file_format
    for node in nested_graph:
        # Formatting instead of '+' keeps the label valid when the operation
        # is an object rather than a string.
        dot.node(node.name, label=f'{node.name},\nop={node.operation}')

        for parent in node.parent_vertices:
            dot.edge(parent.name, node.name)
    dot.render(save_file_path, view=view)
    return dot

## Combine Both Parts into a Function Decorator

In [32]:
def tracer(cat_col, numerical_col):
    """Decorator factory that draws and inspects a pipeline function.

    Calling the decorated function

    1. builds the pandas part of the graph with ``pd_to_dataflow_graph``,
    2. calls the original function to obtain the pipeline,
    3. appends the graph from ``sklearn_to_dataflow_graph`` and renders the
       combined graph with ``visualize``,
    4. prints the per-step metric changes with ``describe_ver``.

    Parameters
    ----------
    cat_col : list of str
        Categorical columns passed on to ``describe_ver``.
    numerical_col : list of str
        Numerical columns passed on to ``describe_ver``.

    Returns
    -------
    function
        A decorator.  The decorated function returns the pipeline produced
        by the original function.
    """
    import functools

    def wrapper(func):
        @functools.wraps(func)  # keep the original name and docstring
        def call(*args, **kwargs):
            pd_graph, parent_vertices = pd_to_dataflow_graph(func)
            pipeline = func(*args, **kwargs)
            sklearn_graph = sklearn_to_dataflow_graph(pipeline, parent_vertices)
            pd_graph.extend(sklearn_graph)
            visualize(pd_graph)

            describe_ver(func, cat_col, numerical_col)
            # Hand the pipeline back; without this the caller's
            # `pipeline = some_pipeline()` always received None.
            return pipeline
        return call
    return wrapper

## Pipeline Functions

### German Dataset

In [33]:
# german_pipeline_easy: single-file German credit pipeline.
#   1. Reads the CSV at `f_path`.
#   2. Projects ten columns and keeps the rows with credit_amt >= 4000.
#   3. Builds (without fitting) a sklearn Pipeline: a ColumnTransformer that
#      one-hot encodes 'credit_his' and 'preset_emp', imputes the most
#      frequent value and then one-hot encodes 'personal_status_and_sex',
#      'guarantors' and 'property', and standard-scales the four numeric
#      columns, followed by a RandomForestClassifier.
#   Returns the unfitted Pipeline.
# The tracer monitors 'personal_status_and_sex' (categorical) and 'age'
# (numerical).
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['personal_status_and_sex'], numerical_col = ['age'])
def german_pipeline_easy(f_path = '../data/german_titled.csv'):
    data = pd.read_csv(f_path)
    # projection
    data = data[['duration_in_month', 'credit_his',  'credit_amt', 'preset_emp', 'personal_status_and_sex',
                 'guarantors', 'present_residence', 'property', 'age','label']]
    # filtering
    data = data.loc[(data.credit_amt>=4000)]

    #start sklearn pipeline
    one_hot_and_impute = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder())
    ])

    featurizer = ColumnTransformer(transformers=[
        ('onehot', OneHotEncoder(), ['credit_his', 'preset_emp']),
        ('impute_onehot', one_hot_and_impute, ['personal_status_and_sex', 'guarantors', 'property']),
        ('std_scaler', StandardScaler(), ['duration_in_month', 'credit_amt', 'present_residence', 'age'])
    ])
    pipeline = Pipeline([
        ('features', featurizer),
        ('learner', RandomForestClassifier())
    ])
    return pipeline
    

In [34]:
# german_pipeline_normal: German credit pipeline starting from two CSV splits.
#   1. Reads both splits (first CSV column used as index).
#   2. Merges them on 'identifier' and drops the first column in place.
#   3. Projects ten columns and keeps the rows with credit_amt >= 4000.
#   4. Builds (without fitting) a sklearn Pipeline: a ColumnTransformer
#      (one-hot, most-frequent impute + one-hot, standard scaling) followed
#      by a RandomForestClassifier.
#   Returns the unfitted Pipeline.
# The tracer monitors 'personal_status_and_sex' (categorical) and 'age'
# (numerical).
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['personal_status_and_sex'], numerical_col = ['age'])
def german_pipeline_normal(f_path_1='../data/german_titled_split_1.csv', f_path_2='../data/german_titled_split_2.csv'):
    # load data
    dataSplit1 = pd.read_csv(f_path_1, index_col = 0)
    dataSplit2 = pd.read_csv(f_path_2, index_col = 0)

    # join
    data = dataSplit1.merge(dataSplit2, on='identifier')

    # drop first col
    data.drop(data.columns[0], axis=1, inplace = True)

    # projection
    data = data[['duration_in_month', 'credit_his',  'credit_amt', 'preset_emp', 'personal_status_and_sex', 'guarantors', 'present_residence', 
                 'property', 'age','label']]
    # filtering
    data = data.loc[(data.credit_amt>=4000)]

    #start sklearn pipeline
    one_hot_and_impute = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder())
    ])

    featurizer = ColumnTransformer(transformers=[
        ('onehot', OneHotEncoder(), ['credit_his', 'preset_emp']),
        ('impute_onehot', one_hot_and_impute, ['personal_status_and_sex', 'guarantors', 'property']),
        ('std_scaler', StandardScaler(), ['duration_in_month', 'credit_amt', 'present_residence', 'age'])
    ])
    pipeline = Pipeline([
        ('features', featurizer),
        ('learner', RandomForestClassifier())
    ])
    return pipeline


### COMPAS Dataset

In [35]:
# compas_pipeline: pipeline over three CSV files.
#   1. Reads the three CSV files and drops index-like columns in place
#      ('age_cat' is dropped from df1 as well and re-derived later).
#   2. Concatenates df2 and df3 row-wise and merges the result with df1 on
#      ['id', 'name'].
#   3. Drops rows with missing values in seven key columns, derives
#      'age_cat' from 'age', projects fourteen columns and applies five row
#      filters.
#   4. Converts 'c_jail_out' / 'c_jail_in' to datetimes.
#   5. Builds (without fitting) a sklearn Pipeline: a ColumnTransformer
#      followed by LogisticRegression.
#   Returns the unfitted Pipeline.
# The tracer monitors 'race' (categorical) and 'age' (numerical).
# NOTE(review): `numeric2` lists 'length_of_stay', but the statement that
# would create that column is commented out, so fitting the returned
# pipeline on `data` would presumably fail on the missing column -- confirm.
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['race'], numerical_col = ['age'])
def compas_pipeline(f1_path = '../data/compass/demographic.csv',f2_path = '../data/compass/jailrecord1.csv',f3_path = '../data/compass/jailrecord2.csv'):
    #read csv files
    df1 = pd.read_csv(f1_path)
    df2 = pd.read_csv(f2_path)
    df3 = pd.read_csv(f3_path)
    
    #drop columns inplace
    df1.drop(columns=['Unnamed: 0','age_cat'],inplace=True)
    df2.drop(columns=['Unnamed: 0'],inplace=True)
    df3.drop(columns=['Unnamed: 0'],inplace=True)

    #JOIN dataframes column-wise and row-wise
    data23 = pd.concat([df2,df3],ignore_index=True)
    data = df1.merge(data23, on=['id','name'])

    #drop rows that miss a few important features
    data = data.dropna(subset=['id', 'name','is_recid','days_b_screening_arrest','c_charge_degree','c_jail_out','c_jail_in'])

    #generate a new column conditioned on existed column
    data['age_cat'] = data.apply(lambda row:'<25' if row['age'] < 25 else '>45' if row['age']>45 else '25-45', axis=1)

    #PROJECTION
    data = data[['sex', 'dob','age','c_charge_degree', 'age_cat', 'race','score_text','priors_count','days_b_screening_arrest',
                 'decile_score','is_recid','two_year_recid','c_jail_in','c_jail_out']]

    #SELECT based on some conditions
    data = data.loc[(data['days_b_screening_arrest'] <= 30)]
    data = data.loc[(data['days_b_screening_arrest'] >= -30)]
    data = data.loc[(data['is_recid'] != -1)]
    data = data.loc[(data['c_charge_degree'] != "O")]
    data = data.loc[(data['score_text'] != 'N/A')]
    # create a new feature 
    data['c_jail_out'] = pd.to_datetime(data['c_jail_out']) 
    data['c_jail_in'] = pd.to_datetime(data['c_jail_in']) 
#     data['length_of_stay'] = data['c_jail_out'] - data['c_jail_in']
    #specify categorical and numeric features
    categorical = ['sex', 'c_charge_degree', 'age_cat', 'race', 'score_text', 'is_recid',
           'two_year_recid']
    numeric1 = ['age','priors_count', 'decile_score']
    numeric2 = ['days_b_screening_arrest','length_of_stay']

    #sklearn pipeline
    impute1_and_onehot = Pipeline([('imputer1', SimpleImputer(strategy='most_frequent')), 
                                   ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    impute2_and_bin = Pipeline([('imputer2', SimpleImputer(strategy='mean')), 
                                ('bin_discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform'))])
    featurizer = ColumnTransformer(transformers=[
            ('impute1_and_onehot', impute1_and_onehot, categorical),
            ('impute2_and_bin', impute2_and_bin, numeric1),
            ('std_scaler', StandardScaler(), numeric2),
        ])
                               
    pipeline = Pipeline([
        ('features', featurizer),
        ('learner', LogisticRegression())
    ])
    return pipeline


### Adult Dataset (Sample)

In [36]:
# adult_pipeline_easy: flat pipeline on the adult sample.
#   1. Reads the CSV at `f_path`, treating '?' as a missing value.
#   2. Drops every row that contains a missing value.
#   3. Binarizes 'income-per-year' into `labels` (not used afterwards).
#   4. Builds (without fitting) a sklearn Pipeline: one-hot encoding of
#      'education' and 'workclass', standard scaling of 'age' and
#      'hours-per-week', followed by a DecisionTreeClassifier.
#   Returns the unfitted Pipeline.
# The tracer monitors 'race', 'occupation', 'education' (categorical) and
# 'age', 'hours-per-week' (numerical).
# `classes` is passed to label_binarize by keyword because current
# scikit-learn releases declare it keyword-only.
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['race', 'occupation', 'education'], numerical_col = ['age', 'hours-per-week'])
def adult_pipeline_easy(f_path = '../pipelines/adult-sample.csv'):
   
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])

    feature_transformation = ColumnTransformer(transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

        
    income_pipeline = Pipeline([
      ('features', feature_transformation),
      ('classifier', DecisionTreeClassifier())])
    
    return income_pipeline

In [37]:
# adult_pipeline_normal: nested pipeline on the adult sample with missing values.
#   1. Reads the CSV at `f_path`, treating '?' as a missing value.
#   2. Drops every row that contains a missing value.
#   3. Binarizes 'income-per-year' into `labels` (not used afterwards).
#   4. Builds (without fitting) a sklearn Pipeline: most-frequent imputation
#      plus one-hot encoding of 'education' and 'workclass', standard
#      scaling of 'age' and 'hours-per-week', followed by a
#      DecisionTreeClassifier.
#   Returns the unfitted Pipeline.
# The tracer monitors 'race', 'occupation', 'education' (categorical) and
# 'age', 'hours-per-week' (numerical).
# `classes` is passed to label_binarize by keyword because current
# scikit-learn releases declare it keyword-only.
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['race', 'occupation', 'education'], numerical_col = ['age', 'hours-per-week'])
def adult_pipeline_normal(f_path = '../pipelines/adult-sample_missing.csv'):
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], classes=['>50K', '<=50K'])

    nested_categorical_feature_transformation = Pipeline(steps=[
        ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    nested_feature_transformation = ColumnTransformer(transformers=[
        ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

    nested_pipeline = Pipeline([
      ('features', nested_feature_transformation),
      ('classifier', DecisionTreeClassifier())])

    return nested_pipeline

### Loan Dataset

In [38]:
# loan_pipeline: pipeline on the loan training data.
#   1. Reads the CSV at `f_path` and drops the 'Loan_ID' column.
#   2. Takes the int64/float64 columns as numeric features and the object
#      columns except 'Loan_Status' as categorical features.
#   3. Builds (without fitting) a sklearn Pipeline: median imputation plus
#      standard scaling for the numeric features, constant ('missing')
#      imputation plus one-hot encoding for the categorical features,
#      followed by a RandomForestClassifier.
#   Returns the unfitted Pipeline.
# The tracer monitors 'Gender' and 'Education' (categorical); no numerical
# column is monitored.
# This description is kept above the decorator on purpose: the tracer
# re-reads the function's source with inspect.getsource and processes it
# line by line, so text inside the body is part of its input.
@tracer(cat_col = ['Gender', 'Education'], numerical_col = [])
def loan_pipeline(f_path = '../pipelines/loan_train.csv'):
    data = pd.read_csv(f_path)

    # Loan_ID is not needed in training or prediction
    data = data.drop('Loan_ID', axis=1)

#     data = data.drop('Loan_Status', axis=1)

    numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = data.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns
    # do transformer on numeric & categorical data respectively
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier())])
    return pipeline

## Pipeline Output

---
Each run saves the pipeline's DAG in the current directory (default path `./pipeline.gv`, rendered as a PDF).

In [39]:
# Run the traced adult_pipeline_easy with its default CSV path.
# Monitored columns (the @tracer arguments of the pipeline):
# cat_col = ['race', 'occupation', 'education'], numerical_col = ['age', 'hours-per-week']
pipeline = adult_pipeline_easy()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected raw_data = pd.read_csv(f_path, na_values='?')
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-8.0,0.0,0.0,-0.7413,-19.0
hours-per-week,-8.0,0.0,0.0,-1.4826,0.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
race,0.0,0.0,"{'White': -6, 'Black': -1, 'Amer-Indian-Eskimo': -1, 'Asian-Pac-Islander': 0, 'Other': 0}","{'White': 0.007, 'Black': -0.0013, 'Amer-Indian-Eskimo': -0.0074, 'Asian-Pac-Islander': 0.0009, 'Other': 0.0009}"
occupation,-6.0,0.0,"{'Exec-managerial': 0, 'Adm-clerical': 0, 'Craft-repair': -1, 'Sales': 0, 'Prof-specialty': -1, 'Other-service': 0, 'Transport-moving': 0, 'Machine-op-inspct': 0, 'Farming-fishing': 0, 'Protective-serv': 0, 'Handlers-cleaners': 0, 'Tech-support': 0}","{'Exec-managerial': 0.0035, 'Adm-clerical': 0.003, 'Craft-repair': -0.0079, 'Sales': 0.0025, 'Prof-specialty': -0.0083, 'Other-service': 0.0021, 'Transport-moving': 0.0019, 'Machine-op-inspct': 0.0014, 'Farming-fishing': 0.0007, 'Protective-serv': 0.0005, 'Handlers-cleaners': 0.0005, 'Tech-support': 0.0002}"
education,0.0,0.0,"{'HS-grad': -1, 'Bachelors': 0, 'Some-college': -4, 'Masters': -1, '11th': -2, '7th-8th': 0, 'Assoc-voc': 0, '10th': 0, 'Prof-school': 0, 'Assoc-acdm': 0, '12th': 0, '5th-6th': 0}","{'HS-grad': 0.0152, 'Bachelors': 0.0191, 'Some-college': -0.0235, 'Masters': -0.0057, '11th': -0.0157, '7th-8th': 0.0026, 'Assoc-voc': 0.0026, '10th': 0.0017, 'Prof-school': 0.0009, 'Assoc-acdm': 0.0009, '12th': 0.0009, '5th-6th': 0.0009}"


**********
-------------------------------------------------------
Inpected data = raw_data.dropna()
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations OneHotEncoder on education
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,education
missing_count,0
num_class,-10
class_count,"{0.0: 90, 1.0: 2}"
class_percent,"{0.0: 0.9783, 1.0: 0.0217}"


**********
-------------------------------------------------------
Operations StandardScaler on age
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,age
count,0.0
missing_count,0.0
median,-36.1059
mad,-12.8706
range,-48.4315


**********

-------------------------------------------------------
Operations StandardScaler on hours-per-week
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,hours-per-week
count,0.0
missing_count,0.0
median,-40.0814
mad,0.0
range,-63.7616


**********



In [28]:
# Run the traced adult_pipeline_normal with its default CSV path.
# Monitored columns (the @tracer arguments of the pipeline):
# cat_col = ['race', 'occupation', 'education'], numerical_col = ['age', 'hours-per-week']
pipeline = adult_pipeline_normal()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected raw_data = pd.read_csv(f_path, na_values='?')
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-14.0,0.0,0.0,-0.7413,-23.0
hours-per-week,-14.0,0.0,0.0,0.0,0.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
race,-4.0,0.0,"{'White': -6, 'Black': -2, 'Amer-Indian-Eskimo': -2, 'Other': 0, 'Asian-Pac-Islander': 0}","{'White': 0.0271, 'Black': -0.0111, 'Amer-Indian-Eskimo': -0.0184, 'Other': 0.0012, 'Asian-Pac-Islander': 0.0012}"
occupation,-8.0,0.0,"{'Exec-managerial': 0, 'Adm-clerical': 0, 'Craft-repair': -1, 'Sales': -1, 'Other-service': 0, 'Prof-specialty': -2, 'Transport-moving': -1, 'Machine-op-inspct': 0, 'Farming-fishing': 0, 'Handlers-cleaners': 0, 'Tech-support': 0, 'Protective-serv': -1}","{'Exec-managerial': 0.0106, 'Adm-clerical': 0.0099, 'Craft-repair': -0.0018, 'Sales': -0.0033, 'Other-service': 0.0068, 'Prof-specialty': -0.0157, 'Transport-moving': -0.0056, 'Machine-op-inspct': 0.0046, 'Farming-fishing': 0.0023, 'Handlers-cleaners': 0.0015, 'Tech-support': 0.0008, 'Protective-serv': -0.0101}"
education,-2.0,0.0,"{'HS-grad': -3, 'Bachelors': -1, 'Some-college': -4, '11th': -2, 'Masters': -2, '7th-8th': 0, '10th': 0, 'Assoc-voc': 0, 'Prof-school': 0, 'Assoc-acdm': 0, '12th': 0, '5th-6th': 0}","{'HS-grad': 0.0078, 'Bachelors': 0.0183, 'Some-college': -0.0138, '11th': -0.0133, 'Masters': -0.0147, '7th-8th': 0.0043, '10th': 0.0028, 'Assoc-voc': 0.0028, 'Prof-school': 0.0014, 'Assoc-acdm': 0.0014, '12th': 0.0014, '5th-6th': 0.0014}"


**********
-------------------------------------------------------
Inpected data = raw_data.dropna()
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations SimpleImputer on education
-------------------------------------------------------

-------------------------------------------------------
Operations OneHotEncoder on education
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,education
missing_count,0
num_class,-10
class_count,"{0.0: 84, 1.0: 2}"
class_percent,"{0.0: 0.9767, 1.0: 0.0233}"


**********
-------------------------------------------------------
Operations StandardScaler on age
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,age
count,0.0
missing_count,0.0
median,-36.0972
mad,-12.832
range,-44.6418


**********

-------------------------------------------------------
Operations StandardScaler on hours-per-week
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,hours-per-week
count,0.0
missing_count,0.0
median,-40.1126
mad,-1.3509
range,-63.7813


**********



In [40]:
# Run the traced loan_pipeline with its default CSV path.
# Monitored columns (the @tracer arguments of the pipeline):
# cat_col = ['Gender', 'Education'], numerical_col = []
pipeline = loan_pipeline()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected data = pd.read_csv(f_path)
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data.drop('Loan_ID', axis=1)
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations SimpleImputer on Gender
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,Gender
missing_count,-13
num_class,1
class_count,"{'Male': 0, 'Female': 0, 'missing': 13}"
class_percent,"{'Male': -0.0172, 'Female': -0.0039, 'missing': 0.0212}"


**********
-------------------------------------------------------
Operations OneHotEncoder on Gender
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,Gender
missing_count,0
num_class,-1
class_count,"{0.0: 502, 1.0: 112}"
class_percent,"{0.0: 0.8176, 1.0: 0.1824}"


**********
-------------------------------------------------------
Operations SimpleImputer on Education
-------------------------------------------------------

-------------------------------------------------------
Operations OneHotEncoder on Education
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,Education
missing_count,0
num_class,0
class_count,"{1.0: 480, 0.0: 134}"
class_percent,"{1.0: 0.7818, 0.0: 0.2182}"


**********


In [27]:
# compas_pipeline
# cat_col = ['race'], numerical_col = ['age']
pipeline = compas_pipeline()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected df1 = pd.read_csv(f1_path)
-------------------------------------------------------

-------------------------------------------------------
Inpected df2 = pd.read_csv(f2_path)
-------------------------------------------------------

-------------------------------------------------------
Inpected df3 = pd.read_csv(f3_path)
-------------------------------------------------------

-------------------------------------------------------
Inpected df1.drop(columns=['Unnamed: 0','age_cat'],inplace=True)
-------------------------------------------------------

-------------------------------------------------------
Inpected df2.drop(columns=['Unnamed: 0'],inplace=True)
-------------------------------------------------------

-------------------------------------------------------
Inpected df3.drop(columns=['Unnamed: 0'],inplace=True)
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-307.0,0.0,0.0,0.0,0.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
race,0.0,0.0,"{'African-American': -159, 'Caucasian': -76, 'Hispanic': -53, 'Other': -17, 'Asian': 0, 'Native American': -2}","{'African-American': -0.0002, 'Caucasian': 0.0041, 'Hispanic': -0.0037, 'Other': -0.0001, 'Asian': 0.0002, 'Native American': -0.0002}"


**********
-------------------------------------------------------
Inpected data = data.dropna(subset=['id', 'name','is_recid','days_b_screening_arrest','c_charge_degree','c_jail_out','c_jail_in'])
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data[['sex', 'dob','age','c_charge_degree', 'age_cat', 'race','score_text','priors_count','days_b_screening_arrest','decile_score','is_recid','two_year_recid','c_jail_in','c_jail_out']]
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-284.0,0.0,0.0,0.0,0.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
race,0.0,0.0,"{'African-American': -158, 'Caucasian': -87, 'Hispanic': -27, 'Other': -9, 'Asian': 0, 'Native American': -3}","{'African-American': -0.0019, 'Caucasian': 0.0016, 'Hispanic': -0.0005, 'Other': 0.0009, 'Asian': 0.0002, 'Native American': -0.0004}"


**********
-------------------------------------------------------
Inpected data = data.loc[(data['days_b_screening_arrest'] <= 30)]
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-451.0,0.0,0.0,0.0,0.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
race,0.0,0.0,"{'African-American': -204, 'Caucasian': -188, 'Hispanic': -48, 'Other': -8, 'Asian': -1, 'Native American': -2}","{'African-American': 0.0042, 'Caucasian': -0.0052, 'Hispanic': -0.0016, 'Other': 0.0026, 'Asian': 0.0002, 'Native American': -0.0002}"


**********
-------------------------------------------------------
Inpected data = data.loc[(data['days_b_screening_arrest'] >= -30)]
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data.loc[(data['is_recid'] != -1)]
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data.loc[(data['c_charge_degree'] != "O")]
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data.loc[(data['score_text'] != 'N/A')]
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations SimpleImputer on race
-------------------------------------------------------

-------------------------------------------------------
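Operations OneHotEncoder on race
-------------------------------------------------------

**********
Changes in categorical features!
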

Unnamed: 0,race
missing_count,0
num_class,-4
class_count,"{1.0: 3175, 0.0: 2997}"
class_percent,"{1.0: 0.5144, 0.0: 0.4856}"


**********
-------------------------------------------------------
Operations SimpleImputer on age
-------------------------------------------------------

-------------------------------------------------------
Operations KBinsDiscretizer on age
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,age
count,0.0
missing_count,0.0
median,-31.0
mad,-10.3782
range,-75.0


**********



In [41]:
# german_pipeline
# cat_col = ['personal_status_and_sex'], numerical_col = ['age']
pipeline = german_pipeline_easy()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected data = pd.read_csv(f_path)
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data[['duration_in_month', 'credit_his',  'credit_amt', 'preset_emp', 'personal_status_and_sex','guarantors', 'present_residence', 'property', 'age','label']]
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-754.0,0.0,0.5,0.7413,-1.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
personal_status_and_sex,0.0,0.0,"{'A93': -384, 'A92': -251, 'A91': -37, 'A94': -82}","{'A93': 0.1187, 'A92': -0.0702, 'A91': 0.0028, 'A94': -0.0513}"


**********
-------------------------------------------------------
Inpected data = data.loc[(data.credit_amt>=4000)]
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations SimpleImputer on personal_status_and_sex
-------------------------------------------------------

-------------------------------------------------------
Operations OneHotEncoder on personal_status_and_sex
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,personal_status_and_sex
missing_count,0
num_class,-2
class_count,"{0.0: 233, 1.0: 13}"
class_percent,"{0.0: 0.9472, 1.0: 0.0528}"


**********
-------------------------------------------------------
Operations StandardScaler on age
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,age
count,0.0
missing_count,0.0
median,-33.7344
mad,-10.1331
range,-50.1208


**********



In [42]:
pipeline = german_pipeline_normal()


####################### Start Pandas Opeation #######################

-------------------------------------------------------
Inpected dataSplit1 = pd.read_csv(f_path_1, index_col = 0)
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-inf,-inf,-inf,-inf,-inf


**********

-------------------------------------------------------
Inpected dataSplit2 = pd.read_csv(f_path_2, index_col = 0)
-------------------------------------------------------

-------------------------------------------------------
Inpected data = dataSplit1.merge(dataSplit2, on='identifier')
-------------------------------------------------------

-------------------------------------------------------
Inpected data.drop(data.columns[0], axis=1, inplace = True)
-------------------------------------------------------

-------------------------------------------------------
Inpected data = data[['duration_in_month', 'credit_his',  'credit_amt', 'preset_emp', 'personal_status_and_sex', 'guarantors', 'present_residence','property', 'age','label']]
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,count,missing_count,median,mad,range
age,-754.0,0.0,0.5,0.7413,-1.0


**********

**********
Changes in categorical features!


Unnamed: 0,missing_count,num_class,class_count,class_percent
personal_status_and_sex,0.0,0.0,"{'A93': -384, 'A92': -251, 'A91': -37, 'A94': -82}","{'A93': 0.1187, 'A92': -0.0702, 'A91': 0.0028, 'A94': -0.0513}"


**********
-------------------------------------------------------
Inpected data = data.loc[(data.credit_amt>=4000)]
-------------------------------------------------------


####################### Start Sklearn Pipeline #######################

-------------------------------------------------------
Operations SimpleImputer on personal_status_and_sex
-------------------------------------------------------

-------------------------------------------------------
Operations OneHotEncoder on personal_status_and_sex
-------------------------------------------------------

**********
Changes in categorical features!


Unnamed: 0,personal_status_and_sex
missing_count,0
num_class,-2
class_count,"{0.0: 233, 1.0: 13}"
class_percent,"{0.0: 0.9472, 1.0: 0.0528}"


**********
-------------------------------------------------------
Operations StandardScaler on age
-------------------------------------------------------

**********
Changes in numerical features!


Unnamed: 0,age
count,0.0
missing_count,0.0
median,-33.7344
mad,-10.1331
range,-50.1208


**********

