In [1]:
import trace 
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import inspect
from collections import defaultdict

In [2]:
class DataFlowVertex:
    def __init__(self, parent_vertices, name, operation):
        self.parent_vertices = parent_vertices
        self.name = name
        self.operation = operation

    def __repr__(self):
        return "{}, (name={}, op={})".format(self.parent_vertices, self.name, self.operation)


def pipeline_to_dataflow_graph(pipeline):
    graph = []
    layer_graph = []
    def helper(pipeline, name_prefix=[], parent_vertices=[]):
        if 'ColumnTransformer' in str(type(pipeline)):
            for step in pipeline.transformers:
                for column_name in step[2]:
                    helper(step[1], name_prefix+[column_name], parent_vertices)
        elif 'Pipeline' in str(type(pipeline)):
            layer_graph.clear()
            for i, key in enumerate(pipeline.named_steps.keys()):
                helper(pipeline.named_steps[key], name_prefix, parent_vertices+layer_graph)

        else :
            graph.append(DataFlowVertex(parent_vertices, ''.join(name_prefix), pipeline))
            layer_graph.append(DataFlowVertex(parent_vertices, ''.join(name_prefix), pipeline))

    helper(pipeline)
    return graph

# DAG Part

In [3]:
# import sys
# from functools import wraps

# class TraceCalls(object):
#     """ Use as a decorator on functions that should be traced. Several
#         functions can be decorated - they will all be indented according
#         to their call depth.
#     """
#     def __init__(self, stream=sys.stdout, indent_step=2, show_ret=False):
#         self.stream = stream
#         self.indent_step = indent_step
#         self.show_ret = show_ret

#         # This is a class attribute since we want to share the indentation
#         # level between different traced functions, in case they call
#         # each other.
#         TraceCalls.cur_indent = 0

#     def __call__(self, fn):
#         @wraps(fn)
#         def wrapper(*args, **kwargs):
#             indent = ' ' * TraceCalls.cur_indent
#             argstr = ', '.join(
#                 [repr(a) for a in args] +
#                 ["%s=%s" % (a, repr(b)) for a, b in kwargs.items()])
#             self.stream.write('%s%s(%s)\n' % (indent, fn.__name__, argstr))

#             TraceCalls.cur_indent += self.indent_step
#             ret = fn(*args, **kwargs)
#             TraceCalls.cur_indent -= self.indent_step

#             if self.show_ret:
#                 self.stream.write('%s--> %s\n' % (indent, ret))
#             return ret
#         return wrapper

In [4]:
# @TraceCalls()
def pd_oprations(ll):
    a = pd.DataFrame(ll,columns = ['a'])
    a = a.apply(lambda x: x)
    return a

pd_oprations([[1],[2],[3]])
str(pd_oprations)

'<function pd_oprations at 0x11adbd400>'

In [5]:
def pipeline_test_3(f_path = 'adult-sample.csv', a = 0):
   
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])

    feature_transformation = sklearn.compose.ColumnTransformer(transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

        
    income_pipeline = Pipeline([
      ('features', feature_transformation),
      ('classifier', DecisionTreeClassifier())])
    
    return income_pipeline

In [6]:
def pipeline_test_4(f_path = 'adult-sample_missing.csv', a = 0):
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])

    nested_categorical_feature_transformation = Pipeline(steps=[
        ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    nested_feature_transformation = ColumnTransformer(transformers=[
        ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

    nested_pipeline = Pipeline([
      ('features', nested_feature_transformation),
      ('classifier', DecisionTreeClassifier())])

    return nested_pipeline

## Found module dis

In [7]:
# import dis
# saved = dis.dis(pipeline_test)

# Monitor Part

## Use module inspect to convert function codes into Strings

In [8]:
raw_func = inspect.getsource(pipeline_test_3)

In [9]:
raw_func

"def pipeline_test_3(f_path = 'adult-sample.csv', a = 0):\n   \n    raw_data = pd.read_csv(f_path, na_values='?')\n    data = raw_data.dropna()\n\n    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])\n\n    feature_transformation = sklearn.compose.ColumnTransformer(transformers=[\n        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),\n        ('numeric', StandardScaler(), ['age', 'hours-per-week'])\n    ])\n\n        \n    income_pipeline = Pipeline([\n      ('features', feature_transformation),\n      ('classifier', DecisionTreeClassifier())])\n    \n    return income_pipeline\n"

In [10]:
raw_func_list = [item[4:].rstrip() for item in raw_func.split('\n')]

In [11]:
raw_func_list

["pipeline_test_3(f_path = 'adult-sample.csv', a = 0):",
 '',
 "raw_data = pd.read_csv(f_path, na_values='?')",
 'data = raw_data.dropna()',
 '',
 "labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])",
 '',
 'feature_transformation = sklearn.compose.ColumnTransformer(transformers=[',
 "    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),",
 "    ('numeric', StandardScaler(), ['age', 'hours-per-week'])",
 '])',
 '',
 '',
 'income_pipeline = Pipeline([',
 "  ('features', feature_transformation),",
 "  ('classifier', DecisionTreeClassifier())])",
 '',
 'return income_pipeline',
 '']

### For_if_statement Trail

In [12]:
# def bracket_balance(func_list):
#     res = []
#     # func_args = raw_func_list[0].split('(')[1].rstrip('):')
#     stack_for_parent = []
#     logs_of_parent = []
#     for item in func_list:
#         logs_of_parent.append(item)
#     for char in item:
#         if char == '(':
#             stack_for_parent.append('(')
#         if char == '[':
#             stack_for_parent.append('[')
#         if char == ')' and stack_for_parent[-1] == '(':
#             stack_for_parent.pop(-1)
#         if char == ']' and stack_for_parent[-1] == '[':
#             stack_for_parent.pop(-1)
#     if not stack_for_parent:
#         res.append(' '.join(logs_of_parent))
#         logs_of_parent.clear()
#     return res

# logs_of_loop_if = []
# for item in raw_func_list:
#     if item[-1] == ':':
#         logs_of_loop_if.append(item)
#     if not logs_of_loop_if and item.startwith('    '):
#         logs_of_loop_if.append(item)
#     else:
#         item.strip()
#     logs_of_parent.append(item)
#     for char in item:
#         if char == '(':
#             stack_for_parent.append('(')
#         if char == '[':
#             stack_for_parent.append('[')
#         if char == ')' and stack_for_parent[-1] == '(':
#             stack_for_parent.pop(-1)
#         if char == ']' and stack_for_parent[-1] == '[':
#             stack_for_parent.pop(-1)
#     if not stack_for_parent:
#         res.append(' '.join(logs_of_parent))
#         logs_of_parent.clear()

## Raw Function to Convert Function to Executable Lines

In [13]:
def func_aggregation(func_str):
    '''
    This function is used for line execution with exec()
    
    args:
        function strings after inspect
    returns:
        list of functionable strings for exec()
    '''
    
    res = [] # executables for return
    stack_for_parent = [] # stack storing brackets for line integration
    logs_of_parent = [] # logs of lines for concat
#     convert function codes to list of strings
    func_list = [item.strip() for item in func_str.split('\n')]
#     function args
    func_args = [item.strip() for item in func_list[0].split('(')[1].rstrip('):').split(',')]
    for item in func_list[1:]:
        if not item:
            continue
        logs_of_parent.append(item)
        for char in item:
            if char == '(':
                stack_for_parent.append('(')
            if char == '[':
                stack_for_parent.append('[')
            if char == ')' and stack_for_parent[-1] == '(':
                stack_for_parent.pop(-1)
            if char == ']' and stack_for_parent[-1] == '[':
                stack_for_parent.pop(-1)
        if not stack_for_parent:
            res.append(''.join(logs_of_parent))
            logs_of_parent.clear()
    return func_args, res[:-1], [item.strip() for item in res[-1].replace('return ', '').split(',')]

In [14]:
input_args, executable_list, output = func_aggregation(raw_func)

In [15]:
input_args

["f_path = 'adult-sample.csv'", 'a = 0']

In [16]:
executable_list

["raw_data = pd.read_csv(f_path, na_values='?')",
 'data = raw_data.dropna()',
 "labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])",
 "feature_transformation = sklearn.compose.ColumnTransformer(transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),('numeric', StandardScaler(), ['age', 'hours-per-week'])])",
 "income_pipeline = Pipeline([('features', feature_transformation),('classifier', DecisionTreeClassifier())])"]

In [17]:
output

['income_pipeline']

In [18]:
for line in input_args:
    exec(line)

In [19]:
target_col = ['race', 'occupation']

In [20]:
exec_it = iter(executable_list)

In [21]:
import pprint

for _ in range(4):
    cur_line = next(exec_it)
    exec(cur_line)
    try: 
        if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":
            print('yes')
            target_df = cur_line.split('=')[0].strip()
            exec(f"pprint.pprint({target_df}[{target_col}].describe())")
        else:
            exec(f"pprint.pprint({target_df}[{target_col}].describe())")
    except:
        exec(f"pprint.pprint({target_df}[{target_col}].describe())")

yes
         race       occupation
count     100               94
unique      5               12
top     White  Exec-managerial
freq       83               15
yes
         race       occupation
count      92               92
unique      5               12
top     White  Exec-managerial
freq       77               15
         race       occupation
count      92               92
unique      5               12
top     White  Exec-managerial
freq       77               15
         race       occupation
count      92               92
unique      5               12
top     White  Exec-managerial
freq       77               15


### test 3 debugging

In [22]:
cur_line

"feature_transformation = sklearn.compose.ColumnTransformer(transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),('numeric', StandardScaler(), ['age', 'hours-per-week'])])"

In [23]:
tar = cur_line.split('=')[0].strip()
str(eval(f"type({tar})"))=="<class 'pandas.core.frame.DataFrame'>"

False

In [24]:
nested_graph = pipeline_to_dataflow_graph(income_pipeline)

NameError: name 'income_pipeline' is not defined

In [25]:
nested_graph

NameError: name 'nested_graph' is not defined

## test 4 starts

## Try add more eval metrices

In [7]:
from scipy import stats

In [12]:
# Current Version
def describe_ver(pipeline_to_test, cat_col = ['race', 'occupation', 'education'], numerical_col = ['age', 'hours-per-week']):
    
    def func_aggregation(func_str):
        '''
        This function is used for line execution with exec()

        args:
            function strings after inspect
        returns:
            list of functionable strings for exec()
        '''

        res = [] # executables for return
        stack_for_parent = [] # stack storing brackets for line integration
        logs_of_parent = [] # logs of lines for concat
    #     convert function codes to list of strings
        func_list = [item.strip() for item in func_str.split('\n')]
    #     function args
        func_args = [item.strip() for item in func_list[0].split('(')[1].rstrip('):').split(',')]
        for item in func_list[1:]:
            if not item:
                continue
            logs_of_parent.append(item)
            for char in item:
                if char == '(':
                    stack_for_parent.append('(')
                if char == '[':
                    stack_for_parent.append('[')
                if char == ')' and stack_for_parent[-1] == '(':
                    stack_for_parent.pop(-1)
                if char == ']' and stack_for_parent[-1] == '[':
                    stack_for_parent.pop(-1)
            if not stack_for_parent:
                res.append(''.join(logs_of_parent))
                logs_of_parent.clear()
        return func_args, res[:-1], [item.strip() for item in res[-1].replace('return ', '').split(',')]
    
    
    raw_func = inspect.getsource(pipeline_to_test)


    input_args, executable_list, outputs = func_aggregation(raw_func)
    
    for line in input_args:
        exec(line)
    
    print()
    print('####################### Start Pandas Opeation #######################')
    print()
    
    ######################################
    # Initialization
    ######################################
    prev = {}
    
    numerical_metric_list = ['count', 'missing_count', 'median', 'mad', 'range']
    numerical_df = pd.DataFrame(np.inf, index = numerical_col, columns = numerical_metric_list)
    
    cat_metric_list = ['missing_count', 'num_class', 'class_count', 'class_percent']
    cat_df = pd.DataFrame(np.inf, index = cat_col, columns = cat_metric_list)
    
    ######################################
    # Supporting functions
    ######################################    
    def handle_dict(dict_1, dict_2):
        '''
        Calculate differences between two dictionaries
        eg: input: d1 = {'a': 1, 'b': 2, 'c': 3, 'e': 2, 'f':4}
                   d2 = {'a': 10, 'b': 9, 'c': 8, 'd': 7, 'e': 3}
            output: z = {'a': -9, 'b': -7, 'c': -5, 'e': -1, 'f': 4, 'd': -7}
        '''
        
        dict_1_re = {key: dict_1.get(key,0) - dict_2.get(key, 0) for key in dict_1.keys()}
        dict_2_re = {key: dict_1.get(key,0) - dict_2.get(key, 0) for key in dict_1.keys()}
        return {**dict_1_re, **dict_2_re}
    
    def get_categorical_dif(cat_df, cat_metric_list, prev):
        '''
        Calculate differences for categorical dataframe comparison
        Need special handling for 'class_count' and 'class_percent' 
            since they are stored as dict in the dataframe
        '''
        
        cat_dif = pd.DataFrame()
        for i in cat_metric_list:
            if i != 'class_count' and i != 'class_percent':
                # if the metric is not defined as a dictionary
                dif = cat_df[i] - prev[i]
                cat_dif[i] = dif
            else:
                for idx, col in enumerate(cat_df.index):
                    dif = handle_dict(cat_df[i][idx], prev[i][idx])
                    cat_dif.loc[col, i] = [dif]
        return cat_dif
                    
    def cal_numerical(target_df_1, numeric_feature, numerical_df):
        '''
        Calculate metrices for numerical features
            including counts, missing values, Median and MAD, range/scaling
        '''

        # get counts of non NA values
        count_log = target_df_1[numeric_feature].count()
        numerical_df.loc[numeric_feature, 'count'] = count_log

        # get missing value counts
        missing_count_log = target_df_1[numeric_feature].isna().sum()
        numerical_df.loc[numeric_feature, 'missing_count'] = missing_count_log

        # distribution
        # Median and MAD
        median_log = target_df_1[numeric_feature].median()
        numerical_df.loc[numeric_feature, 'median'] = median_log
        if missing_count_log == 0:
            mad_log = stats.median_absolute_deviation(target_df_1[numeric_feature])
            numerical_df.loc[numeric_feature, 'mad'] = mad_log
        else:
            numerical_df.loc[numeric_feature, 'mad'] = 0

        # range/ scaling
        range_log = target_df_1[numeric_feature].max() - target_df_1[numeric_feature].min()
        numerical_df.loc[numeric_feature, 'range'] = range_log

        return numerical_df  
    
    def cal_categorical(target_df_1, cat_feature, cat_df):
        '''
        Calculate metrices for categorical features
            including missing values, number of classes, counts for each group, percentage for each group
        '''
        
        # get missing value counts
        missing_count_log = target_df_1[cat_feature].isna().sum()
        cat_df.loc[cat_feature, 'missing_count'] = missing_count_log

        # get number of classes
        num_class_log = len(target_df_1[cat_feature].value_counts().keys())
        cat_df.loc[cat_feature, 'num_class'] = num_class_log

        # get counts for each group
        class_count_log = target_df_1[cat_feature].value_counts().to_dict()
        cat_df.loc[cat_feature, 'class_count'] = [class_count_log]

        # get percentage each group covers
        class_percent_log = round(target_df_1[cat_feature].value_counts() / \
        target_df_1[cat_feature].value_counts().sum(), 4).to_dict()
        cat_df.loc[cat_feature, 'class_percent'] = [class_percent_log]

        return cat_df 
    
    ######################################
    # Execution
    ######################################     
    for cur_line in executable_list:
        print_bool = False
        exec(cur_line)
        try: 
            if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":

                target_df = cur_line.split('=')[0].strip()
                
                ######################################################################################
                # numerical features & metrices
                # counts, missing values, Median and MAD, range/scaling
                ######################################################################################
                for numeric_feature in numerical_col:

                    numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)

                ######################################################################################
                # categorical features & metrices
                # missing values, number of classes, counts for each group, percentage for each group
                ######################################################################################
                for cat_feature in cat_col:

                    cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)
                    
                # KL/ Wilcox not suitable
                # dif_2 = stats.wilcoxon(eval_mat['dist_shift_log'], prev['dist_shift_log'], zero_method='wilcox')[0]

                # Use K-S test
                # stats.ks_2samp(vector,vector1)

                ######################################################################################
                # Comparison occurs here! 
                ######################################################################################
                if len(prev) != 0:
                    numerical_dif = numerical_df - prev['numerical']
                    if (numerical_dif.values != 0).any():
                        # print(f'Metrics: {mat} changed in {col} with value {dif}')
                        print('*'*10)
                        print('Changes in numerical features!')
                        print(numerical_dif)
                        print('*'*10)
                        print()
                    
                ################################## 
                # ⬆️ numerical
                # ⬇️ categorical                
                ##################################
                    
                    cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
                    if (cat_dif.values != 0).any():
                        print('*'*10)
                        print('Changes in categorical features!')
                        print(cat_dif)
                        print('*'*10)
              
                print_bool = True
                
                if print_bool:
#                     print(numerical_df)
#                     print(cat_df)
                    print(f'Inpected {cur_line}')
                    print('-------------------------------------------------------')
                    print() 

                # save the output for next round comparison
                prev['numerical'] = numerical_df.copy()
                prev['categorical'] = cat_df.copy()

            elif str(eval(f"type({cur_line.split('=')[0].strip()})")).startswith("<class 'sklearn"):
                pass
            else:
                pass

        except:
            ######################################################################################
            # numerical features & metrices
            # counts, missing values, Median and MAD, range/scaling
            ######################################################################################
            for numeric_feature in numerical_col:

                numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)

            ######################################################################################
            # categorical features & metrices
            # missing values, number of classes, counts for each group, percentage for each group
            ######################################################################################
            for cat_feature in cat_col:

                cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)

            ######################################################################################
            # Comparison occurs here! 
            ######################################################################################
            if len(prev) != 0:
                numerical_dif = numerical_df - prev['numerical']
                if (numerical_dif.values != 0).any():
                    # print(f'Metrics: {mat} changed in {col} with value {dif}')
                    print('*'*10)
                    print('Changes in numerical features!')
                    print(numerical_dif)
                    print('*'*10)
                    print()

            ################################## 
            # ⬆️ numerical
            # ⬇️ categorical                
            ##################################

                cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
                if (cat_dif.values != 0).any():
                    print('*'*10)
                    print('Changes in categorical features!')
                    print(cat_dif)
                    print('*'*10)

            print_bool = True

            if print_bool:
#                     print(numerical_df)
#                     print(cat_df)
                print(f'Inpected {cur_line}')
                print('-------------------------------------------------------')
                print() 

            # save the output for next round comparison
            prev['numerical'] = numerical_df.copy()
            prev['categorical'] = cat_df.copy()            
            

    nested_graph = pipeline_to_dataflow_graph(eval(f'{outputs[0]}'))

    print()
    print('####################### Start Sklearn Pipeline #######################')
    print()
        
    for item in nested_graph:
        ######################################################################################
        # numerical features & metrices
        # counts, missing values, Median and MAD, range/scaling
        ######################################################################################
        if item.name in numerical_col: 
            numeric_feature = item.name
            
            eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
            print(f"Operations {str(item.operation).split('(')[0]} on {item.name}")
            
            ##############################
            # Metrices Calculation
            ##############################
            numerical_df = cal_numerical(eval(target_df), numeric_feature, numerical_df)
            
            ##############################
            # Comparison
            ##############################
            numerical_dif = numerical_df - prev['numerical']
            
            if (numerical_dif.loc[numeric_feature,:].values != 0).any():
                # print(f'Metrics: {mat} changed in {col} with value {dif}')
                print('*'*10)
                print('Changes in numerical features!')
                print(numerical_dif.loc[numeric_feature,:])
                print('*'*10)
                print()
                
        ######################################################################################
        # categorical features & metrices
        # missing values, number of classes, counts for each group, percentage for each group
        ######################################################################################               
        if item.name in cat_col:
            cat_feature = item.name
            ##############################
            try:
                eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1)).toarray()
            except:
                eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
            print(f"Operations {str(item.operation).split('(')[0]} on {item.name}")
            
            ##############################
            # Metrices Calculation
            ##############################            
            cat_df = cal_categorical(eval(target_df), cat_feature, cat_df)
            
            ##############################
            # Comparison
            ##############################            
            cat_dif = get_categorical_dif(cat_df, cat_metric_list, prev['categorical'])
            if (cat_dif.loc[cat_feature,:].values != 0).any():
                print('*'*10)
                print('Changes in categorical features!')
                print(cat_dif.loc[cat_feature,:])
                print('*'*10)
                
        prev['numerical'] = numerical_df.copy()
        prev['categorical'] = cat_df.copy()     

In [13]:
raw_func = inspect.getsource(pipeline_test_4)

In [14]:
raw_func

"def pipeline_test_4(f_path = 'adult-sample_missing.csv', a = 0):\n    raw_data = pd.read_csv(f_path, na_values='?')\n    data = raw_data.dropna()\n\n    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])\n\n    nested_categorical_feature_transformation = Pipeline(steps=[\n        ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n        ('encode', OneHotEncoder(handle_unknown='ignore'))\n    ])\n\n    nested_feature_transformation = ColumnTransformer(transformers=[\n        ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),\n        ('numeric', StandardScaler(), ['age', 'hours-per-week'])\n    ])\n\n    nested_pipeline = Pipeline([\n      ('features', nested_feature_transformation),\n      ('classifier', DecisionTreeClassifier())])\n\n    return nested_pipeline\n"

In [15]:
describe_ver(pipeline_test_4)


####################### Start Pandas Opeation #######################

Inpected raw_data = pd.read_csv(f_path, na_values='?')
-------------------------------------------------------

**********
Changes in numerical features!
                count  missing_count  median     mad  range
age             -13.0           -3.0     0.0  14.826  -23.0
hours-per-week  -16.0            0.0     0.0   0.000    0.0
**********

**********
Changes in categorical features!
            missing_count  num_class  \
race                 -4.0        0.0   
occupation           -8.0        0.0   
education            -2.0        0.0   

                                                  class_count  \
race        {'White': -7, 'Black': -3, 'Amer-Indian-Eskimo...   
occupation  {'Adm-clerical': 0, 'Craft-repair': -1, 'Exec-...   
education   {'HS-grad': -3, 'Bachelors': -2, 'Some-college...   

                                                class_percent  
race        {'White': 0.035699999999999954, 'Black':

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [494]:
cat_metric_list = ['missing_count', 'num_class', 'class_count', 'class_percent']

In [538]:
cat_dif = pd.DataFrame()
for i in cat_metric_list:
    if i != 'class_count' and i != 'class_percent':
        # if the metric is not defined as a dictionary
        dif = prev[i] - now[i]
        cat_dif[i] = dif
    else:
        for idx, col in enumerate(cat_df.index):
            dif = handle_dict(prev[i][idx], now[i][idx])
            cat_dif.loc[col, i] = [dif]
        

In [540]:
prev

Unnamed: 0,missing_count,num_class,class_count,class_percent
race,4.0,5.0,"{'White': 80, 'Black': 10, 'Amer-Indian-Eskimo...","{'White': 0.8333333333333334, 'Black': 0.10416..."
occupation,8.0,12.0,"{'Exec-managerial': 14, 'Adm-clerical': 13, 'C...","{'Exec-managerial': 0.15217391304347827, 'Adm-..."
education,2.0,12.0,"{'HS-grad': 30, 'Some-college': 23, 'Bachelors...","{'HS-grad': 0.30612244897959184, 'Some-college..."


In [541]:
now

Unnamed: 0,missing_count,num_class,class_count,class_percent
race,0.0,5.0,"{'White': 73, 'Black': 7, 'Amer-Indian-Eskimo'...","{'White': 0.8690476190476191, 'Black': 0.08333..."
occupation,0.0,12.0,"{'Adm-clerical': 13, 'Craft-repair': 12, 'Exec...","{'Adm-clerical': 0.15476190476190477, 'Craft-r..."
education,0.0,12.0,"{'HS-grad': 27, 'Some-college': 19, 'Bachelors...","{'HS-grad': 0.32142857142857145, 'Some-college..."


In [539]:
cat_dif

Unnamed: 0,missing_count,num_class,class_count,class_percent
race,4.0,0.0,"{'White': 7, 'Black': 3, 'Amer-Indian-Eskimo':...","{'White': -0.0357142857142857, 'Black': 0.0208..."
occupation,8.0,0.0,"{'Exec-managerial': 2, 'Adm-clerical': 0, 'Cra...","{'Exec-managerial': 0.009316770186335421, 'Adm..."
education,2.0,0.0,"{'HS-grad': 3, 'Some-college': 4, 'Bachelors':...","{'HS-grad': -0.015306122448979609, 'Some-colle..."


In [474]:
handle_dict(prev['class_count'][0], now['class_count'][0])

{'White': 7,
 'Black': 3,
 'Amer-Indian-Eskimo': 2,
 'Other': 0,
 'Asian-Pac-Islander': 0}

In [343]:
cat_metric_list = ['missing_count', 'num_class']

In [344]:
cat_col = ['race', 'occupation', 'education']

In [345]:
cat_df = pd.DataFrame(index = cat_col, columns = cat_metric_list)

In [431]:
c = {'ddd':cat_df}

In [432]:
c['ddd']

Unnamed: 0,missing_count,num_class
race,,"[{'White': 80, 'Black': 10, 'Amer-Indian-Eskim..."
occupation,,
education,,


In [346]:
cat_df

Unnamed: 0,missing_count,num_class
race,,
occupation,,
education,,


In [298]:
x1

White                 80
Black                 10
Amer-Indian-Eskimo     4
Other                  1
Asian-Pac-Islander     1
Name: race, dtype: int64

In [34]:
# test dict - dict
d1 = {'a': 1, 'b': 2, 'c': 3, 'e': 2, 'f':4}
d2 = {'a': 10, 'b': 9, 'c': 8, 'd': 7, 'e': 3}
d3 = {key: d1.get(key,0) - d2.get(key, 0) for key in d1.keys()}
d4 = {key: d1.get(key,0) - d2.get(key, 0) for key in d2.keys()}
z = {**d3, **d4}
print('d3:', d3)
print('d4:', d4)
print('merged:', z)

d3: {'a': -9, 'b': -7, 'c': -5, 'e': -1, 'f': 4}
d4: {'a': -9, 'b': -7, 'c': -5, 'd': -7, 'e': -1}
merged: {'a': -9, 'b': -7, 'c': -5, 'e': -1, 'f': 4, 'd': -7}


In [100]:
data_test = pd.read_csv('adult-sample_missing.csv')
x1 = data_test['race'].value_counts()

In [101]:
data_test['age'].count()

97

In [102]:
data_test['age']


0     28.0
1     58.0
2      NaN
3     71.0
4     20.0
5     46.0
6      NaN
7     24.0
8     21.0
9      NaN
10    43.0
11    47.0
12    23.0
13    38.0
14    31.0
15    36.0
16    27.0
17    32.0
18    55.0
19    33.0
20    21.0
21    25.0
22    28.0
23    51.0
24    26.0
25    62.0
26    37.0
27    55.0
28    46.0
29    47.0
      ... 
70    48.0
71    29.0
72    30.0
73    31.0
74    40.0
75    32.0
76    47.0
77    19.0
78    45.0
79    49.0
80    18.0
81    33.0
82    48.0
83    30.0
84    55.0
85    47.0
86    72.0
87    27.0
88    44.0
89    23.0
90    33.0
91    43.0
92    46.0
93    90.0
94    34.0
95    32.0
96    42.0
97    18.0
98    25.0
99    28.0
Name: age, Length: 100, dtype: float64

In [404]:
(data_test['race'].value_counts() / data_test['race'].value_counts().sum())

White                 0.833333
Black                 0.104167
Amer-Indian-Eskimo    0.041667
Other                 0.010417
Asian-Pac-Islander    0.010417
Name: race, dtype: float64

In [387]:
s1 = x1.to_dict()

In [388]:
s2 = s1.copy()

In [389]:
s1.update({'wh':44})

In [390]:
s1

{'White': 80,
 'Black': 10,
 'Amer-Indian-Eskimo': 4,
 'Other': 1,
 'Asian-Pac-Islander': 1,
 'wh': 44}

In [391]:
s2

{'White': 80,
 'Black': 10,
 'Amer-Indian-Eskimo': 4,
 'Other': 1,
 'Asian-Pac-Islander': 1}

In [392]:
handle_dict(s1,s2)

{'White': 0,
 'Black': 0,
 'Amer-Indian-Eskimo': 0,
 'Other': 0,
 'Asian-Pac-Islander': 0,
 'wh': 44}

In [356]:
def handle_dict(dict_1, dict_2):
    dict_1_re = {key: dict_1.get(key,0) - dict_2.get(key, 0) for key in dict_1.keys()}
    dict_2_re = {key: dict_1.get(key,0) - dict_2.get(key, 0) for key in dict_1.keys()}
    return {**dict_1_re, **dict_2_re}

In [357]:
cat_df.loc['race','num_class'] = [x1.to_dict()]

In [358]:
handle_dict(cat_df.loc['race','num_class'][0], cat_df.loc['race','num_class'][0])

{'White': 0,
 'Black': 0,
 'Amer-Indian-Eskimo': 0,
 'Other': 0,
 'Asian-Pac-Islander': 0}

In [292]:
cat_df.loc['race','num_class'] = [x1]

In [293]:
cat_df

Unnamed: 0,missing_count,num_class
race,,"[[80, 10, 4, 1, 1]]"
occupation,,
education,,


In [291]:
cat_df.loc['race','num_class']

[{'White': 80,
  'Black': 10,
  'Amer-Indian-Eskimo': 4,
  'Other': 1,
  'Asian-Pac-Islander': 1}]

In [77]:
data_test_na = data_test.dropna()