In [26]:
import trace 
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import inspect


In [269]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

####################################################################################
def test_pipeline():
    train = pd.read_csv('loan_train.csv')
    test = pd.read_csv('loan_test.csv')

    # Loan_ID is not needed in training or prediction
    train = train.drop('Loan_ID', axis=1)

    X = train.drop('Loan_Status', axis=1)
    y = train['Loan_Status']

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


    # do transformer on numeric & categorical data respectively
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = train.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # classifier
    rf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier())])
    return rf

In [278]:
def find_sink(graph):
    sorted_graph = topo_sort(graph)
    return sorted_graph[-1]

In [282]:
def pipeline_to_dataflow_graph(pipeline, name_prefix='', parent_vertices=[]):
    graph = []
    parent_vertices_for_current_step = parent_vertices
    parent_vertices_for_next_step = []

    for step_name, component in pipeline.steps:
        component_class_name = component.__class__.__name__

        if component_class_name == 'ColumnTransformer':
            for transformer_prefix, transformer_component, columns in component.transformers:
                for column in columns:
                    name = name_prefix + step_name + '__' + transformer_prefix + "__" + column
                    transformer_component_class_name = transformer_component.__class__.__name__

                    if transformer_component_class_name == 'Pipeline':

                        vertices_to_add = pipeline_to_dataflow_graph(transformer_component,
                                                                     name + "__",
                                                                     parent_vertices_for_current_step)

                        for vertex in vertices_to_add:
                            graph.append(vertex)

                        parent_vertices_for_next_step.append(find_sink(vertices_to_add))

                    else:
                        vertex = DataFlowVertex(parent_vertices_for_current_step,
                                                name_prefix + name,
                                                transformer_component_class_name)
                        graph.append(vertex)
                        parent_vertices_for_next_step.append(vertex)

        else:
            vertex = DataFlowVertex(parent_vertices_for_current_step,
                                    name_prefix + step_name,
                                    component_class_name)
            graph.append(vertex)
            parent_vertices_for_next_step.append(vertex)

        parent_vertices_for_current_step = parent_vertices_for_next_step.copy()
        parent_vertices_for_next_step = []

    return graph

In [280]:
class DataFlowVertex:
    def __init__(self, parent_vertices, name, operation):
        self.parent_vertices = parent_vertices
        self.name = name
        self.operation = operation

    def __repr__(self):
        return "{}, (name={}, op={})".format(self.parent_vertices, self.name, self.operation)


def pipeline_to_dataflow_graph(pipeline):
    layer_graph = []
    all_layers  = []
    graph = []
    # TODO Implement translation of the pipeline into a list of DataFlowVertex objects
    def helper(pipeline, name_prefix=[], parent_vertices=[], all_layers=[], level = 0):
        if 'ColumnTransformer' in str(type(pipeline)):
            for step in pipeline.transformers:
                for column_name in step[2]:
                    helper(step[1], name_prefix+[step[0]]+[column_name], parent_vertices, all_layers, level)
        elif 'Pipeline' in str(type(pipeline)):
#             print(level)
            if layer_graph:
                all_layers+=layer_graph
            layer_graph.clear()
            for i, key in enumerate(pipeline.named_steps.keys()):
                if level == 0:
                    helper(pipeline.named_steps[key], name_prefix+[key], parent_vertices+all_layers+layer_graph, all_layers, level+1)
                else:
                    helper(pipeline.named_steps[key], name_prefix+[key], parent_vertices+layer_graph, all_layers, level+1)

        else :
#             global layer_graph 
            graph.append(DataFlowVertex(parent_vertices, '__'.join(name_prefix), str(pipeline).split('(')[0]))
            layer_graph.append(DataFlowVertex(parent_vertices, '__'.join(name_prefix), str(pipeline).split('(')[0]))
    helper(pipeline)
    return graph

In [283]:
pipeline = pipeline_test_4()

In [270]:
rf = test_pipeline()

In [285]:
pipeline_to_dataflow_graph(pipeline)

NameError: name 'topo_sort' is not defined

In [47]:
pipeline

Pipeline(memory=None,
         steps=[('features',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('impute',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='most_frequent',
                                                   

# DAG Part

In [3]:
# import sys
# from functools import wraps

# class TraceCalls(object):
#     """ Use as a decorator on functions that should be traced. Several
#         functions can be decorated - they will all be indented according
#         to their call depth.
#     """
#     def __init__(self, stream=sys.stdout, indent_step=2, show_ret=False):
#         self.stream = stream
#         self.indent_step = indent_step
#         self.show_ret = show_ret

#         # This is a class attribute since we want to share the indentation
#         # level between different traced functions, in case they call
#         # each other.
#         TraceCalls.cur_indent = 0

#     def __call__(self, fn):
#         @wraps(fn)
#         def wrapper(*args, **kwargs):
#             indent = ' ' * TraceCalls.cur_indent
#             argstr = ', '.join(
#                 [repr(a) for a in args] +
#                 ["%s=%s" % (a, repr(b)) for a, b in kwargs.items()])
#             self.stream.write('%s%s(%s)\n' % (indent, fn.__name__, argstr))

#             TraceCalls.cur_indent += self.indent_step
#             ret = fn(*args, **kwargs)
#             TraceCalls.cur_indent -= self.indent_step

#             if self.show_ret:
#                 self.stream.write('%s--> %s\n' % (indent, ret))
#             return ret
#         return wrapper

In [4]:
# @TraceCalls()
def pd_oprations(ll):
    a = pd.DataFrame(ll,columns = ['a'])
    a = a.apply(lambda x: x)
    return a

pd_oprations([[1],[2],[3]])
str(pd_oprations)

'<function pd_oprations at 0x10fba6b00>'

In [51]:
def pipeline_test_3(f_path = 'adult-sample.csv', a = 0):
   
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])

    feature_transformation = ColumnTransformer(transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

        
    income_pipeline = Pipeline([
      ('features', feature_transformation),
      ('classifier', DecisionTreeClassifier())])
    
    return income_pipeline

In [44]:
def pipeline_test_4(f_path = 'adult-sample.csv', a = 0):
    raw_data = pd.read_csv(f_path, na_values='?')
    data = raw_data.dropna()

    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])

    nested_categorical_feature_transformation = Pipeline(steps=[
        ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    nested_feature_transformation = ColumnTransformer(transformers=[
        ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),
        ('numeric', StandardScaler(), ['age', 'hours-per-week'])
    ])

    nested_pipeline = Pipeline([
      ('features', nested_feature_transformation),
      ('classifier', DecisionTreeClassifier())])

    return nested_pipeline

## Found module dis

In [52]:
import dis
saved = dis.dis(pipeline_test)

  2           0 LOAD_CONST               1 (0)
              2 LOAD_CONST               2 (('Pipeline',))
              4 IMPORT_NAME              0 (sklearn.pipeline)
              6 IMPORT_FROM              1 (Pipeline)
              8 STORE_FAST               1 (Pipeline)
             10 POP_TOP

  3          12 LOAD_CONST               1 (0)
             14 LOAD_CONST               3 (('DecisionTreeClassifier',))
             16 IMPORT_NAME              2 (sklearn.tree)
             18 IMPORT_FROM              3 (DecisionTreeClassifier)
             20 STORE_FAST               2 (DecisionTreeClassifier)
             22 POP_TOP

  4          24 LOAD_CONST               1 (0)
             26 LOAD_CONST               4 (('OneHotEncoder', 'StandardScaler', 'label_binarize'))
             28 IMPORT_NAME              4 (sklearn.preprocessing)
             30 IMPORT_FROM              5 (OneHotEncoder)
             32 STORE_FAST               3 (OneHotEncoder)
             34 IMPORT_FROM  

# Monitor Part

## Use module inspect to convert function codes into Strings

In [17]:
raw_func = inspect.getsource(pipeline_test_4)

In [18]:
raw_func

"def pipeline_test_4(f_path = 'adult-sample.csv', a = 0):\n    raw_data = pd.read_csv(f_path, na_values='?')\n    data = raw_data.dropna()\n\n    labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])\n\n    nested_categorical_feature_transformation = Pipeline(steps=[\n        ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n        ('encode', OneHotEncoder(handle_unknown='ignore'))\n    ])\n\n    nested_feature_transformation = ColumnTransformer(transformers=[\n        ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),\n        ('numeric', StandardScaler(), ['age', 'hours-per-week'])\n    ])\n\n    nested_pipeline = Pipeline([\n      ('features', nested_feature_transformation),\n      ('classifier', DecisionTreeClassifier())])\n\n    return nested_pipeline\n"

In [19]:
raw_func_list = [item[4:].rstrip() for item in raw_func.split('\n')]

In [20]:
raw_func_list

["pipeline_test_4(f_path = 'adult-sample.csv', a = 0):",
 "raw_data = pd.read_csv(f_path, na_values='?')",
 'data = raw_data.dropna()',
 '',
 "labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])",
 '',
 'nested_categorical_feature_transformation = Pipeline(steps=[',
 "    ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),",
 "    ('encode', OneHotEncoder(handle_unknown='ignore'))",
 '])',
 '',
 'nested_feature_transformation = ColumnTransformer(transformers=[',
 "    ('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),",
 "    ('numeric', StandardScaler(), ['age', 'hours-per-week'])",
 '])',
 '',
 'nested_pipeline = Pipeline([',
 "  ('features', nested_feature_transformation),",
 "  ('classifier', DecisionTreeClassifier())])",
 '',
 'return nested_pipeline',
 '']

### For_if_statement Trail

In [21]:
# def bracket_balance(func_list):
#     res = []
#     # func_args = raw_func_list[0].split('(')[1].rstrip('):')
#     stack_for_parent = []
#     logs_of_parent = []
#     for item in func_list:
#         logs_of_parent.append(item)
#     for char in item:
#         if char == '(':
#             stack_for_parent.append('(')
#         if char == '[':
#             stack_for_parent.append('[')
#         if char == ')' and stack_for_parent[-1] == '(':
#             stack_for_parent.pop(-1)
#         if char == ']' and stack_for_parent[-1] == '[':
#             stack_for_parent.pop(-1)
#     if not stack_for_parent:
#         res.append(' '.join(logs_of_parent))
#         logs_of_parent.clear()
#     return res

# logs_of_loop_if = []
# for item in raw_func_list:
#     if item[-1] == ':':
#         logs_of_loop_if.append(item)
#     if not logs_of_loop_if and item.startwith('    '):
#         logs_of_loop_if.append(item)
#     else:
#         item.strip()
#     logs_of_parent.append(item)
#     for char in item:
#         if char == '(':
#             stack_for_parent.append('(')
#         if char == '[':
#             stack_for_parent.append('[')
#         if char == ')' and stack_for_parent[-1] == '(':
#             stack_for_parent.pop(-1)
#         if char == ']' and stack_for_parent[-1] == '[':
#             stack_for_parent.pop(-1)
#     if not stack_for_parent:
#         res.append(' '.join(logs_of_parent))
#         logs_of_parent.clear()

## Raw Function to Convert Function to Executable Lines

In [30]:
def func_aggregation(func_str):
    '''
    This function is used for line execution with exec()
    
    args:
        function strings after inspect
    returns:
        list of functionable strings for exec()
    '''
    
    res = [] # executables for return
    stack_for_parent = [] # stack storing brackets for line integration
    logs_of_parent = [] # logs of lines for concat
#     convert function codes to list of strings
    func_list = [item.strip() for item in func_str.split('\n')]
#     function args
    func_args = [item.strip() for item in func_list[0].split('(')[1].rstrip('):').split(',')]
    for item in func_list[1:]:
        if not item:
            continue
        logs_of_parent.append(item)
        for char in item:
            if char == '(':
                stack_for_parent.append('(')
            if char == '[':
                stack_for_parent.append('[')
            if char == ')' and stack_for_parent[-1] == '(':
                stack_for_parent.pop(-1)
            if char == ']' and stack_for_parent[-1] == '[':
                stack_for_parent.pop(-1)
        if not stack_for_parent:
            res.append(''.join(logs_of_parent))
            logs_of_parent.clear()
    return func_args, res[:-1], [item.strip() for item in res[-1].replace('return ', '').split(',')]

In [31]:
input_args, executable_list = func_aggregation(raw_func)

ValueError: too many values to unpack (expected 2)

In [14]:
input_args

NameError: name 'input_args' is not defined

In [139]:
executable_list

["raw_data = pd.read_csv(f_path, na_values='?')",
 'data = raw_data.dropna()',
 "labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])",
 "feature_transformation = sklearn.compose.ColumnTransformer(transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),('numeric', StandardScaler(), ['age', 'hours-per-week'])])",
 "income_pipeline = Pipeline([('features', feature_transformation),('classifier', DecisionTreeClassifier())])"]

In [140]:
for line in input_args:
    exec(line)

In [153]:
target_col = ['race', 'occupation']

In [217]:
exec_it = iter(executable_list)

In [9]:
import pprint

for _ in range(4):
    cur_line = next(exec_it)
    exec(cur_line)
    try: 
        if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":
            print('yes')
            target_df = cur_line.split('=')[0].strip()
            exec(f"pprint.pprint({target_df}[{target_col}].describe())")
        else:
            exec(f"pprint.pprint({target_df}[{target_col}].describe())")
    except:
        exec(f"pprint.pprint({target_df}[{target_col}].describe())")

### test 3 debugging

In [219]:
cur_line

"feature_transformation = sklearn.compose.ColumnTransformer(transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), ['education', 'workclass']),('numeric', StandardScaler(), ['age', 'hours-per-week'])])"

In [199]:
tar = cur_line.split('=')[0].strip()
str(eval(f"type({tar})"))=="<class 'pandas.core.frame.DataFrame'>"

True

In [223]:
nested_graph = pipeline_to_dataflow_graph(income_pipeline)

In [224]:
nested_graph

[[], (name=features__categorical__education, op=OneHotEncoder),
 [], (name=features__categorical__workclass, op=OneHotEncoder),
 [], (name=features__numeric__age, op=StandardScaler),
 [], (name=features__numeric__hours-per-week, op=StandardScaler),
 [[], (name=features__categorical__education, op=OneHotEncoder), [], (name=features__categorical__workclass, op=OneHotEncoder), [], (name=features__numeric__age, op=StandardScaler), [], (name=features__numeric__hours-per-week, op=StandardScaler)], (name=classifier, op=DecisionTreeClassifier)]

## test 4 starts

In [10]:
raw_func_4 = inspect.getsource(pipeline_test_4)

input_args, executable_list, outputs = func_aggregation(raw_func_4)


In [11]:
for line in input_args:
    exec(line)

In [12]:
executable_list

["raw_data = pd.read_csv(f_path, na_values='?')",
 'data = raw_data.dropna()',
 "labels = label_binarize(data['income-per-year'], ['>50K', '<=50K'])",
 "nested_categorical_feature_transformation = Pipeline(steps=[('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),('encode', OneHotEncoder(handle_unknown='ignore'))])",
 "nested_feature_transformation = ColumnTransformer(transformers=[('categorical', nested_categorical_feature_transformation, ['education', 'workclass']),('numeric', StandardScaler(), ['age', 'hours-per-week'])])",
 "nested_pipeline = Pipeline([('features', nested_feature_transformation),('classifier', DecisionTreeClassifier())])"]

In [13]:
outputs

['nested_pipeline']

In [98]:
target_col = ['race', 'occupation', 'education']

In [135]:
prev = None
for cur_line in executable_list:
    print_bool = False
    exec(cur_line)
    try: 
        if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":
            target_df = cur_line.split('=')[0].strip()
            count_log = eval(f"{target_df}[{target_col}].count()")
            if prev is not None:
                for col in target_col:
                    dif = count_log[col] - prev[col]
                    if dif != 0:
                        print(f'Count Changed in {col} with value {dif}')
                        print_bool = True
            else:
                print_bool = True
            
            if print_bool:
                pprint.pprint(count_log)
                print(f'Inpected {cur_line}')
                print('-------------------------------------------------------')
                print()
            prev = count_log
            
        elif str(eval(f"type({cur_line.split('=')[0].strip()})")).startswith("<class 'sklearn"):
            pass
        
        else:
            count_log = eval(f"{target_df}[{target_col}].count()")
            for col in target_col:
                dif = count_log[col] - prev[col] 
                if dif != 0:
                    print(f'Count Changed in {col} with value {dif}')
                    print_bool = True
            if print_bool:
                pprint.pprint(count_log)
                print(f'Inspected {cur_line}')
                print('-------------------------------------------------------')
                print()
            prev = count_log
            
    except:
#         print(f'inspecting {cur_line}')
#         exec(f"pprint.pprint({target_df}[{target_col}].count())")
#         print()
        pass

race          100
occupation     94
education     100
dtype: int64
Inpected raw_data = pd.read_csv(f_path, na_values='?')
-------------------------------------------------------

Count Changed in race with value -8
Count Changed in occupation with value -2
Count Changed in education with value -8
race          92
occupation    92
education     92
dtype: int64
Inpected data = raw_data.dropna()
-------------------------------------------------------



In [94]:
if not 0:
    print('dsf')

dsf


In [70]:
nested_graph = pipeline_to_dataflow_graph(eval(f'{outputs[0]}'))

In [71]:
nested_graph

[[], (name=education, op=SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='most_frequent', verbose=0)),
 [[], (name=education, op=SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='most_frequent', verbose=0))], (name=education, op=OneHotEncoder(categorical_features=None, categories=None, drop=None,
               dtype=<class 'numpy.float64'>, handle_unknown='ignore',
               n_values=None, sparse=True)),
 [], (name=workclass, op=SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='most_frequent', verbose=0)),
 [[], (name=workclass, op=SimpleImputer(add_indicator=False, copy=True, fill_value=None,
               missing_values=nan, strategy='most_frequent', verbose=0))], (name=workclass, op=OneHotEncoder(categorical_features=None, categories=None, drop=None,
               dtype=<class 'numpy.float64'>, handle_u

In [72]:
for item in nested_graph:
    if item.name in target_col:
        eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
        print(eval(target_df)[item.name].describe().to_string())
        print()

count          92
unique         12
top       HS-grad
freq           29
count                                                    92
unique                                                    1
top         (0, 11)\t1.0\n  (1, 8)\t1.0\n  (2, 7)\t1.0\n...
freq                                                     92


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
def describe_ver(pipeline_to_test, target_col = ['race', 'occupation', 'education']):
    
    def func_aggregation(func_str):
        '''
        This function is used for line execution with exec()

        args:
            function strings after inspect
        returns:
            list of functionable strings for exec()
        '''

        res = [] # executables for return
        stack_for_parent = [] # stack storing brackets for line integration
        logs_of_parent = [] # logs of lines for concat
    #     convert function codes to list of strings
        func_list = [item.strip() for item in func_str.split('\n')]
    #     function args
        func_args = [item.strip() for item in func_list[0].split('(')[1].rstrip('):').split(',')]
        for item in func_list[1:]:
            if not item:
                continue
            logs_of_parent.append(item)
            for char in item:
                if char == '(':
                    stack_for_parent.append('(')
                if char == '[':
                    stack_for_parent.append('[')
                if char == ')' and stack_for_parent[-1] == '(':
                    stack_for_parent.pop(-1)
                if char == ']' and stack_for_parent[-1] == '[':
                    stack_for_parent.pop(-1)
            if not stack_for_parent:
                res.append(''.join(logs_of_parent))
                logs_of_parent.clear()
        return func_args, res[:-1], [item.strip() for item in res[-1].replace('return ', '').split(',')]
    
    
    raw_func = inspect.getsource(pipeline_to_test)

    input_args, executable_list, outputs = func_aggregation(raw_func)
    
    for line in input_args:
        exec(line)
    
    print()
    print('####################### Start Pandas Opeation #######################')
    print()
    
    prev = None
    for cur_line in executable_list:
        print_bool = False
        exec(cur_line)
        try: 
            if str(eval(f"type({cur_line.split('=')[0].strip()})")) == "<class 'pandas.core.frame.DataFrame'>":
                target_df = cur_line.split('=')[0].strip()
                count_log = eval(f"{target_df}[{target_col}].count()")
                if prev is not None:
                    for col in target_col:
                        dif = count_log[col] - prev[col]
                        if dif != 0:
                            print(f'Count Changed in {col} with value {dif}')
                            print_bool = True
                else:
                    print_bool = True

                if print_bool:
                    pprint.pprint(count_log)
                    print(f'Inpected {cur_line}')
                    print('-------------------------------------------------------')
                    print()
                prev = count_log

            elif str(eval(f"type({cur_line.split('=')[0].strip()})")).startswith("<class 'sklearn"):
                pass

            else:
                count_log = eval(f"{target_df}[{target_col}].count()")
                for col in target_col:
                    dif = count_log[col] - prev[col] 
                    if dif != 0:
                        print(f'Count Changed in {col} with value {dif}')
                        print_bool = True
                if print_bool:
                    pprint.pprint(count_log)
                    print(f'Inspected {cur_line}')
                    print('-------------------------------------------------------')
                    print()
                prev = count_log

        except:
    #         print(f'inspecting {cur_line}')
    #         exec(f"pprint.pprint({target_df}[{target_col}].count())")
    #         print()
            pass
    
    nested_graph = pipeline_to_dataflow_graph(eval(f'{outputs[0]}'))
    
    print()
    print('####################### Start Sklearn Pipeline #######################')
    print()
    
    for item in nested_graph:
        if item.name in target_col:
            eval(target_df)[item.name] = item.operation.fit_transform(eval(target_df)[item.name].values.reshape(-1,1))
            print(f"Operations {str(item.operation).split('(')[0]} on {item.name}")
            count_log = eval(target_df)[item.name].count()
            dif = count_log - prev[item.name]
            if dif != 0:
                print(f"count now is {count_log}, dif is {dif}")
            else:
                print('no changes')
            prev[item.name] = count_log
            print('-------------------------------------------------------')
            print()


In [25]:
describe_ver(pipeline_test_4)


####################### Start Pandas Opeation #######################


####################### Start Sklearn Pipeline #######################

Operations SimpleImputer on education


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


TypeError: 'NoneType' object is not subscriptable