In [1]:
import pandas as pd
from importlib import reload
import skrub
from functools import wraps

In [2]:
# Two small example frames that share only the "Country" column; df2 also
# carries a "Capital" column, so concatenating them leaves gaps for df1's rows.
df1 = pd.DataFrame(
    {
        "Country": ["USA", "Italy", "Georgia"],
    }
)
df2 = pd.DataFrame(
    {
        "Country": ["Spain", "Belgium", "Italy"],
        "Capital": ["Madrid", "Brussel", "Rome"],
    }
)

def provenance_func_name(func):
    """Decorate ``func`` so that every call reports its name and positional arguments.

    Before delegating to ``func`` the wrapper prints the function's
    ``__name__`` and then walks over the positional arguments: a list is
    expanded element by element, anything else is printed together with its
    type. Keyword arguments are forwarded to ``func`` unchanged but are not
    reported.

    Parameters
    ----------
    func : callable
        The function whose calls should be traced.

    Returns
    -------
    callable
        A wrapper that accepts the same arguments as ``func`` and returns
        whatever ``func`` returns.
    """
    # ``wraps`` copies __name__, __doc__ etc. onto the wrapper. Without it the
    # patched attribute would show up as "wrapper", hiding which function was
    # actually replaced.
    @wraps(func)
    def wrapper(*args, **kwargs):
        print(f"I know you executed {func.__name__}")
        print("now I will tell you the args")
        for argument in args:
            # isinstance (rather than an exact type comparison) also expands
            # list subclasses element by element.
            if isinstance(argument, list):
                print("-----------------------------------")
                print("check on the type of argument class list is successful")
                print("now iterating over each element of the list:")
                for i, element in enumerate(argument):
                    print(f"that is the {i}th element")
                    print(element)
            else:
                print("argument: ", argument)
                print("type(argument): ", type(argument))
        return func(*args, **kwargs)
    return wrapper


# Registry of the pristine (unpatched) callables, keyed by the short name that
# wrap_skrub receives. The decorators are always applied to these originals.
dict_of_functions_with_implemented_provenance = {
    # pandas module-level merge
    "merge": pd.merge,
    # attributes reached through the `skb` accessor of skrub.DataOp
    "concat": skrub.DataOp.skb.concat,
    "get_data": skrub.DataOp.skb.get_data,
}

def wrap_skrub(func, names_of_original_functions, provenance_wrapper_for_the_function, verbose=False):
    """Wrap ``func`` so that calling it first (re)installs provenance patches.

    The name at position ``i`` of ``names_of_original_functions`` is paired
    with the decorator at position ``i`` of
    ``provenance_wrapper_for_the_function``. Every time the returned wrapper
    is called, each name that is a key of
    ``dict_of_functions_with_implemented_provenance`` has the function stored
    under that key decorated, and the result is assigned over the matching
    attribute (``pd.merge``, ``skrub.DataOp.skb.concat`` or
    ``skrub.DataOp.skb.get_data``). The decorator is always applied to the
    function stored in the dictionary, never to the currently installed
    attribute, so repeated calls do not stack decorators. Finally ``func`` is
    called with the caller's arguments and its result is returned.

    Parameters
    ----------
    func : callable
        Function to wrap; the patches are installed whenever it is called.
    names_of_original_functions : iterable of str
        Keys of ``dict_of_functions_with_implemented_provenance`` to patch.
    provenance_wrapper_for_the_function : iterable of callable
        Decorators, one per name, each taking a function and returning its
        replacement.
    verbose : bool, default False
        If true, print a message for every name that could not be patched.

    Returns
    -------
    callable
        Wrapper around ``func`` carrying its metadata (via ``functools.wraps``).

    Raises
    ------
    ValueError
        If the two iterables have different lengths. Pairing them with ``zip``
        would otherwise silently drop the surplus entries.
    """
    # Materialise both inputs once: a one-shot iterator would be exhausted
    # after the first call of the wrapper, and the lengths can be checked now.
    function_names = list(names_of_original_functions)
    decorators = list(provenance_wrapper_for_the_function)
    if len(function_names) != len(decorators):
        raise ValueError(
            "names_of_original_functions and provenance_wrapper_for_the_function "
            f"must have the same length, got {len(function_names)} and {len(decorators)}"
        )

    def _patch_owner(name):
        """Return the object whose attribute ``name`` is replaced, or None if unsupported."""
        if name == "merge":
            # NOTE(review): the original author observed that rebinding
            # pd.merge did not change what skrub executes - presumably skrub
            # does not look the function up through `pd.merge` at call time;
            # TODO confirm which skrub attribute has to be patched instead.
            return pd
        if name in ("concat", "get_data"):
            return skrub.DataOp.skb
        return None

    @wraps(func)
    def wrapper(*args, **kwargs):
        for original_name, decorate in zip(function_names, decorators):
            if original_name not in dict_of_functions_with_implemented_provenance:
                if verbose:
                    print("One of the provenance functinos was not actiavated because no such key was found in dict_of_functions_with_implemented_provenance")
                continue

            # Always decorate the function kept in the registry so that calling
            # the wrapper again replaces the patch instead of nesting it.
            decorated = decorate(dict_of_functions_with_implemented_provenance[original_name])

            owner = _patch_owner(original_name)
            if owner is None:
                if verbose:
                    print("The value of the original function is in the dictionary, but not in the wrapper() function.")
                continue

            # For every supported name the attribute to replace is named like
            # the registry key.
            setattr(owner, original_name, decorated)
        return func(*args, **kwargs)
    return wrapper




#### Standard run: this is what the outputs look like without monkey-patching the provenance

In [5]:
# Wrap both example frames as named skrub variables, using the still
# unpatched skrub.var.
my_df1 = skrub.var("my_df1",df1)
my_df2 = skrub.var("my_df2",df2)

In [7]:
# Baseline: concatenate with the unpatched `concat`; only the preview is
# shown, no provenance messages.
my_df1.skb.concat([my_df2])

Unnamed: 0_level_0,Country,Capital
Unnamed: 0_level_1,Country,Capital
0,USA,
1,Italy,
2,Georgia,
3,Spain,Madrid
4,Belgium,Brussel
5,Italy,Rome

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,False,0 (0.0%),5 (83.3%),,,,,
1,Capital,ObjectDType,False,3 (50.0%),3 (50.0%),,,,,


In [9]:
# Check what `get_data()` returns: a plain dict (see the output below).
type(my_df1.skb.get_data())

dict

In [12]:
# Inspect the attribute that will be patched: reached through the class, it
# is the plain `SkrubNamespace.get_data` function.
skrub.DataOp.skb.get_data

<function skrub._data_ops._skrub_namespace.SkrubNamespace.get_data(self)>

#### Execute either this version, with provenance only on `concat`, or the version three cells below, with provenance on two functions

In [7]:
# Replace skrub.var by a wrapper that installs the provenance decorator on
# `concat` each time a variable is created.
skrub.var = wrap_skrub(
    skrub.var,
    names_of_original_functions=["concat"],
    provenance_wrapper_for_the_function=[provenance_func_name],
)
skrub.var

<function skrub._data_ops._data_ops.var(name, value=NULL)>

In [8]:
# Calling the wrapped skrub.var is what installs the provenance patch
# (wrap_skrub applies it on every call), so the variables are created again.
my_df1 = skrub.var("my_df1",df1)
my_df2 = skrub.var("my_df2",df2)

In [15]:
# `concat` is now traced: the provenance messages are printed before the
# usual preview.
my_df1.skb.concat([my_df2])

I know you executed concat
now I will tell you the args
argument:  <SkrubNamespace>
type(argument):  <class 'skrub._data_ops._skrub_namespace.SkrubNamespace'>
check on the type of argument class list is successful
now iterating over each element of the list:
that is the 0th element
<Var 'my_df2'>
Result:
―――――――
   Country  Capital
0    Spain   Madrid
1  Belgium  Brussel
2    Italy     Rome


Unnamed: 0_level_0,Country,Capital
Unnamed: 0_level_1,Country,Capital
0,USA,
1,Italy,
2,Georgia,
3,Spain,Madrid
4,Belgium,Brussel
5,Italy,Rome

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,False,0 (0.0%),5 (83.3%),,,,,
1,Capital,ObjectDType,False,3 (50.0%),3 (50.0%),,,,,


#### Second version: provenance on two functions (`concat` and `get_data`)

In [11]:
# Trace both `concat` and `get_data`, using the same provenance decorator for
# each of them (one decorator per name).
names_of_original_functions = ["concat", "get_data"]
length_of_the_array = len(names_of_original_functions)
skrub.var = wrap_skrub(
    skrub.var,
    names_of_original_functions,
    [provenance_func_name] * length_of_the_array,
)
skrub.var

<function skrub._data_ops._data_ops.var(name, value=NULL)>

In [13]:
# Creating the variables through the wrapped skrub.var installs the patches
# for both `concat` and `get_data`.
my_df1 = skrub.var("my_df1",df1)
my_df2 = skrub.var("my_df2",df2)

In [15]:
# Traced `concat`: the list argument is reported element by element before
# the preview is rendered.
my_df1.skb.concat([my_df2])

I know you executed concat
now I will tell you the args
argument:  <SkrubNamespace>
type(argument):  <class 'skrub._data_ops._skrub_namespace.SkrubNamespace'>
-----------------------------------
check on the type of argument class list is successful
now iterating over each element of the list:
that is the 0th element
<Var 'my_df2'>
Result:
―――――――
   Country  Capital
0    Spain   Madrid
1  Belgium  Brussel
2    Italy     Rome


Unnamed: 0_level_0,Country,Capital
Unnamed: 0_level_1,Country,Capital
0,USA,
1,Italy,
2,Georgia,
3,Spain,Madrid
4,Belgium,Brussel
5,Italy,Rome

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,False,0 (0.0%),5 (83.3%),,,,,
1,Capital,ObjectDType,False,3 (50.0%),3 (50.0%),,,,,


In [20]:
# Traced `get_data`: the only positional argument reported is the
# SkrubNamespace itself; the returned dict is unchanged.
my_df1.skb.get_data()

I know you executed get_data
now I will tell you the args
argument:  <SkrubNamespace>
type(argument):  <class 'skrub._data_ops._skrub_namespace.SkrubNamespace'>


{'my_df1':    Country
 0      USA
 1    Italy
 2  Georgia}