In [1]:
import skrub
import pandas as pd
from monkey_patching_v0 import provenance_func_name

def set_provenance(namespace, name_of_the_function, provenance_func=provenance_func_name):
    """Replace ``namespace.<name_of_the_function>`` with a provenance-wrapped version.

    Parameters
    ----------
    namespace : object
        Class, module or other object that owns the attribute to patch.
    name_of_the_function : str
        Name of the attribute on ``namespace`` that should be wrapped.
    provenance_func : callable, optional
        Called with the current attribute value; whatever it returns is stored
        back on ``namespace`` under the same name.

    Raises
    ------
    AttributeError
        If ``namespace`` has no attribute called ``name_of_the_function``.
    """
    # Fail right away on a misspelled or missing name: silently handing None to
    # the wrapper would install a broken attribute that only blows up later,
    # far away from the actual mistake.
    if not hasattr(namespace, name_of_the_function):
        raise AttributeError(
            f"{namespace!r} has no attribute {name_of_the_function!r}; nothing to wrap"
        )

    original = getattr(namespace, name_of_the_function)
    setattr(namespace, name_of_the_function, provenance_func(original))
    print(f"Set provenance for {name_of_the_function}")

# Swap DataOp.__call__ for the wrapper returned by the default provenance
# function (provenance_func_name).
# NOTE(review): `skrub._data_ops` looks like a private module path and may move
# between skrub releases — confirm against the installed version.
set_provenance(skrub._data_ops._data_ops.DataOp, "__call__")

Set provenance for __call__


In [2]:
# Two tiny input tables: a list of countries and a country -> capital lookup.
df1 = pd.DataFrame({"Country": ["USA", "Italy", "Georgia"]})
df2 = pd.DataFrame(
    {
        "Country": ["Spain", "Belgium", "Italy"],
        "Capital": ["Madrid", "Brussel", "Rome"],
    }
)

# Expose both frames as skrub variables so later operations are built on them.
main_table = skrub.var("main_table", df1)
aux_table = skrub.var("aux_table", df2)

# Left join: every row of main_table is kept, "Capital" is filled where a match exists.
joined = main_table.merge(aux_table, on="Country", how="left")
joined

[PROVENANCE]: START
I know you executed __call__
now I will tell you the args

argument 0:  <GetAttr 'merge'>
Result:
―――――――
<bound method DataFrame.merge of    Country
0      USA
1    Italy
2  Georgia>
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>

argument 1:  <Var 'aux_table'>
Result:
―――――――
   Country  Capital
0    Spain   Madrid
1  Belgium  Brussel
2    Italy     Rome
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>
-----------------------------------
now I will go over keyword arguments
the keyword is: >>> on <<< the value will be printed below: 
Country
the keyword is: >>> how <<< the value will be printed below: 
left
[PROVENANCE]: END


Unnamed: 0_level_0,Country,Capital
Unnamed: 0_level_1,Country,Capital
0,USA,
1,Italy,Rome
2,Georgia,

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,True,0 (0.0%),3 (100.0%),,,,,
1,Capital,ObjectDType,False,2 (66.7%),1 (33.3%),,,,,


In [3]:
# People and their home countries; "Germany" has no counterpart in `joined`.
df3 = pd.DataFrame(
    {
        "Name": ["Person1", "Person2", "Person3"],
        "Home Country": ["Italy", "Italy", "Germany"],
    }
)
people_table = skrub.var("people_table", df3)

# Left join on differently named key columns: "Country" on the left side,
# "Home Country" on the right side.
joined2 = joined.merge(
    people_table,
    left_on="Country",
    right_on="Home Country",
    how="left",
)
joined2

[PROVENANCE]: START
I know you executed __call__
now I will tell you the args

argument 0:  <GetAttr 'merge'>
Result:
―――――――
<bound method DataFrame.merge of    Country Capital
0      USA     NaN
1    Italy    Rome
2  Georgia     NaN>
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>

argument 1:  <Var 'people_table'>
Result:
―――――――
      Name Home Country
0  Person1        Italy
1  Person2        Italy
2  Person3      Germany
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>
-----------------------------------
now I will go over keyword arguments
the keyword is: >>> left_on <<< the value will be printed below: 
Country
the keyword is: >>> right_on <<< the value will be printed below: 
Home Country
the keyword is: >>> how <<< the value will be printed below: 
left
[PROVENANCE]: END


Unnamed: 0_level_0,Country,Capital,Name,Home Country
Unnamed: 0_level_1,Country,Capital,Name,Home Country
0,USA,,,
1,Italy,Rome,Person1,Italy
2,Italy,Rome,Person2,Italy
3,Georgia,,,

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,True,0 (0.0%),3 (75.0%),,,,,
1,Capital,ObjectDType,False,2 (50.0%),1 (25.0%),,,,,
2,Name,ObjectDType,False,2 (50.0%),2 (50.0%),,,,,
3,Home Country,ObjectDType,False,2 (50.0%),1 (25.0%),,,,,


In [4]:
# Aggregation: group the joined rows by "Capital" and apply both `list` and
# "count" to every remaining column.
aggregation_df = (
    joined2
    .groupby("Capital")
    .agg([list, "count"])
)
aggregation_df

[PROVENANCE]: START
I know you executed __call__
now I will tell you the args

argument 0:  <GetAttr 'groupby'>
Result:
―――――――
<bound method DataFrame.groupby of    Country Capital     Name Home Country
0      USA     NaN      NaN          NaN
1    Italy    Rome  Person1        Italy
2    Italy    Rome  Person2        Italy
3  Georgia     NaN      NaN          NaN>
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>
argument 1:  Capital
type(argument):  <class 'str'>
-----------------------------------
There are no kwargs.
[PROVENANCE]: END
[PROVENANCE]: START
I know you executed __call__
now I will tell you the args

argument 0:  <GetAttr 'agg'>
Result:
―――――――
<bound method DataFrameGroupBy.aggregate of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020B5A58FFB0>>
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>
-----------------------------------
check on the type of argument class list is successful
now iterating over each element of the list:
tha

Capital,Country,Country,Name,Name,Home Country,Home Country
Capital,list,count,list,count,list,count
Capital,list,count,list,count,list,count
Rome,"['Italy', 'Italy']",2,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
4,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
5,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,


In [5]:
# Selection: keep only the "Name" and "Home Country" column groups.
aggregation_selection_df = aggregation_df[
    ["Name", "Home Country"]
]
aggregation_selection_df

Capital,Name,Name,Home Country,Home Country
Capital,list,count,list,count
Capital,list,count,list,count
Rome,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,


In [6]:
# Projection: expose the ("Home Country", "list") column under a new
# "Country" column.
result_df = aggregation_selection_df.assign(
    Country=aggregation_selection_df[("Home Country", "list")],
)
result_df

[PROVENANCE]: START
I know you executed __call__
now I will tell you the args

argument 0:  <GetAttr 'assign'>
Result:
―――――――
<bound method DataFrame.assign of                        Name          Home Country      
                       list count            list count
Capital                                                
Rome     [Person1, Person2]     2  [Italy, Italy]     2>
type(argument):  <class 'skrub._data_ops._data_ops.DataOp'>
-----------------------------------
now I will go over keyword arguments
the keyword is: >>> Country <<< the value will be printed below: 
<GetItem ('Home Country', 'list')>
Result:
―――――――
Capital
Rome    [Italy, Italy]
Name: (Home Country, list), dtype: object
[PROVENANCE]: END


Capital,Name,Name,Home Country,Home Country,Country
Capital,list,count,list,count,Unnamed: 5_level_1
Capital,list,count,list,count,Unnamed: 5_level_2
Rome,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2,"['Italy', 'Italy']"

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
4,"('Country', '')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,


### Let's execute the same pipeline again, but this time each ASPJ operator is wrapped by a decorator function.

#### For each function name listed in `name_of_original_functions`, a matching provenance function is passed to `wrap_skrub()`.
#### For now they are all `provenance_func_name()` (which prints the function name), but they can be customized: `pd.merge` would be assigned `provenance_of_merge()`, and `df[]` would be assigned `provenance_select()`.

In [6]:
from monkey_patching_v0 import wrap_skrub, provenance_func_name

In [None]:
import pandas as pd
import skrub


# Two tiny input tables: a list of countries and a country -> capital lookup.
df1 = pd.DataFrame({"Country": ["USA", "Italy", "Georgia"]})
df2 = pd.DataFrame(
    {
        "Country": ["Spain", "Belgium", "Italy"],
        "Capital": ["Madrid", "Brussel", "Rome"],
    }
)

# Operations to instrument; each entry is paired with one provenance wrapper.
name_of_original_functions = ["df[]", "df.merge", "groupby", "agg", "assign"]
length_of_original_names = len(name_of_original_functions)

# Keep a handle on the pristine skrub.var and always wrap *that* one.
# Without it, re-running this cell would pass the already wrapped function
# back into wrap_skrub (presumably stacking wrappers — confirm in
# monkey_patching_v0), so the cell would not be idempotent.
if not hasattr(skrub, "_unwrapped_var"):
    skrub._unwrapped_var = skrub.var
skrub.var = wrap_skrub(
    skrub._unwrapped_var,
    names_of_original_functions=name_of_original_functions,
    provenance_wrapper_for_the_function=[provenance_func_name] * length_of_original_names,
)

# Variables must be created through the wrapped skrub.var to be instrumented.
main_table = skrub.var("main_table", df1)
aux_table = skrub.var("aux_table", df2)

# Left join: every row of main_table is kept, "Capital" is filled where a match exists.
joined = main_table.merge(aux_table, on="Country", how="left")
joined

I know you executed merge
now I will tell you the args
argument:     Country
0      USA
1    Italy
2  Georgia
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:     Country  Capital
0    Spain   Madrid
1  Belgium  Brussel
2    Italy     Rome
type(argument):  <class 'pandas.core.frame.DataFrame'>
-----------------------------------
now I will go over keyword arguments
the keyword is:  on  the value will be printed below: 
Country
the keyword is:  how  the value will be printed below: 
left
I know you executed __getitem__
now I will tell you the args
argument:     Country  Capital
0    Spain   Madrid
1  Belgium  Brussel
2    Italy     Rome
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:  Country
type(argument):  <class 'str'>
-----------------------------------
now I will go over keyword arguments
I know you executed __getitem__
now I will tell you the args
argument:     Country
0      USA
1    Italy
2  Georgia
type(argument):  <class 'pandas.core.frame.Dat

Unnamed: 0_level_0,Country,Capital
Unnamed: 0_level_1,Country,Capital
0,USA,
1,Italy,Rome
2,Georgia,

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,True,0 (0.0%),3 (100.0%),,,,,
1,Capital,ObjectDType,False,2 (66.7%),1 (33.3%),,,,,


#### By introducing provenance for \_\_getitem\_\_, we can see that it is called inside the merge function. Next step: find the most basic operators, like \_\_getitem\_\_, and implement provenance for them.

In [None]:
# People and their home countries, registered through the wrapped skrub.var.
df3 = pd.DataFrame(
    {
        "Name": ["Person1", "Person2", "Person3"],
        "Home Country": ["Italy", "Italy", "Germany"],
    }
)
people_table = skrub.var("people_table", df3)

# Left join on differently named key columns: "Country" on the left side,
# "Home Country" on the right side.
joined2 = joined.merge(
    people_table,
    left_on="Country",
    right_on="Home Country",
    how="left",
)
joined2

I know you executed merge
now I will tell you the args
argument:     Country Capital
0      USA     NaN
1    Italy    Rome
2  Georgia     NaN
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:        Name Home Country
0  Person1        Italy
1  Person2        Italy
2  Person3      Germany
type(argument):  <class 'pandas.core.frame.DataFrame'>
-----------------------------------
now I will go over keyword arguments
the keyword is:  left_on  the value will be printed below: 
Country
the keyword is:  right_on  the value will be printed below: 
Home Country
the keyword is:  how  the value will be printed below: 
left
I know you executed __getitem__
now I will tell you the args
argument:        Name Home Country
0  Person1        Italy
1  Person2        Italy
2  Person3      Germany
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:  Home Country
type(argument):  <class 'str'>
-----------------------------------
now I will go over keyword arguments
I know you exe

Unnamed: 0_level_0,Country,Capital,Name,Home Country
Unnamed: 0_level_1,Country,Capital,Name,Home Country
0,USA,,,
1,Italy,Rome,Person1,Italy
2,Italy,Rome,Person2,Italy
3,Georgia,,,

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,Country,ObjectDType,True,0 (0.0%),3 (75.0%),,,,,
1,Capital,ObjectDType,False,2 (50.0%),1 (25.0%),,,,,
2,Name,ObjectDType,False,2 (50.0%),2 (50.0%),,,,,
3,Home Country,ObjectDType,False,2 (50.0%),1 (25.0%),,,,,


#### Interesting: func.\_\_name\_\_ for groupby() is not "groupby" — I could not find any name under which groupby is reported (agg, by contrast, shows up as "aggregate").

In [10]:
# Aggregation: group the joined rows by "Capital" and apply both `list` and
# "count" to every remaining column.
aggregation_df = (
    joined2
    .groupby("Capital")
    .agg([list, "count"])
)
aggregation_df

I know you executed __getitem__
now I will tell you the args
argument:     Country Capital     Name Home Country
0      USA     NaN      NaN          NaN
1    Italy    Rome  Person1        Italy
2    Italy    Rome  Person2        Italy
3  Georgia     NaN      NaN          NaN
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:  Capital
type(argument):  <class 'str'>
-----------------------------------
now I will go over keyword arguments
I know you executed aggregate
now I will tell you the args
argument:  <pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000195BAF2AB10>
type(argument):  <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
-----------------------------------
check on the type of argument class list is successful
now iterating over each element of the list:
that is the 0th element
<class 'list'>
that is the 1th element
count
-----------------------------------
now I will go over keyword arguments
I know you executed __getitem__
now I will tell y

Capital,Country,Country,Name,Name,Home Country,Home Country
Capital,list,count,list,count,list,count
Capital,list,count,list,count,list,count
Rome,"['Italy', 'Italy']",2,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
4,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
5,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,


In [11]:
# Selection: keep only the "Name" and "Home Country" column groups.
aggregation_selection_df = aggregation_df[
    ["Name", "Home Country"]
]
aggregation_selection_df

I know you executed __getitem__
now I will tell you the args
argument:                  Country                      Name          Home Country      
                   list count                list count            list count
Capital                                                                      
Rome     [Italy, Italy]     2  [Person1, Person2]     2  [Italy, Italy]     2
type(argument):  <class 'pandas.core.frame.DataFrame'>
-----------------------------------
check on the type of argument class list is successful
now iterating over each element of the list:
that is the 0th element
Name
that is the 1th element
Home Country
-----------------------------------
now I will go over keyword arguments
I know you executed __getitem__
now I will tell you the args
argument:                  value  count
0  [Person1, Person2]      1
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:  count
type(argument):  <class 'str'>
-----------------------------------
now I will go ove

Capital,Name,Name,Home Country,Home Country
Capital,list,count,list,count
Capital,list,count,list,count
Rome,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,


In [12]:
# Projection: expose the ("Home Country", "list") column under a new
# "Country" column.
result_df = aggregation_selection_df.assign(
    Country=aggregation_selection_df[("Home Country", "list")],
)
result_df

I know you executed __getitem__
now I will tell you the args
argument:                         Name          Home Country      
                       list count            list count
Capital                                                
Rome     [Person1, Person2]     2  [Italy, Italy]     2
type(argument):  <class 'pandas.core.frame.DataFrame'>
argument:  ('Home Country', 'list')
type(argument):  <class 'tuple'>
-----------------------------------
now I will go over keyword arguments
I know you executed assign
now I will tell you the args
argument:                         Name          Home Country      
                       list count            list count
Capital                                                
Rome     [Person1, Person2]     2  [Italy, Italy]     2
type(argument):  <class 'pandas.core.frame.DataFrame'>
-----------------------------------
now I will go over keyword arguments
the key is:  Country  the value is: 
Capital
Rome    [Italy, Italy]
Name: (Home Country,

Capital,Name,Name,Home Country,Home Country,Country
Capital,list,count,list,count,Unnamed: 5_level_1
Capital,list,count,list,count,Unnamed: 5_level_2
Rome,"['Person1', 'Person2']",2,"['Italy', 'Italy']",2,"['Italy', 'Italy']"

Column,Column name,dtype,Is sorted,Null values,Unique values,Mean,Std,Min,Median,Max
0,"('Name', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
1,"('Name', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
2,"('Home Country', 'list')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,
3,"('Home Country', 'count')",Int64DType,True,0 (0.0%),1 (100.0%),2.0,,,,
4,"('Country', '')",ObjectDType,True,0 (0.0%),1 (100.0%),,,,,


In [None]:
#import pandas as pd
#from skrub import Joiner
#df1 = pd.DataFrame({"Country": ["USA", "Italia", "Georgia"]})
#df2 = pd.DataFrame( {"Country": ["Spain", "Belgium", "Italy"],
#                           "Capital": ["Madrid", "Brussel", "Rome"]} )
#df3 = pd.DataFrame({"Name": ["Person1", "Person2", "Person3"],
#                             "Home Country": ["Italy", "Italia", "Germany"]})
#
#main_table = skrub.var("main_table", df1)
#aux_table = skrub.var("aux_table", df2)
#people_table = skrub.var("people_table", df3)
#
#
##main_table_fixed = main_table.astype({"Country": "string"})
##aux_table_fixed = aux_table.astype({"Country": "string"})
#
#
##joined1 = (
##    main_table[["Country"]].astype({"Country": "string"}).join(
##        aux_table[["Country", "Capital"]].astype({"Country": "string"}),
##        on="Country",
##        how="left",
##    )
##)
#
#
## Joiner executes a fuzzy join. I did not find a way to make this skrub function work with DataOps
#joiner = Joiner(
#    df2,
#    key="Country",
#    suffix="_aux",
#    max_dist=0.8,
#    add_match_info=False,
#)
#main_table.skb.apply(joiner.fit_transform)
#main_and_aux_table = joiner.fit_transform(df1)
#main_and_aux_table

Unnamed: 0,Country,Country_aux,Capital_aux
0,USA,,
1,Italia,Italy,Rome
2,Georgia,,


In [None]:
# Joiner executes a fuzzy join. I did not find a way to make this skrub function work with DataOps
#joiner = Joiner(
#    aux_table,
#    key="Country",
#    suffix="_aux",
#    max_dist=0.8,
#    add_match_info=False,
#)
#main_table.skb.apply_func(joiner.fit_transform)

RuntimeError: Evaluation of 'fit_transform()' failed.
You can see the full traceback above. The error message was:
TypeError: Only pandas and polars DataFrames are supported. Cannot handle X of type: <class 'skrub._data_ops._data_ops.DataOp'>.

In [None]:
#joiner_all = Joiner(
#    main_and_aux_table,
#    main_key="Home Country",
#    aux_key="Country",
#    suffix="_aux",
#    max_dist=0.8,
#    add_match_info=False,
##)
#main_and_people_and_aux_table = joiner_all.fit_transform(peopele_and_aux_table)
#main_and_people_and_aux_table

Unnamed: 0,Name,Home Country,Country_aux,Capital_aux,Country_aux__skrub_d0c22b71__,Country_aux_aux,Capital_aux_aux
0,Person1,Italy,Italy,Rome,Italia,Italy,Rome
1,Person2,Italia,Italy,Rome,Italia,Italy,Rome
2,Person3,Germany,Germany,Berlin,,,


In [None]:
#joiner = Joiner(
#    aux_table,
#    main_key="Home Country",
#    aux_key="Country",
#    suffix="_aux",
#    max_dist=0.8,
#    add_match_info=False,
#)
#joiner.fit_transform(people_table)

Unnamed: 0,Name,Home Country,Country_aux,Capital_aux
0,Person1,Italy,Italy,Rome
1,Person2,Italia,Italy,Rome
2,Person3,Germany,,


In [None]:
import duckdb
@skrub.deferred
def project_with_duckdb(a_dataframe, which_column, new_name):
    """Project one column of a dataframe under a new name using DuckDB.

    Parameters
    ----------
    a_dataframe : DataFrame
        Table to read from. The SQL text refers to it by this parameter name,
        so the parameter must not be renamed without updating the query.
    which_column : str
        Name of the column to select.
    new_name : str
        Alias given to the selected column in the result.

    Returns
    -------
    DataFrame
        Single-column result of the ``SELECT``, as returned by ``fetch_df()``.
    """
    # Double any embedded double quote so a column name cannot terminate the
    # quoted SQL identifier early.
    source_column = str(which_column).replace('"', '""')
    target_column = str(new_name).replace('"', '""')
    query = f""" SELECT "{source_column}" AS "{target_column}" from a_dataframe """

    # `a_dataframe` is resolved by DuckDB from this function's local variables;
    # the context manager closes the in-memory connection once the result has
    # been materialized.
    with duckdb.connect() as connection:
        return connection.execute(query).fetch_df()