# Reproducibility: versioning

1. Create two versions of a dataframe (v1, v2)
2. Commit both to dolt as table: `bar` -> get two commit references
3. Run Flow with two different versions of `bar` (parameter to specify)
4. Read and the results table `baz` for the two runs given the `run_id`'s

In [53]:
import pandas as pd
from doltpy.core import Dolt
from doltpy.core.write import import_df

dolt = Dolt.init("foo")

df_v1 = pd.DataFrame({"A": [1,1,1], "B": [1,1,1]})
df_v2 = pd.DataFrame({"A": [1,1,1,2,2,2], "B": [1,1,1,2,2,2]})

import_df(dolt, "bar", df_v1.reset_index(), ["index"], "create")
dolt.add("bar")
dolt.commit("Initialize bar")

v1 = list(dolt.log(number="1").keys())[0]

import_df(dolt, "bar", df_v2.reset_index(), ["index"], "update")
dolt.add("bar")
dolt.commit("Add rows to bar")

v2 = list(dolt.log(number="1").keys())[0]

01-17 14:30:41 doltpy.core.dolt INFO     Creating directory foo
01-17 14:30:41 doltpy.core.dolt INFO     Creating a new repo in foo


['dolt', 'init']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-17 14:30:42 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:30:42 doltpy.core.write.write INFO     Importing to table bar in dolt directory located in foo, import mode create
01-17 14:30:42 doltpy.core.dolt INFO     Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0
01-17 14:30:42 doltpy.core.dolt INFO     


['dolt', 'table', 'import', 'bar', '-c', '--pk=index', '/var/folders/05/c0ll_wxd26j61fntnymm2k8c0000gn/T/tmpwzxvh2i3.csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'add', 'bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'status']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-17 14:30:42 doltpy.core.dolt INFO     commit pv1482m6b2mamo7od9jhf83m2ehadmfq
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 14:30:42 -0800 2021

	Initialize bar


01-17 14:30:42 doltpy.core.write.write INFO     Importing to table bar in dolt directory located in foo, import mode update


['dolt', 'commit', '-m', 'Initialize bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'log', '--number', '1']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'table', 'import', 'bar', '-u', '/var/folders/05/c0ll_wxd26j61fntnymm2k8c0000gn/T/tmprvabbtd1.csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-17 14:30:42 doltpy.core.dolt INFO     Rows Processed: 6, Additions: 3, Modifications: 0, Had No Effect: 3
01-17 14:30:42 doltpy.core.dolt INFO     
01-17 14:30:42 doltpy.core.dolt INFO     commit hvpnns4djgpvdaqnlv8qhvmo94kg7t72
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 14:30:42 -0800 2021

	Add rows to bar




['dolt', 'add', 'bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'status']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'commit', '-m', 'Add rows to bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'log', '--number', '1']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


In [54]:
! cat demo.py

import pickle

from metaflow import FlowSpec, step, DoltDT, Parameter
import pandas as pd
from sklearn import tree

class VersioningDemo(FlowSpec):
    bar_version = Parameter('bar-version',  help="Specifc the tag for the input version", required=True)
    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch=self.bar_version) as dolt:
            self.df = dolt.read_table('bar')

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch=self.bar_version) as dolt:

            df = self.df
            df["B"] = df["B"].map(lambda x: x*2)

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == '__main__':
    VersioningDemo()


In [58]:
!poetry run python3 demo.py run --bar-version $v1

[35m[1mMetaflow 2.2.5.post21+git36980c8[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 14:34:12.193 [0m[1mWorkflow starting (run-id 1610922852187706):[0m
[35m2021-01-17 14:34:12.202 [0m[32m[1610922852187706/start/1 (pid 26961)] [0m[1mTask is starting.[0m
[35m2021-01-17 14:34:13.440 [0m[32m[1610922852187706/start/1 (pid 26961)] [0m[22m01-17 14:34:13 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 14:34:13.471 [0m[32m[1610922852187706/start/1 (pid 26961)] [0m[22m01-17 14:34:13 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[3

[35m2021-01-17 14:34:16.737 [0m[32m[1610922852187706/middle/2 (pid 26997)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 14:34:16.745 [0m[32m[1610922852187706/end/3 (pid 27037)] [0m[1mTask is starting.[0m
[35m2021-01-17 14:34:17.813 [0m[32m[1610922852187706/end/3 (pid 27037)] [0m[22m01-17 14:34:17 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 14:34:17.820 [0m[32m[1610922852187706/end/3 (pid 27037)] [0m[22m01-17 14:34:17 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 14:34:17.967 [0m[32m[1610922852187706/end/3 (pid 27037)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 14:34:17.968 [0m[1mDone![0m
01-17 14:34:17 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-17 14:34:17 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [59]:
!cd foo && dolt log -n 1

[33mcommit 5gqv0f2ppjnbvnuutrouucg7hjmgj8lj[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 14:34:16 -0800 2021

	VersioningDemo/1610922852187706/middle/2



In [4]:
# save for later
run1_id = "1610922852187706"

In [60]:
!poetry run python3 demo.py run --bar-version $v2

[35m[1mMetaflow 2.2.5.post21+git36980c8[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 14:36:26.910 [0m[1mWorkflow starting (run-id 1610922986903927):[0m
[35m2021-01-17 14:36:26.917 [0m[32m[1610922986903927/start/1 (pid 27057)] [0m[1mTask is starting.[0m
[35m2021-01-17 14:36:28.073 [0m[32m[1610922986903927/start/1 (pid 27057)] [0m[22m01-17 14:36:28 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 14:36:28.105 [0m[32m[1610922986903927/start/1 (pid 27057)] [0m[22m01-17 14:36:28 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[3

[35m2021-01-17 14:36:31.078 [0m[32m[1610922986903927/middle/2 (pid 27091)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 14:36:31.083 [0m[32m[1610922986903927/end/3 (pid 27131)] [0m[1mTask is starting.[0m
[35m2021-01-17 14:36:32.193 [0m[32m[1610922986903927/end/3 (pid 27131)] [0m[22m01-17 14:36:32 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 14:36:32.200 [0m[32m[1610922986903927/end/3 (pid 27131)] [0m[22m01-17 14:36:32 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 14:36:32.357 [0m[32m[1610922986903927/end/3 (pid 27131)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 14:36:32.357 [0m[1mDone![0m
01-17 14:36:32 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-17 14:36:32 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [62]:
!cd foo && dolt log -n 1

[33mcommit 0a0rptr3q0co1sum6k4ma3tac7sqctkq[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 14:36:30 -0800 2021

	VersioningDemo/1610922986903927/middle/2



In [3]:
run2_id = "1610922986903927"

In [7]:
from metaflow.datatools.dolt import DoltDT, DoltRun
d1 = DoltRun(flow_name="VersioningDemo", run_id=run1_id)
d2 = DoltRun(flow_name="VersioningDemo", run_id=run2_id)

res1 = d1.writes[0].data
res2 = d2.writes[0].data

01-17 14:44:48 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:44:48 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:44:48 doltpy.core.dolt INFO     flow_name,run_id,step_name,task_id,kind,database,table_name,commit,timestamp
VersioningDemo,1610922852187706,middle,2,write,foo,baz,5gqv0f2ppjnbvnuutrouucg7hjmgj8lj,1.6109229e+09

01-17 14:44:48 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:44:48 doltpy.core.dolt INFO     index,A,B
0,1,11
1,1,11
2,1,11

01-17 14:44:48 doltpy.core.dolt INFO     flow_name,run_id,step_name,task_id,kind,database,table_name,commit,timestamp
VersioningDemo,1610922986903927,middle,2,write,foo,baz,0a0rptr3q0co1sum6k4ma3tac7sqctkq,1.610923e+09

01-17 14:44:48 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306


foo 5gqv0f2ppjnbvnuutrouucg7hjmgj8lj baz
foo 0a0rptr3q0co1sum6k4ma3tac7sqctkq baz


01-17 14:44:48 doltpy.core.dolt INFO     index,A,B
0,1,11
1,1,11
2,1,11
3,2,22
4,2,22
5,2,22



In [10]:
res1

Unnamed: 0,index,A,B
0,0,1,11
1,1,1,11
2,2,1,11


In [9]:
res2

Unnamed: 0,index,A,B
0,0,1,11
1,1,1,11
2,2,1,11
3,3,2,22
4,4,2,22
5,5,2,22
