# Reproducibility: versioning

1. Create two versions of a dataframe (v1, v2)
2. Commit both to dolt as table: `bar` -> get two commit references
3. Run the Flow once for each version of `bar` (the version is selected with the `--bar-version` parameter)
4. Read the results table `baz` for each of the two runs, given their `run_id`s

In [26]:
import pandas as pd
from doltpy.core import Dolt
from doltpy.core.write import import_df

# Create the directory `foo` and initialise a new Dolt repository inside it.
dolt = Dolt.init("foo")

# Two snapshots of the same two-column table:
#   v1 -> three rows of 1s
#   v2 -> the same three rows followed by three rows of 2s
v1 = pd.DataFrame({col: [1] * 3 for col in ("A", "B")})
v2 = pd.DataFrame({col: [1] * 3 + [2] * 3 for col in ("A", "B")})

# reset_index() turns the row index into an "index" column, which serves as
# the primary key of the newly created table `bar`.
import_df(dolt, "bar", v1.reset_index(), ["index"], "create")

# Stage the table and record the first version of `bar`.
dolt.add("bar")
dolt.commit("Initialize bar")

01-15 11:28:25 doltpy.core.dolt INFO     Creating directory foo
01-15 11:28:25 doltpy.core.dolt INFO     Creating a new repo in foo


['dolt', 'init']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-15 11:28:26 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-15 11:28:26 doltpy.core.write.write INFO     Importing to table bar in dolt directory located in foo, import mode create
01-15 11:28:26 doltpy.core.dolt INFO     Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0
01-15 11:28:26 doltpy.core.dolt INFO     


['dolt', 'table', 'import', 'bar', '-c', '--pk=index', '/var/folders/05/c0ll_wxd26j61fntnymm2k8c0000gn/T/tmp7n9hee1m.csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'add', 'bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'status']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'commit', '-m', 'Initialize bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-15 11:28:26 doltpy.core.dolt INFO     commit skhib94eki65s6gofrrs4cb7fqt6dldm
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:28:26 -0800 2021

	Initialize bar




In [27]:
# Show the most recent commit in the `foo` repo; its hash identifies the first version of `bar`.
!cd foo && dolt log -n 1

[33mcommit skhib94eki65s6gofrrs4cb7fqt6dldm[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:28:26 -0800 2021

	Initialize bar



In [28]:
# TODO: set this to the commit hash printed by `dolt log -n 1` in the previous cell.
# NOTE(review): this rebinds `v1` from the DataFrame built in the setup cell to a
# commit-hash string; the hash presumably differs on a fresh run -- confirm before re-running.
v1 = "skhib94eki65s6gofrrs4cb7fqt6dldm"

In [29]:
# Import the six-row frame into the existing table `bar` in "update" mode.
# Per the import log, the three rows already present have no effect and three rows are added.
import_df(dolt, "bar", v2.reset_index(), ["index"], "update")
# Stage the table and record the second version of `bar`.
dolt.add("bar")
dolt.commit("Add rows to bar")

01-15 11:28:37 doltpy.core.write.write INFO     Importing to table bar in dolt directory located in foo, import mode update
01-15 11:28:37 doltpy.core.dolt INFO     Rows Processed: 6, Additions: 3, Modifications: 0, Had No Effect: 3
01-15 11:28:37 doltpy.core.dolt INFO     


['dolt', 'table', 'import', 'bar', '-u', '/var/folders/05/c0ll_wxd26j61fntnymm2k8c0000gn/T/tmpzs2o5u7h.csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'add', 'bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'status']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'commit', '-m', 'Add rows to bar']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-15 11:28:37 doltpy.core.dolt INFO     commit tgf2dkad1dgnu9067rodchr7p225q1ce
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:28:37 -0800 2021

	Add rows to bar




In [30]:
# Show the most recent commit again; its hash identifies the second version of `bar`.
!cd foo && dolt log -n 1

[33mcommit tgf2dkad1dgnu9067rodchr7p225q1ce[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:28:37 -0800 2021

	Add rows to bar



In [31]:
# TODO: set this to the commit hash printed by `dolt log -n 1` in the previous cell.
# NOTE(review): this rebinds `v2` from a DataFrame to a commit-hash string, so re-running
# the earlier `import_df(..., v2.reset_index(), ...)` cell after this one will fail
# (a str has no reset_index); re-run the setup cell first.
v2 = "tgf2dkad1dgnu9067rodchr7p225q1ce"

In [49]:
# Print the source of demo.py, the Metaflow flow that the cells below execute.
! cat demo.py

import pickle

from metaflow import FlowSpec, step, DoltDT, Parameter
import pandas as pd
from sklearn import tree

class VersioningDemo(FlowSpec):
    """Read table `bar` from the `foo` Dolt database and write a derived table `baz`.

    The version of `bar` to read is passed on the command line as
    `--bar-version`; both data-touching steps open the database with that value.
    """

    bar_version = Parameter('bar-version',  help="Specify the tag for the input version", required=True)

    @step
    def start(self):
        """Load `bar` at the requested version and keep it as the `df` artifact."""
        with DoltDT(run=self, doltdb_path='foo', branch=self.bar_version) as dolt:
            self.df = dolt.read_table('bar')

        self.next(self.middle)

    @step
    def middle(self):
        """Double column `B` of the loaded frame and write the result to table `baz`."""
        with DoltDT(run=self, doltdb_path='foo', branch=self.bar_version) as dolt:
            # Build a new frame rather than assigning into self.df, so the
            # artifact loaded in `start` is not mutated as a side effect.
            doubled = self.df.assign(B=self.df["B"] * 2)

            # `index` is the primary-key column of the output table.
            dolt.write_table(table_name='baz', df=doubled, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        """Terminal step; performs no work."""
        pass


# Script entry point: instantiating the flow class is what executes the
# command given on the command line (e.g. `run --bar-version <version>`).
if __name__ == '__main__':
    VersioningDemo()


In [40]:
!poetry run python3 demo.py run --bar-version skhib94eki65s6gofrrs4cb7fqt6dldm

[35m[1mMetaflow 2.2.5.post18+gitfd0fb04[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-15 11:32:04.780 [0m[1mWorkflow starting (run-id 1610739124772933):[0m
[35m2021-01-15 11:32:04.787 [0m[32m[1610739124772933/start/1 (pid 10302)] [0m[1mTask is starting.[0m
[35m2021-01-15 11:32:06.066 [0m[32m[1610739124772933/start/1 (pid 10302)] [0m[22m01-15 11:32:06 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-15 11:32:06.159 [0m[32m[1610739124772933/start/1 (pid 10302)] [0m[22m01-15 11:32:06 doltpy.core.dolt INFO       master                                        	tgf2dkad1dgnu9067rodchr7p225q

[35m2021-01-15 11:32:09.150 [0m[32m[1610739124772933/middle/2 (pid 10325)] [0m[1mTask finished successfully.[0m
[35m2021-01-15 11:32:09.157 [0m[32m[1610739124772933/end/3 (pid 10367)] [0m[1mTask is starting.[0m
[35m2021-01-15 11:32:10.488 [0m[32m[1610739124772933/end/3 (pid 10367)] [0m[22m01-15 11:32:10 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-15 11:32:10.496 [0m[32m[1610739124772933/end/3 (pid 10367)] [0m[22m01-15 11:32:10 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-15 11:32:10.680 [0m[32m[1610739124772933/end/3 (pid 10367)] [0m[1mTask finished successfully.[0m
[35m2021-01-15 11:32:10.680 [0m[1mDone![0m
01-15 11:32:10 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-15 11:32:10 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [41]:
# Show the commit the flow just made; its message is the pathspec of the task that wrote `baz`.
!cd foo && dolt log -n 1

[33mcommit pv4s3bpopnf8b5l9n579298r1vdjknb5[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:32:08 -0800 2021

	VersioningDemo/1610739124772933/middle/2



In [42]:
# Save the first run's pathspec ("<FlowName>/<run-id>") for later; the run-id is the one
# printed on the "Workflow starting" line of the run above.
run1 = "VersioningDemo/1610739124772933"

In [43]:
!poetry run python3 demo.py run --bar-version tgf2dkad1dgnu9067rodchr7p225q1ce

[35m[1mMetaflow 2.2.5.post18+gitfd0fb04[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-15 11:32:30.116 [0m[1mWorkflow starting (run-id 1610739150109964):[0m
[35m2021-01-15 11:32:30.124 [0m[32m[1610739150109964/start/1 (pid 10386)] [0m[1mTask is starting.[0m
[35m2021-01-15 11:32:31.334 [0m[32m[1610739150109964/start/1 (pid 10386)] [0m[22m01-15 11:32:31 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-15 11:32:31.428 [0m[32m[1610739150109964/start/1 (pid 10386)] [0m[22m01-15 11:32:31 doltpy.core.dolt INFO       master                                        	tgf2dkad1dgnu9067rodchr7p225q

[35m2021-01-15 11:32:34.402 [0m[32m[1610739150109964/middle/2 (pid 10409)] [0m[1mTask finished successfully.[0m
[35m2021-01-15 11:32:34.409 [0m[32m[1610739150109964/end/3 (pid 10451)] [0m[1mTask is starting.[0m
[35m2021-01-15 11:32:35.755 [0m[32m[1610739150109964/end/3 (pid 10451)] [0m[22m01-15 11:32:35 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-15 11:32:35.761 [0m[32m[1610739150109964/end/3 (pid 10451)] [0m[22m01-15 11:32:35 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-15 11:32:35.938 [0m[32m[1610739150109964/end/3 (pid 10451)] [0m[1mTask finished successfully.[0m
[35m2021-01-15 11:32:35.939 [0m[1mDone![0m
01-15 11:32:35 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-15 11:32:35 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [44]:
# Show the commit made by the second run; its message is the pathspec of the task that wrote `baz`.
!cd foo && dolt log -n 1

[33mcommit u3q1ov313bk5pb9ihgbfqtm505jt11q0[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Fri Jan 15 11:32:33 -0800 2021

	VersioningDemo/1610739150109964/middle/2



In [45]:
# Save the second run's pathspec ("<FlowName>/<run-id>") for the result lookup below.
run2 = "VersioningDemo/1610739150109964"

In [46]:
from metaflow.datatools.dolt import DoltDT, DoltClient
def read_baz(run_pathspec):
    """Return the `baz` table recorded for the run identified by `run_pathspec`.

    `run_pathspec` is a "<FlowName>/<run-id>" string such as `run1` or `run2`.
    """
    client = DoltClient(run_pathspec)
    # For this flow `client.steps` is ["middle"], so index 0 selects the step that wrote `baz`.
    return client.step_artifacts(client.steps[0])["baz"]


# Same lookup for both runs, so the two results are directly comparable.
run1_res = read_baz(run1)
run2_res = read_baz(run2)

01-15 11:32:46 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-15 11:32:46 doltpy.core.dolt INFO       master                                        	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_1018977351311987979                       	bh0o9vs96nh3u0ae6kiqn03cg6j72l1q
  tmp_12856210901849877287                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_29747918150139222007                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_68432860454557606248                      	g3vutit9tvje18e0k2rtefudsi3s2m01
  tmp_71487223116425680420                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_7428457039860019215                       	r055r4ur5qm96gtekdr096ah7pq9t8tq
  tmp_74549267022466356033                      	skhib94eki65s6gofrrs4cb7fqt6dldm
  tmp_97227446906231499282                      	pv4s3bpopnf8b5l9n579298r1vdjknb5
  tmp_99775912141817850388                      	skhib94eki65s6gofrrs4cb7fqt6dldm
* tmp_99810567825936726133 

['dolt', 'branch', '--list', '--verbose']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'checkout', 'pv4s3bpopnf8b5l9n579298r1vdjknb5']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'checkout', '-b', 'tmp_69385612680234777248', 'pv4s3bpopnf8b5l9n579298r1vdjknb5']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-15 11:32:46 doltpy.core.dolt INFO     Switched to branch 'tmp_69385612680234777248'

01-15 11:32:46 doltpy.core.dolt INFO     index,A,B
0,1,2
1,1,2
2,1,2

01-15 11:32:46 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-15 11:32:46 doltpy.core.dolt INFO       master                                        	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_1018977351311987979                       	bh0o9vs96nh3u0ae6kiqn03cg6j72l1q
  tmp_12856210901849877287                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_29747918150139222007                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_68432860454557606248                      	g3vutit9tvje18e0k2rtefudsi3s2m01
* tmp_69385612680234777248                      	pv4s3bpopnf8b5l9n579298r1vdjknb5
  tmp_71487223116425680420                      	tgf2dkad1dgnu9067rodchr7p225q1ce
  tmp_7428457039860019215                       	r055r4ur5qm96gtekdr096ah7pq9t8tq
  tmp_74549267022466356033       

['dolt', 'sql', '--query', 'SELECT * FROM `baz` AS OF "pv4s3bpopnf8b5l9n579298r1vdjknb5"', '--result-format', 'csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'branch', '--list', '--verbose']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'checkout', 'u3q1ov313bk5pb9ihgbfqtm505jt11q0']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


01-15 11:32:46 doltpy.core.dolt INFO     Switched to branch 'tmp_62882806928449093731'

01-15 11:32:46 doltpy.core.dolt INFO     index,A,B
0,1,2
1,1,2
2,1,2
3,2,4
4,2,4
5,2,4



['dolt', 'checkout', '-b', 'tmp_62882806928449093731', 'u3q1ov313bk5pb9ihgbfqtm505jt11q0']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning
['dolt', 'sql', '--query', 'SELECT * FROM `baz` AS OF "u3q1ov313bk5pb9ihgbfqtm505jt11q0"', '--result-format', 'csv']
/Users/max-hoffman/Documents/sandbox/dolt/metaflow/dolt-demos/1-reproducibility-3-versioning


In [47]:
# `baz` as written by run 1: the three rows of the first `bar` version, with B doubled.
run1_res

Unnamed: 0,index,A,B
0,0,1,2
1,1,1,2
2,2,1,2


In [48]:
# `baz` as written by run 2: the six rows of the second `bar` version, with B doubled.
run2_res

Unnamed: 0,index,A,B
0,0,1,2
1,1,1,2
2,2,1,2
3,3,2,4
4,4,2,4
5,5,2,4
