# Reproducibility: versioning

1. Create two versions of a dataframe (v1, v2)
2. Commit both to dolt as table: `bar` -> get two commit references
3. Run Flow with two different versions of `bar` (parameter to specify)
4. Read the results table `baz` for the two runs given the `run_id`'s

In [25]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.WARNING)

import pandas as pd
from doltpy.core import Dolt
from doltpy.core.write import import_df

dolt = Dolt.init("foo")

df_v1 = pd.DataFrame({"A": [1,1,1], "B": [1,1,1]})
df_v2 = pd.DataFrame({"A": [1,1,1,2,2,2], "B": [1,1,1,2,2,2]})

import_df(dolt, "bar", df_v1.reset_index(), ["index"], "create")
dolt.add("bar")
dolt.commit("Initialize bar")

v1 = list(dolt.log(number="1").keys())[0]

import_df(dolt, "bar", df_v2.reset_index(), ["index"], "update")
dolt.add("bar")
dolt.commit("Add rows to bar")

v2 = list(dolt.log(number="1").keys())[0]

In [10]:
! cat demo.py

import logging

logger = logging.getLogger()

import pickle

from metaflow import FlowSpec, step, DoltDT, Parameter
import pandas as pd
from sklearn import tree

class VersioningDemo(FlowSpec):
    bar_version = Parameter('bar-version',  help="Specifc the tag for the input version", required=True)
    @step
    def start(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar', commit=self.bar_version)

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:

            df = self.df
            df["B"] = df["B"].map(lambda x: x*2)

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == '__main__':
    VersioningDemo()


In [26]:
!poetry run python3 demo.py run --bar-version $v1

[35m[1mMetaflow 2.2.5.post24+git1f18147[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 19:02:28.232 [0m[1mWorkflow starting (run-id 1610938948223837):[0m
[35m2021-01-17 19:02:28.239 [0m[32m[1610938948223837/start/1 (pid 31582)] [0m[1mTask is starting.[0m
[35m2021-01-17 19:02:29.325 [0m[32m[1610938948223837/start/1 (pid 31582)] [0m[22m01-17 19:02:29 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 19:02:29.356 [0m[32m[1610938948223837/start/1 (pid 31582)] [0m[22m01-17 19:02:29 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[3

[35m2021-01-17 19:02:32.228 [0m[32m[1610938948223837/middle/2 (pid 31616)] [0m[22m01-17 19:02:32 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 19:02:32.410 [0m[32m[1610938948223837/middle/2 (pid 31616)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 19:02:32.417 [0m[32m[1610938948223837/end/3 (pid 31656)] [0m[1mTask is starting.[0m
[35m2021-01-17 19:02:33.596 [0m[32m[1610938948223837/end/3 (pid 31656)] [0m[22m01-17 19:02:33 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 19:02:33.603 [0m[32m[1610938948223837/end/3 (pid 31656)] [0m[22m01-17 19:02:33 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 19:02:33.776 [0m[32m[1610938948223837/end/3 (pid 31656)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 19:02:33.776 [0m[1mDone![0m
01-17 19:02:33 doltpy.core.system_helpers INFO     Before exiting cleaning up child 

In [27]:
!cd foo && dolt log -n 1

[33mcommit 01pnohoe7p3tevbg6u35k36a4ljobqjb[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 19:02:31 -0800 2021

	VersioningDemo/1610938948223837/middle/2



In [28]:
# save for later
run1_id = "1610938948223837"

In [29]:
!poetry run python3 demo.py run --bar-version $v2

[35m[1mMetaflow 2.2.5.post24+git1f18147[0m[35m[22m executing [0m[31m[1mVersioningDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 19:02:45.773 [0m[1mWorkflow starting (run-id 1610938965767319):[0m
[35m2021-01-17 19:02:45.781 [0m[32m[1610938965767319/start/1 (pid 31675)] [0m[1mTask is starting.[0m
[35m2021-01-17 19:02:46.964 [0m[32m[1610938965767319/start/1 (pid 31675)] [0m[22m01-17 19:02:46 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 19:02:46.993 [0m[32m[1610938965767319/start/1 (pid 31675)] [0m[22m01-17 19:02:46 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[3

[35m2021-01-17 19:02:50.034 [0m[32m[1610938965767319/middle/2 (pid 31709)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 19:02:50.042 [0m[32m[1610938965767319/end/3 (pid 31749)] [0m[1mTask is starting.[0m
[35m2021-01-17 19:02:51.212 [0m[32m[1610938965767319/end/3 (pid 31749)] [0m[22m01-17 19:02:51 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 19:02:51.219 [0m[32m[1610938965767319/end/3 (pid 31749)] [0m[22m01-17 19:02:51 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 19:02:51.378 [0m[32m[1610938965767319/end/3 (pid 31749)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 19:02:51.378 [0m[1mDone![0m
01-17 19:02:51 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-17 19:02:51 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [30]:
!cd foo && dolt log -n 1

[33mcommit ttd37vr2p5h9j6ormidbn0u7ojubbetg[0m
Author: Max Hoffman <maximilian.wolfgang1@gmail.com>
Date:   Sun Jan 17 19:02:49 -0800 2021

	VersioningDemo/1610938965767319/middle/2



In [31]:
run2_id = "1610938965767319"

In [32]:
from metaflow.datatools.dolt import DoltDT, DoltRun
d1 = DoltRun(flow_name="VersioningDemo", run_id=run1_id)
d2 = DoltRun(flow_name="VersioningDemo", run_id=run2_id)

res1 = d1.writes[0].data
res2 = d2.writes[0].data

foo 01pnohoe7p3tevbg6u35k36a4ljobqjb baz
foo ttd37vr2p5h9j6ormidbn0u7ojubbetg baz


In [33]:
res1

Unnamed: 0,index,A,B
0,0,1,11
1,1,1,11
2,2,1,11


In [34]:
res2

Unnamed: 0,index,A,B
0,0,1,11
1,1,1,11
2,2,1,11
3,3,2,22
4,4,2,22
5,5,2,22
