# Reproducibility - resume

1. Create and commit two versions of a dataframe -> v1, v2
2. Run flow 1 (`succeeds_second.py`) with v1 data -> crashes because flow 2 hasn't run yet
3. Run flow 2 (`succeeds_first.py`) with v2 data
4. Resume flow 1 (`succeeds_second.py`) successfully

In [26]:
import logging

# Raise the root logger threshold so only warnings and above are emitted
# from this kernel.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

from doltpy.core import Dolt
from doltpy.core.write import import_df
import pandas as pd


def _commit_frame(repo, table, frame, mode, message):
    """Import `frame` into `table`, stage and commit it, and return the newest log key.

    The frame's index is materialised as an `index` column (via
    `reset_index`) and used as the primary key of the table. The returned
    value is the first key of the one-entry log, which the later cells pass
    to the flows as the version identifier.
    """
    import_df(repo, table, frame.reset_index(), ["index"], mode)
    repo.add(table)
    repo.commit(message)
    newest_entries = repo.log(number="1")
    return list(newest_entries.keys())[0]


dolt = Dolt.init("foo")

# v2 extends v1 with three additional rows.
ones, twos = [1, 1, 1], [2, 2, 2]
df_v1 = pd.DataFrame({"A": list(ones), "B": list(ones)})
df_v2 = pd.DataFrame({"A": ones + twos, "B": ones + twos})

v1 = _commit_frame(dolt, "bar", df_v1, "create", "Initialize bar")
v2 = _commit_frame(dolt, "bar", df_v2, "update", "Add rows to bar")

In [24]:
!cat succeeds_second.py

import datetime
import pickle

from metaflow import FlowSpec, step, DoltDT, Parameter, Flow
import pandas as pd
from sklearn import tree

class SucceedsSecondDemo(FlowSpec):

    bar_version = Parameter('bar-version',  help="Specifc the tag for the input version", required=True)

    @step
    def start(self):
        """Load table `bar` and require a recent successful `SucceedsFirstDemo` run.

        Reads `bar` from the `foo` database (branch `master`) into `self.df`,
        then checks the latest successful run of `SucceedsFirstDemo`.

        Raises:
            Exception: if there is no successful `SucceedsFirstDemo` run, or
                if the latest one finished more than one minute ago.
        """
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar')

        stale_msg = "Run `FirstDemo` within one minute of `SecondDemo`"

        first_run = Flow("SucceedsFirstDemo").latest_successful_run
        if first_run is None:
            # No successful run recorded yet: fail with the intended message
            # instead of an AttributeError on `None.finished_at`.
            raise Exception(stale_msg)

        # The trailing "Z" in the timestamp format denotes UTC, so the parsed
        # value must be compared against the current UTC time. Comparing with
        # naive local `now()` shifts the one-minute window by the machine's
        # UTC offset.
        first_run_ts = datetime.datetime.strptime(
            first_run.finished_at, "%Y-%m-%dT%H:%M:%SZ"
        ).replace(tzinfo=datetime.timezone.utc)
        cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=1)
        if first_run_ts < cutoff:
            raise Exception(stale_msg)

        self.next(self.middle)

    @step
    def middle(self):
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            df = self.df
            df["B"] = df["B"].map(lambda x: x*2)



In [28]:
!cat succeeds_first.py

import pickle

from metaflow import FlowSpec, step, DoltDT, Parameter
import pandas as pd
from sklearn import tree

class SucceedsFirstDemo(FlowSpec):
    """Read a pinned version of table `bar`, double column `B`, write the result to `baz`.

    Steps run in order: `start` -> `middle` -> `end`. Both Dolt accesses use
    database `foo` on branch `master`.
    """

    # Required `--bar-version` option; passed as `commit=` when reading `bar`.
    bar_version = Parameter('bar-version',  help="Specify the tag for the input version", required=True)

    @step
    def start(self):
        """Read table `bar` at `self.bar_version` into `self.df`."""
        with DoltDT(run=self, database='foo', branch="master") as dolt:
            self.df = dolt.read_table('bar', commit=self.bar_version)

        self.next(self.middle)

    @step
    def middle(self):
        """Double every value in column `B` and write the frame to table `baz`."""
        with DoltDT(run=self, database='foo', branch="master") as dolt:

            # `df` is the same object as `self.df`, so the column assignment
            # below also updates the frame stored on the flow.
            df = self.df
            df["B"] = df["B"].map(lambda x: x*2)

            dolt.write_table(table_name='baz', df=df, pks=['index'])

        self.next(self.end)

    @step
    def end(self):
        """Terminal step; performs no work."""
        pass


# Script entry point: the flow is launched by instantiating the class when the
# file is executed directly (this notebook invokes it as
# `python3 succeeds_first.py run --bar-version <version>`).
if __name__ == '__main__':
    SucceedsFirstDemo()


In [45]:
!poetry run python3 succeeds_second.py run --bar-version $v1

[35m[1mMetaflow 2.2.5.post23+git20868e9[0m[35m[22m executing [0m[31m[1mSucceedsSecondDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 18:13:27.616 [0m[1mWorkflow starting (run-id 1610936007609681):[0m
[35m2021-01-17 18:13:27.624 [0m[32m[1610936007609681/start/1 (pid 30483)] [0m[1mTask is starting.[0m
[35m2021-01-17 18:13:28.998 [0m[32m[1610936007609681/start/1 (pid 30483)] [0m[22m01-17 18:13:28 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 18:13:29.046 [0m[32m[1610936007609681/start/1 (pid 30483)] [0m[22m01-17 18:13:29 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m

In [46]:
!poetry run python3 succeeds_first.py run --bar-version $v2

[35m[1mMetaflow 2.2.5.post23+git20868e9[0m[35m[22m executing [0m[31m[1mSucceedsFirstDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 18:13:45.921 [0m[1mWorkflow starting (run-id 1610936025914102):[0m
[35m2021-01-17 18:13:45.929 [0m[32m[1610936025914102/start/1 (pid 30526)] [0m[1mTask is starting.[0m
[35m2021-01-17 18:13:47.168 [0m[32m[1610936025914102/start/1 (pid 30526)] [0m[22m01-17 18:13:47 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 18:13:47.196 [0m[32m[1610936025914102/start/1 (pid 30526)] [0m[22m01-17 18:13:47 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m


[35m2021-01-17 18:13:50.690 [0m[32m[1610936025914102/middle/2 (pid 30560)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 18:13:50.699 [0m[32m[1610936025914102/end/3 (pid 30600)] [0m[1mTask is starting.[0m
[35m2021-01-17 18:13:51.891 [0m[32m[1610936025914102/end/3 (pid 30600)] [0m[22m01-17 18:13:51 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 18:13:51.898 [0m[32m[1610936025914102/end/3 (pid 30600)] [0m[22m01-17 18:13:51 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 18:13:52.056 [0m[32m[1610936025914102/end/3 (pid 30600)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 18:13:52.056 [0m[1mDone![0m
01-17 18:13:52 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-17 18:13:52 doltpy.core.system_helpers INFO     No processes to clean up, exiting


In [47]:
!poetry run python3 succeeds_second.py resume start

[35m[1mMetaflow 2.2.5.post23+git20868e9[0m[35m[22m executing [0m[31m[1mSucceedsSecondDemo[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-17 18:14:07.621 [0m[22mGathering required information to resume run (this may take a bit of time)...[0m
[35m2021-01-17 18:14:07.631 [0m[1mWorkflow starting (run-id 1610936047620343):[0m
[35m2021-01-17 18:14:07.639 [0m[32m[1610936047620343/start/1 (pid 30615)] [0m[1mTask is starting.[0m
[35m2021-01-17 18:14:08.711 [0m[32m[1610936047620343/start/1 (pid 30615)] [0m[22m01-17 18:14:08 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-17 18:14:08.737 [0m[32m[1610936047620343/start/1 (pid 30615)] [0m

[35m2021-01-17 18:14:11.801 [0m[32m[1610936047620343/middle/2 (pid 30649)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 18:14:11.808 [0m[32m[1610936047620343/end/3 (pid 30689)] [0m[1mTask is starting.[0m
[35m2021-01-17 18:14:12.995 [0m[32m[1610936047620343/end/3 (pid 30689)] [0m[22m01-17 18:14:12 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes[0m
[35m2021-01-17 18:14:13.003 [0m[32m[1610936047620343/end/3 (pid 30689)] [0m[22m01-17 18:14:13 doltpy.core.system_helpers INFO     No processes to clean up, exiting[0m
[35m2021-01-17 18:14:13.166 [0m[32m[1610936047620343/end/3 (pid 30689)] [0m[1mTask finished successfully.[0m
[35m2021-01-17 18:14:13.166 [0m[1mDone![0m
01-17 18:14:13 doltpy.core.system_helpers INFO     Before exiting cleaning up child processes
01-17 18:14:13 doltpy.core.system_helpers INFO     No processes to clean up, exiting
