In [10]:
!dolt clone vinai/iris-test

cloning https://doltremoteapi.dolthub.com/vinai/iris-test
[31merror: data repository already exists at iris-test[0m


In [2]:
!dolt clone vinai/iris-model-results

cloning https://doltremoteapi.dolthub.com/vinai/iris-model-results
Retrieving remote informatio0 of 63 chunks complete. 0 chunks being downloaded currently0 of 63 chunks complete. 4 chunks being downloaded currently0 of 63 chunks complete. 8 chunks being downloaded currently4 of 63 chunks complete. 4 chunks being downloaded currently8 of 63 chunks complete. 0 chunks being downloaded currently8 of 63 chunks complete. 2 chunks being downloaded currently10 of 63 chunks complete. 0 chunks being downloaded currentl10 of 63 chunks complete. 4 chunks being downloaded currentl14 of 63 chunks complete. 0 chunks being downloaded currentl14 of 63 chunks complete. 4 chunks being downloaded currentl18 of 63 chunks complete. 0 chunks being downloaded currentl18 of 63 chunks complete. 1 chunks being downloaded currentl19 of 63 chunks complete. 0 chunks being downloaded currentl19 of 63 chunks complete. 4 chunks being downloaded currentl23 of 63 chunks complete. 0 chunks being downloaded currentl23 of

In [4]:
# Print the source of the Metaflow flow (iris_demo.py) that is executed in the
# following cell.
!cat iris_demo.py

from metaflow import FlowSpec, step, DoltDT
import pandas as pd
import pickle
from sklearn import tree

class DoltMLDemoFlow(FlowSpec):
    @step
    def start(self):
        # Start by getting original dataset
        with DoltDT(run=self, doltdb_path='iris-test') as dolt:
            self.test_set = dolt.read_table('iris-test')

        self.next(self.predict)

    @step
    def predict(self):
        with DoltDT(run=self, doltdb_path='iris-model-results') as dolt:
            self.model = pickle.load(open('model.p', 'rb'))
            self.model_type = 'Decision Tree'

            samples = self.test_set['sample']
            y_true = self.test_set['species']
            y_true = y_true.rename('labels')

            test = self.test_set.drop(columns=['species', 'sample'])
            predictions = pd.Series(self.model.predict(test))
            predictions = predictions.rename('predictions')

            self.result = pd.concat([samples, y_true, predictio

In [5]:
# Run the flow defined in iris_demo.py inside the Poetry-managed environment.
!poetry run python3 iris_demo.py run

[35m[1mMetaflow 2.2.5.post14+git4337f78[0m[35m[22m executing [0m[31m[1mDoltMLDemoFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:max-hoffman[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[1m    Pylint is happy![K[0m[32m[1m[0m
[35m2021-01-14 12:05:54.259 [0m[1mWorkflow starting (run-id 1610654754251345):[0m
[35m2021-01-14 12:05:54.265 [0m[32m[1610654754251345/start/1 (pid 2731)] [0m[1mTask is starting.[0m
[35m2021-01-14 12:05:55.315 [0m[32m[1610654754251345/start/1 (pid 2731)] [0m[22m01-14 12:05:55 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306[0m
[35m2021-01-14 12:05:55.587 [0m[32m[1610654754251345/start/1 (pid 2731)] [0m[22m01-14 12:05:55 doltpy.core.dolt INFO     * master                                        	cnt6q9n22svhdvb1n3g90kk43k3b9aol

In [28]:
import time
from dataclasses import dataclass, field
from typing import Optional

import pandas as pd
from doltpy.core import Dolt
from doltpy.core.read import read_table_sql
from metaflow import Run

@dataclass
class Read:
    """A table that a flow run read, together with its metadata row.

    `DoltRun.reads` builds instances by unpacking one row of the `metadata`
    table (the column names match the field names below) and attaching the
    table contents loaded at the recorded commit as `data`.
    """

    flow_name: str
    run_id: str
    step_name: str
    task_id: str
    table_name: str
    kind: str  # "read" for rows produced by DoltRun.reads
    data: pd.DataFrame  # contents of `table_name` as loaded by the caller
    database: str = "."  # path of the Dolt database holding the table
    commit: Optional[str] = None  # Dolt commit the table was read at, if known
    # default_factory evaluates time.time() for every new instance; a plain
    # `= time.time()` default would be computed once, when the class is defined.
    timestamp: float = field(default_factory=time.time)
        
@dataclass
class Write:
    """A table that a flow run wrote, together with its metadata row.

    `DoltRun.writes` builds instances by unpacking one row of the `metadata`
    table (the column names match the field names below) and attaching the
    table contents loaded at the recorded commit as `data`.
    """

    flow_name: str
    run_id: str
    step_name: str
    task_id: str
    table_name: str
    kind: str  # "write" for rows produced by DoltRun.writes
    data: pd.DataFrame  # contents of `table_name` as loaded by the caller
    database: str = "."  # path of the Dolt database holding the table
    commit: Optional[str] = None  # Dolt commit the table was written at, if known
    # default_factory evaluates time.time() for every new instance; a plain
    # `= time.time()` default would be computed once, when the class is defined.
    timestamp: float = field(default_factory=time.time)
        
class DoltRun(object):
    """Look up the Dolt tables that one flow run read and wrote.

    The `metadata` table of the Dolt database in the current directory is
    queried for rows matching `flow_name` and `run_id`; each referenced table
    is then loaded `AS OF` the commit recorded in its row.
    """

    def __init__(self, flow_name, run_id):
        """Store the run identity and open the metadata database at "."."""
        self.flow_name = flow_name
        self.run_id = run_id
        self.db_cache = {}
        self.metadb = Dolt(".")
        # Dolt handles keyed by database path so each database is opened once.
        self.db_cache["."] = self.metadb

    @property
    def steps(self):
        # use regular Client
        pass

    @property
    def reads(self):
        """Return a list of `Read` objects, one per `kind = "read"` metadata row."""
        return self._load_records("read", Read)

    @property
    def writes(self):
        """Return a list of `Write` objects, one per `kind = "write"` metadata row."""
        return self._load_records("write", Write)

    def _get_db(self, db_name):
        """Return the cached Dolt handle for `db_name`, opening it on first use."""
        db = self.db_cache.get(db_name)
        if db is None:
            db = Dolt(db_name)
            self.db_cache[db_name] = db
        return db

    def _load_records(self, kind, record_cls):
        """Load every table referenced by this run's metadata rows of `kind`.

        Args:
            kind: value matched against the `kind` column ("read" or "write").
            record_cls: class instantiated per row as
                `record_cls(data=<table>, **<metadata row>)`.

        Returns:
            A list of `record_cls` instances in metadata-row order; empty when
            no row matches.
        """
        # NOTE(review): flow_name and run_id are interpolated straight into the
        # SQL text. Only pass trusted values until the query is parameterized.
        filters = f"flow_name = \"{self.flow_name}\""
        filters += f" AND run_id = \"{self.run_id}\""
        filters += f" AND kind = \"{kind}\""
        df = read_table_sql(self.metadb, f"SELECT * FROM `metadata` WHERE {filters}")

        res = []
        for record in df.to_dict("records"):
            db_name = record["database"]
            commit = record["commit"]
            table_name = record["table_name"]
            # Echo which table version is being loaded (visible in cell output).
            print(db_name, commit, table_name)
            db = self._get_db(db_name)
            table = read_table_sql(db, f"SELECT * FROM `{table_name}` AS OF \"{commit}\"")
            res.append(record_cls(data=table, **record))

        return res


In [30]:
# Inspect one specific run: its reads and writes are looked up in the local
# `metadata` table by this flow_name and (hard-coded) run_id.
d = DoltRun(flow_name="DoltMLDemoFlow", run_id="1610920919008498")

01-17 14:26:08 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306


In [32]:
# Show the table loaded for the first "read" metadata row of this run.
d.reads[0].data

01-17 14:26:27 doltpy.core.dolt INFO     flow_name,run_id,step_name,task_id,kind,database,table_name,commit,timestamp
DoltMLDemoFlow,1610920919008498,start,1,read,iris-test,iris-test,cnt6q9n22svhdvb1n3g90kk43k3b9aol,1.610921e+09

01-17 14:26:27 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:26:27 doltpy.core.dolt INFO     sample,sepal_length,sepal_width,petal_length,petal_width,species
3,4.7,3.2,1.3,0.2,setosa
5,5,3.6,1.4,0.2,setosa
10,4.9,3.1,1.5,0.1,setosa
29,5.2,3.4,1.4,0.2,setosa
32,5.4,3.4,1.5,0.4,setosa
35,4.9,3.1,1.5,0.1,setosa
40,5.1,3.4,1.5,0.2,setosa
42,4.5,2.3,1.3,0.3,setosa
51,7,3.2,4.7,1.4,versicolor
58,4.9,2.4,3.3,1,versicolor
60,5.2,2.7,3.9,1.4,versicolor
62,5.9,3,4.2,1.5,versicolor
63,6,2.2,4,1,versicolor
65,5.6,2.9,3.6,1.3,versicolor
67,5.6,3,4.5,1.5,versicolor
69,6.2,2.2,4.5,1.5,versicolor
70,5.6,2.5,3.9,1.1,versicolor
73,6.3,2.5,4.9,1.5,versicolor
74,6.1,2.8,4.7,1.2,versicolor
75,6.4,2.9,4.3,1.3,versicolor
13

iris-test cnt6q9n22svhdvb1n3g90kk43k3b9aol iris-test


Unnamed: 0,sample,sepal_length,sepal_width,petal_length,petal_width,species
0,3,4.7,3.2,1.3,0.2,setosa
1,5,5.0,3.6,1.4,0.2,setosa
2,10,4.9,3.1,1.5,0.1,setosa
3,29,5.2,3.4,1.4,0.2,setosa
4,32,5.4,3.4,1.5,0.4,setosa
5,35,4.9,3.1,1.5,0.1,setosa
6,40,5.1,3.4,1.5,0.2,setosa
7,42,4.5,2.3,1.3,0.3,setosa
8,51,7.0,3.2,4.7,1.4,versicolor
9,58,4.9,2.4,3.3,1.0,versicolor


In [33]:
# Show the table loaded for the first "write" metadata row of this run.
d.writes[0].data

01-17 14:26:32 doltpy.core.dolt INFO     flow_name,run_id,step_name,task_id,kind,database,table_name,commit,timestamp
DoltMLDemoFlow,1610920919008498,predict,2,write,iris-model-results,result,1l7gietmsfm60kclhmip6gbortiotb0g,1.610921e+09

01-17 14:26:32 doltpy.core.dolt INFO     Creating engine for Dolt SQL Server instance running on 127.0.0.1:3306
01-17 14:26:32 doltpy.core.dolt INFO     sample,labels,predictions
3,setosa,setosa
5,setosa,setosa
10,setosa,setosa
29,setosa,setosa
32,setosa,setosa
35,setosa,setosa
40,setosa,setosa
42,setosa,setosa
51,versicolor,versicolor
58,versicolor,versicolor
60,versicolor,versicolor
62,versicolor,versicolor
63,versicolor,versicolor
65,versicolor,versicolor
67,versicolor,versicolor
69,versicolor,versicolor
70,versicolor,versicolor
73,versicolor,versicolor
74,versicolor,versicolor
75,versicolor,versicolor
79,versicolor,versicolor
89,versicolor,versicolor
91,versicolor,versicolor
104,virginica,virginica
115,virginica,virginica
121,virginica,virginica
1

iris-model-results 1l7gietmsfm60kclhmip6gbortiotb0g result


Unnamed: 0,sample,labels,predictions
0,3,setosa,setosa
1,5,setosa,setosa
2,10,setosa,setosa
3,29,setosa,setosa
4,32,setosa,setosa
5,35,setosa,setosa
6,40,setosa,setosa
7,42,setosa,setosa
8,51,versicolor,versicolor
9,58,versicolor,versicolor
