In [11]:
from daggerml import Dml, Error, Resource
from dml_util import funkify, S3Store, funk
import os

In [None]:
# Process-wide settings for this notebook; the runner log further down shows
# these same variables being forwarded into the script environment.
os.environ.update(
    {
        "DML_S3_BUCKET": "dml-examples",
        "DML_S3_PREFIX": "clustering",
        "DML_DEBUG": "1",
    }
)

In [22]:
# Directory that is archived and used as the Docker build context.
DOCKER_CONTEXT_DIR = "./dkr-context"
# Host-side AWS credentials file, with "~" expanded to the current user's home.
AWS_CREDS = os.path.expanduser("~/.aws/credentials")
# Extra flags supplied wherever this notebook configures a "docker" funk.
DOCKER_FLAGS = [
    # Read-only bind mount of the host credentials file into the container.
    "-v",
    f"{AWS_CREDS}:/root/.aws/credentials:ro",
    # Point the AWS SDK inside the container at the mounted file.
    "-e",
    "AWS_SHARED_CREDENTIALS_FILE=/root/.aws/credentials",
]

In [37]:
# DaggerML client created for the "tutorial" repo on branch "main".
dml = Dml(repo="tutorial", branch="main")
# New DAG named "ml-example"; the cells below attach their nodes to it.
dag = dml.new("ml-example")
# Store object used below to tar the Docker build context.
# NOTE(review): presumably configured from the DML_S3_* environment variables
# set earlier in this notebook — confirm against dml_util.
s3 = S3Store()


In [42]:

# Path patterns passed to s3.tar as `excludes`.
# NOTE(review): presumably glob-style patterns for files left out of the
# build-context tarball — confirm the matching rules against dml_util.
excludes = [
    "tests/*.py",
    ".pytest_cache",
    ".ruff_cache",
    "__pycache__",
    "examples",
    ".venv",
    "**/.venv",
]

# Tar up DOCKER_CONTEXT_DIR and keep the result on the DAG as `tar`.
dag.tar = s3.tar(dml, DOCKER_CONTEXT_DIR, excludes=excludes)
# Register dml_util's docker-build funk on the DAG as `dkr`.
dag.dkr = funk.dkr_build
# Call the build funk with the tarball and extra build flags.
# NOTE(review): the unit of `timeout` is not visible here (presumably
# milliseconds) — confirm.
dag.img = dag.dkr(
    dag.tar,
    ["--platform", "linux/amd64"],
    timeout=60_000,
)
# NOTE(review): `fn` is not defined in any cell visible in this notebook. If it
# only exists as leftover kernel state, this raises NameError on
# "Restart Kernel -> Run All" — confirm where `fn` comes from.
dag.fn = funkify(
    fn,
    "docker",
    {"image": dag.img.value(), "flags": DOCKER_FLAGS},
    adapter="local",
)

[d01113ea] INFO dml_util.adapters.base: CloudWatch logging not enabled due to AWS access error: An error occurred (ResourceNotFoundException) when calling the DescribeLogStreams operation: The specified log group does not exist.
[d01113ea] DEBUG dml_util.adapters.base: reading data from <_io.TextIOWrapper name='<stdin>' mode='r' encoding='utf-8'>
[d01113ea] INFO dml_util.runners.base: getting info from 'LocalState'
[d01113ea] DEBUG dml_util.runners.local: Submitting script to local runner
[d01113ea] DEBUG dml_util.runners.local: Environment for script: {"DML_S3_BUCKET": "dml-examples", "DML_S3_PREFIX": "clustering", "DML_LOG_GROUP": "dml", "DML_RUN_ID": "d01113ea", "DML_DEBUG": "1", "DML_INPUT_LOC": "/tmp/dml.j_o7u1f8/input.dump", "DML_OUTPUT_LOC": "/tmp/dml.j_o7u1f8/output.dump", "DML_LOG_STDOUT": "/run/a2b82e541ec19cfcc8021b3794b92c02/stdout", "DML_LOG_STDERR": "/run/a2b82e541ec19cfcc8021b3794b92c02/stderr"}
[d01113ea] INFO dml_util.runners.local: Process 151040 started in /tmp/dml.j

In [27]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-l

In [46]:
@funkify(uri="docker", data={"image": dag.img.value(), "flags": DOCKER_FLAGS})
@funkify
def load_data(dag):
    """Split the iris dataset and upload each split as a parquet file.

    The first call argument (``dag.argv[1]``) is expected to be a mapping with
    a ``"random_state"`` entry, which is forwarded to ``train_test_split``.

    Returns:
        dict: keys ``"X_train"``, ``"X_test"``, ``"y_train"`` and ``"y_test"``,
        each mapped to whatever ``S3Store.put`` returns for the uploaded
        ``.parquet`` file.
    """
    # Imports are kept inside the body.
    # NOTE(review): presumably because funkify ships this function to run
    # inside the Docker image rather than in the notebook kernel — confirm.
    from tempfile import NamedTemporaryFile

    import pandas as pd
    from dml_util import S3Store
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    store = S3Store()
    params = dag.argv[1].value()
    features, target = load_iris(as_frame=True, return_X_y=True)
    splits = train_test_split(features, target, random_state=params["random_state"])

    uploaded = {}
    split_names = ("X_train", "X_test", "y_train", "y_test")
    for name, split in zip(split_names, splits):
        # The target splits come back as Series; wrap them in a one-column
        # frame named "class" before writing.
        frame = split.to_frame("class") if isinstance(split, pd.Series) else split
        with NamedTemporaryFile() as tmp:
            # The file is written and uploaded by path, so the open handle's
            # position is never used and no seek is needed.
            frame.to_parquet(tmp.name)
            uploaded[name] = store.put(filepath=tmp.name, suffix=".parquet")

    return uploaded

# Attach the funkified loader to the DAG so it can be invoked as dag.load_data.
dag.load_data = load_data
# Argument passed to the loader; load_data reads params["random_state"] for the split.
params = {"random_state": 2}
# Invoke the loader; the result is stored on the DAG under the name "iris".
data = dag.load_data(params, name="iris")
    

In [None]:
@funkify
def fit_hdbscan():
    """Placeholder step (per its name, intended to fit HDBSCAN); not implemented yet."""
    # TODO: find a way to serialize the fitted model (pickle?).
    # TODO: send the serialized model to the S3 store.
    # NOTE(review): load_data takes a `dag` argument but this stub takes none —
    # confirm the signature funkify expects before implementing.
    pass

In [49]:
# Display the node stored on the DAG under "iris" (the name passed to dag.load_data).
dag.iris

DictNode(node/e7fc3349e17724d182f1d623f348a94a)

In [52]:
# List the keys of the result returned by the dag.load_data call.
data.keys()

['X_test', 'X_train', 'y_test', 'y_train']