In [68]:
%%writefile score.py

import pandas as pd
import statsmodels.api as sm
from azureml_user.parallel_run import EntryScript


def init():
    """Run once per worker process and emit a start-up log line.

    The message is written through the logger exposed by ``EntryScript``.
    """
    EntryScript().logger.info(
        "This will show up in files under logs/user on the Azure portal."
    )


def run(input_data):
    """Fit an OLS regression for every CSV file in a mini batch.

    Called once per mini batch. Each path in ``input_data`` is read with
    pandas; the columns whose names match the regex ``ColumnA`` are used as
    regressors and the columns matching ``ColumnB`` as the response. No
    intercept column is added to the regressors.

    Args:
        input_data: Sequence of CSV file paths that make up the mini batch.

    Returns:
        A ``pandas.DataFrame`` holding the fitted parameters of every file,
        stacked row-wise in the order the files were given. For a mini batch
        with a single file this is exactly that file's parameter frame; for
        an empty mini batch an empty ``DataFrame`` is returned.
    """
    # Per the original author's note, EntryScript is a singleton, so this is
    # the same instance (and logger) as the one obtained in init().
    entry_script = EntryScript()
    logger = entry_script.logger
    logger.info(f"{__file__}: {input_data}.")
    print("run() is called with: {}.".format(input_data))

    # A mini batch may carry more than one file, so fit each of them instead
    # of only the first entry.
    per_file_params = []
    logger.info("KOT")
    for csv_path in input_data:
        logger.info(f"Processing CSV={csv_path}")
        data = pd.read_csv(csv_path)

        X = data.filter(regex='ColumnA')
        Y = data.filter(regex='ColumnB')
        result = sm.OLS(Y, X).fit()

        # The returned result must be either a DataFrame or a list.
        per_file_params.append(result.params.to_frame())
    logger.info("KOT")

    if not per_file_params:
        # Nothing to score: hand back an empty frame rather than failing on
        # an out-of-range index.
        return pd.DataFrame()

    return pd.concat(per_file_params)

Overwriting score.py


In [53]:
import statsmodels.api as sm
import pandas as pd
from azureml.core import Workspace
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.core.model import Model
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

from datetime import datetime
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

In [69]:
# Connect to the Azure ML workspace; fill in the identifiers before running.
ws = Workspace(
    subscription_id="",  # TODO
    resource_group="",  # TODO
    workspace_name="",  # TODO
)

# Choose the workspace's default datastore, then fetch a handle to it.
ws.set_default_datastore('')  # TODO
data_blob = ws.get_default_datastore()

# The CSV file was uploaded to the `data_container` container on the blob store.
data = Dataset.File.from_files(
    (data_blob, 'data_container/ABCwithData.csv')
)

# Notebook display: show which dataset class was created.
type(data)

azureml.data.file_dataset.FileDataset

In [58]:
# Pipeline output named "scores", stored on the datastore selected above.
output_dir = PipelineData(
    name="scores",
    datastore=data_blob,
)

In [61]:
compute_name = "gpu-engine"

# Look the compute target up by name first; provision it only when the lookup
# raises ComputeTargetException.
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Low-priority cluster pinned to exactly one node.
    config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6",
        vm_priority="lowpriority",
        min_nodes=1,
        max_nodes=1,
    )

    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=config,
    )
    # Block until provisioning finishes, giving up after 20 minutes.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )



In [62]:
# Pip packages installed into the environment used by the parallel run step.
cd = CondaDependencies.create(
    pip_packages=[
        "pandas",
        "statsmodels",
        "scikit-learn==0.20.3",
        "azureml-core",
        "azureml-dataset-runtime[pandas,fuse]",
    ]
)

# Environment named "parallelenv": the dependencies above on the SDK's default
# GPU docker image.
env = Environment(name="parallelenv")
env.python.conda_dependencies = cd
env.docker.base_image = DEFAULT_GPU_IMAGE

In [63]:
# Configuration for the parallel run: score.py from the current directory is
# the entry script, executed by a single process on a single node.
parallel_run_config = ParallelRunConfig(
    # What to run and where.
    source_directory="./",
    entry_script="score.py",
    environment=env,
    compute_target=compute_target,
    node_count=1,
    process_count_per_node=1,
    # How the input is batched and how failures are tolerated.
    mini_batch_size="20000",
    error_threshold=1,
    # Results returned by run() are appended to one output file.
    output_action="append_row",
    append_row_file_name="parallel_run_step.txt",
)

In [65]:
# Expose the dataset to the step under the input name 'data'.
dt = data.as_named_input('data')

# Step name carries a minute-resolution timestamp of when this cell ran.
parallel_step_name = f"batchscoring-{datetime.now():%Y%m%d%H%M}"

# Step reuse is disabled via allow_reuse=False.
batch_score_step = ParallelRunStep(
    name=parallel_step_name,
    parallel_run_config=parallel_run_config,
    inputs=[dt],
    output=output_dir,
    allow_reuse=False,
)


In [None]:
# Build a one-step pipeline and submit it under the named experiment.
pipeline = Pipeline(
    workspace=ws,
    steps=[batch_score_step],
)
pipeline_run = Experiment(
    workspace=ws,
    name="Batch-Scoring-Experiment-Kot",
).submit(pipeline)

# Notebook display of the submitted run.
pipeline_run

# Block until the run finishes, streaming its output.
pipeline_run.wait_for_completion(show_output=True)
