## Get the AML workspace that has a Synapse Spark pool attached

In [None]:
from azureml.core import Workspace, Experiment, Dataset, Environment

# Fetch the workspace, identified by its subscription, resource group and
# workspace name. Later cells refer to it as `ws`.
ws = Workspace.get(
    subscription_id='1aefdc5e-3a7c-4d71-a9f9-f5d3b03be19a',
    resource_group='feli1daskrg',
    name='ws-dask-feli1',
)
# Bare expression so the notebook renders the workspace as the cell output.
ws

## Use ScriptRunConfig to submit a Scala job to an attached Synapse Spark cluster

In [None]:
import uuid

from azureml.core import ScriptRunConfig
from azureml.core.runconfig import RunConfiguration

# Run configuration that targets the compute named "link-pool".
run_config = RunConfiguration(framework="pyspark")
run_config.target = "link-pool"

# Spark settings for the job, applied one key at a time in the loop below.
spark_settings = {
    "spark.driver.memory": "2g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "2g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
    # This entry can be removed if you are using local jars in the source folder.
    "spark.yarn.dist.jars": "abfss://testfile@feli1devstoragegen2.dfs.core.windows.net/synapse/workspaces/feli1synapsews/batchjobs/testzipfile/scalaproj_2.11-0.1.jar",
}
for setting_name, setting_value in spark_settings.items():
    run_config.spark.configuration[setting_name] = setting_value

# A random UUID suffix gives every submission its own output directory.
dir_name = "multi-{}".format(uuid.uuid4())
input_1 = "abfss://testfile@feli1devstoragegen2.dfs.core.windows.net/synapse/workspaces/feli1synapsews/batchjobs/testmulti/shakespeare.txt"
output = "abfss://testfile@feli1devstoragegen2.dfs.core.windows.net/synapse/workspaces/feli1synapsews/batchjobs/{}/result".format(dir_name)

# Command-line arguments handed to start_script.py; `args` is reused by the
# pipeline step cell further down.
args = ['--input', input_1, '--output', output]
script_run_config = ScriptRunConfig(
    source_directory='.',
    script='start_script.py',
    arguments=args,
    run_config=run_config,
)



In [None]:
from azureml.core import Experiment

# Submit the script run configuration under the 'synapse-spark' experiment
# of the workspace.
exp = Experiment(name='synapse-spark', workspace=ws)
run = exp.submit(config=script_run_config)
# Bare expression so the notebook renders the run as the cell output.
run

## Use SynapseSparkStep in an AML pipeline to add a data prep step on a Synapse Spark cluster

In [None]:
# SynapseSparkStep was never imported in this notebook; import it here so the
# cell does not fail with a NameError.
from azureml.pipeline.steps import SynapseSparkStep

# Extra Spark configuration for the step: the jar to distribute with the job.
configs = {
    "spark.yarn.dist.jars": "abfss://testfile@feli1devstoragegen2.dfs.core.windows.net/synapse/workspaces/feli1synapsews/batchjobs/testzipfile/scalaproj_2.11-0.1.jar",
}

# Pipeline step that runs start_script.py on the 'link-pool' compute with the
# same resource sizing as the ScriptRunConfig cell. `args` is the
# --input/--output argument list built in that earlier cell.
step_1 = SynapseSparkStep(
    name='synapse-spark',
    file='start_script.py',
    source_directory=".",
    arguments=args,
    compute_target='link-pool',
    driver_memory="2g",
    driver_cores=2,
    executor_memory="2g",
    executor_cores=1,
    num_executors=1,
    conf=configs,
)

In [None]:
# Pipeline was never imported in this notebook; import it here so the cell
# does not fail with a NameError.
from azureml.pipeline.core import Pipeline

# Build a single-step pipeline in the workspace and submit it under the
# 'synapse-pipeline' experiment. regenerate_outputs=True is passed so the
# step outputs are regenerated for this run.
pipeline = Pipeline(workspace=ws, steps=[step_1])
pipeline_run = pipeline.submit('synapse-pipeline', regenerate_outputs=True)