# SageMaker Spark

https://docs.aws.amazon.com/sagemaker/latest/dg/use-spark-processing-container.html


In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role


# IAM role that the notebook is running under; reused by the processor below.
role = get_execution_role()

# Show the SDK version and the resolved role, one per line.
print(sagemaker.__version__, role, sep="\n")

In [8]:
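# Optional Spark/Hive configuration, kept for reference. It is currently
# disabled: the matching `configuration=configuration` argument in the run
# cells below is commented out as well.
# configuration = [{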
#     "Classification": "spark-defaults",
#     "Properties": {"spark.executor.extraClassPath": "/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar",
#                    "spark.driver.extraClassPath": "/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar"}
# }, {
#     "Classification": "spark-hive-site",
#     "Properties": {"hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"}
# }]

In [9]:
%%writefile spark_code.py

import pyspark
import argparse
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Command-line interface: one optional ``--schema`` argument ("develop" by
# default). ``type=str`` already guarantees a string value.
parser = argparse.ArgumentParser()
parser.add_argument("--schema", type=str, default="develop")
args = parser.parse_args()

schema = args.schema
print(schema)

# Build (or reuse) a Hive-enabled Spark session whose metastore client
# factory is set to the AWS Glue Data Catalog implementation.
spark = (
    SparkSession.builder
    .appName("PySparkApp")
    .config(
        "hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    )
    .enableHiveSupport()
    .getOrCreate()
)


Overwriting spark_code.py


In [None]:
from sagemaker.spark.processing import PySparkProcessor

# Job-level settings used by this and the following cells.
schema = "hyun2"               # forwarded to spark_code.py as --schema
instance_type = "ml.m5.xlarge"
volume_size = 128              # passed as volume_size_in_gb

# Keyword arguments for the processor, gathered in one place.
_processor_kwargs = {
    "base_job_name": "spark-preprocessor",
    "framework_version": "3.0",
    "role": role,
    "instance_type": instance_type,
    "instance_count": 2,
    "volume_size_in_gb": volume_size,
    "max_runtime_in_seconds": 1200,
}

spark_processor = PySparkProcessor(**_processor_kwargs)




In [None]:

# Option 1: launch the Spark application right away via the processor.
# Extra dependencies shipped with the job (paths as given here — presumably
# relative to the notebook's working directory; verify the files exist).
submit_py_files = ["some_files.py"]
submit_jars = ["delta-core_2.12-0.8.0.jar"]

spark_processor.run(
    submit_app="spark_code.py",
    arguments=["--schema", schema],
    submit_py_files=submit_py_files,
    submit_jars=submit_jars,
    # configuration=configuration,  # optional; currently disabled
)

In [None]:
from sagemaker.workflow.steps import ProcessingStep

# Option 2: wrap the same Spark application in a pipeline ProcessingStep.
# Extra dependencies shipped with the job (paths as given here — presumably
# relative to the notebook's working directory; verify the files exist).
submit_jars = ["delta-core_2.12-0.8.0.jar"]
submit_py_files = ["some_files.py"]

# Let the processor translate the Spark-specific arguments into the generic
# code / inputs / outputs / arguments that a ProcessingStep expects.
run_args = spark_processor.get_run_args(
    submit_app="spark_code.py",
    submit_jars=submit_jars,
    submit_py_files=submit_py_files,
    #     configuration=configuration,
    arguments=["--schema", schema])

# Forward every field of run_args. `outputs` was previously omitted, which
# would silently drop any outputs generated by the processor.
step_spark_process = ProcessingStep(
    name="spark-process",
    processor=spark_processor,
    inputs=run_args.inputs,
    outputs=run_args.outputs,
    job_arguments=run_args.arguments,
    code=run_args.code
)
