In [2]:
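# Optional: uncomment to install/upgrade the SageMaker Python SDK (v2 or newer)
# into this kernel's environment before running the notebook.
# %pip install -U "sagemaker>2.0"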

In [3]:
import boto3
import sagemaker

# Shared SageMaker context used by the cells below.
sagemaker_session = sagemaker.Session()
# Bucket that receives the job output and the Spark event logs.
bucket = sagemaker_session.default_bucket()
# IAM role the processing job runs under.
role = sagemaker.get_execution_role()
# Read the region from the SageMaker session itself rather than from a second,
# independent boto3 session, so `region` (used for the console links) always
# matches the session that owns `bucket`.
region = sagemaker_session.boto_region_name


In [4]:
from sagemaker.spark.processing import PySparkProcessor

# Configure the Spark processing cluster. Nothing is launched here; the job is
# only submitted when `processor.run(...)` is called.
#
# The pinned container image is hosted in the us-east-1 registry, so it is only
# used when the notebook itself runs in us-east-1. In any other region
# `image_uri` is left as None and the SDK resolves the regional Spark image
# from `framework_version`.
# NOTE(review): the SDK-resolved tag may not be identical to the pinned
# "3.2-cpu-py39-v1.0" build -- confirm this is acceptable outside us-east-1.
PINNED_SPARK_IMAGE_REGION = "us-east-1"
PINNED_SPARK_IMAGE_URI = "173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.2-cpu-py39-v1.0"

spark_image_uri = PINNED_SPARK_IMAGE_URI if region == PINNED_SPARK_IMAGE_REGION else None

processor = PySparkProcessor(
    base_job_name="sm-spark-3-2-py39",
    framework_version="3.2",
    image_uri=spark_image_uri,
    role=role,
    instance_count=6,
    instance_type="ml.m5.24xlarge",
    max_runtime_in_seconds=86400,  # 24 hours
    volume_size_in_gb=200,
)


# The Spark application submitted by `processor.run(...)` below lives at:
#   ./code/preprocess-parquet.py

# Command-line arguments forwarded to code/preprocess-parquet.py: the input
# bucket/key prefix to read from and the output bucket/key prefix to write to.
job_arguments = [
    "--s3_input_bucket", "dsoaws",
    "--s3_input_key_prefix", "nyc-taxi-orig-cleaned-split-parquet-per-year",
    "--s3_output_bucket", bucket,
    "--s3_output_key_prefix", "nyc-taxi-orig-cleaned-split-parquet-per-year-output",
]

# Spark event logs are written next to the output, under a separate "-logs" prefix.
spark_event_logs_uri = "s3://{}/{}/spark_event_logs".format(
    bucket,
    "nyc-taxi-orig-cleaned-split-parquet-per-year-logs",
)

# Submit without blocking the notebook and without streaming the job logs.
processor.run(
    submit_app="code/preprocess-parquet.py",
    arguments=job_arguments,
    spark_event_logs_s3_uri=spark_event_logs_uri,
    logs=False,
    wait=False,
)

INFO:sagemaker:Creating processing-job with name sm-spark-3-2-py39-2023-02-03-23-12-36-873


In [5]:
# Name of the most recently submitted job. The job handle already carries the
# name it was created with, so read it directly instead of issuing a remote
# describe() call just to get the name back.
processing_job_name = processor.jobs[-1].job_name
print(processing_job_name)

sm-spark-3-2-py39-2023-02-03-23-12-36-873


In [6]:
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Render a link to this job's page in the SageMaker console for the current
# region. `_blank` is the HTML keyword for "open in a new tab"; a bare "blank"
# would instead target a window that happens to be named "blank".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(
            region, processing_job_name
        )
    )
)

In [7]:
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Render a link to the CloudWatch log streams whose names start with this job's
# name, inside the /aws/sagemaker/ProcessingJobs log group. `_blank` is the HTML
# keyword for "open in a new tab"; a bare "blank" would instead target a window
# that happens to be named "blank".
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, processing_job_name
        )
    )
)