In [None]:
!pip install -U "sagemaker>2.0"

In [None]:
import boto3
import sagemaker

# Shared handles used by the cells further down.
sagemaker_session = sagemaker.Session()

# IAM role passed to the Spark processing job.
role = sagemaker.get_execution_role()

# Region name, used when building the AWS console links at the end of the notebook.
region = boto3.Session().region_name

# Bucket that receives the processed data and the Spark event logs.
bucket = sagemaker_session.default_bucket()


## Write the PySpark script

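The cell below contains the source of the preprocessing script. The `%%writefile` cell magic saves it locally as `preprocess-nyctaxi-parquet.py`. The script joins the ride-fare and ride-info datasets, drops columns that are not used as features, removes rows with a negative `total_amount`, and writes a 70/30 train/validation split as Parquet.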

In [None]:
%%writefile ./preprocess-nyctaxi-parquet.py
import argparse
import os
import sys
import subprocess

# Install extra Python packages into the interpreter that runs this script,
# before any Spark code executes. check_call raises if pip exits non-zero.
# NOTE(review): xgboost, scikit-learn, pyarrow and pandas are not referenced
# anywhere in the visible script — confirm whether this install is still needed.
pip_install_command = [sys.executable, "-m", "pip", "install"]
pip_install_command += ["xgboost==1.7.2", "scikit-learn", "pyarrow", "pandas"]
subprocess.check_call(pip_install_command)

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    VectorAssembler,
    VectorIndexer,
)
from pyspark.sql.functions import *

def _s3_uri(bucket, *key_parts):
    """Return an ``s3://bucket/key`` URI assembled from a bucket and key segments.

    Leading and trailing slashes are stripped from every segment and empty
    segments are skipped, so a key prefix written as "/raw/" or "" still yields
    a well-formed URI. (``os.path.join`` discards everything before a segment
    that starts with "/", which would silently drop the bucket.)
    """
    cleaned = [segment.strip("/") for segment in [bucket, *key_parts]]
    return "s3://" + "/".join(segment for segment in cleaned if segment)


def main():
    """Join the ride-fare and ride-info datasets and write train/validation splits.

    Command-line arguments (all required):
        --s3_input_bucket / --s3_input_key_prefix: location that contains the
            ``ride-fare`` and ``ride-info`` Parquet datasets.
        --s3_output_bucket / --s3_output_key_prefix: location under which the
            ``train`` and ``validation`` Parquet datasets are written.

    The two inputs are joined on ``ride_id``; identifier, timestamp and flag
    columns are dropped; ``total_amount`` is moved to the first column; rows
    with a negative ``total_amount`` are removed; and the result is split
    70/30 with a fixed seed.
    """
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    # Every location is mandatory: failing here produces a clear usage error
    # instead of a TypeError later when a missing value (None) is used in a URI.
    parser.add_argument("--s3_input_bucket", type=str, required=True, help="s3 input bucket")
    parser.add_argument("--s3_input_key_prefix", type=str, required=True, help="s3 input key prefix")
    parser.add_argument("--s3_output_bucket", type=str, required=True, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, required=True, help="s3 output key prefix")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()

    # recursiveFileLookup makes the reader pick up files in nested directories.
    df_ride_fare = spark.read.option("recursiveFileLookup", "true").parquet(
        _s3_uri(args.s3_input_bucket, args.s3_input_key_prefix, "ride-fare")
    )
    df_ride_fare.show()

    df_ride_info = spark.read.option("recursiveFileLookup", "true").parquet(
        _s3_uri(args.s3_input_bucket, args.s3_input_key_prefix, "ride-info")
    )
    df_ride_info.show()

    df_joined = df_ride_fare.join(df_ride_info, on="ride_id")

    # (Optional) Calculate average total_amount per passenger count
    # df_avg_amount_by_passenger_count = df_joined.select("passenger_count", "total_amount") \
    #                                             .groupby("passenger_count") \
    #                                             .avg("total_amount") \
    #                                             .sort("passenger_count")
    # df_avg_amount_by_passenger_count.show()

    # Drop columns that are not used as features
    df_dropped = (
        df_joined.drop("ride_id")
        .drop("vendor_id")
        .drop("pickup_at")
        .drop("dropoff_at")
        .drop("store_and_fwd_flag")
    )
    df_dropped.show()

    # Move 'total_amount' to the first column
    feature_columns = [name for name in df_dropped.columns if name != "total_amount"]
    df_ordered = df_dropped.select(["total_amount"] + feature_columns)

    # Remove all rows with negative `total_amount`
    df_clean = df_ordered.filter(df_ordered.total_amount >= 0)

    # Split the dataset randomly into 70% for training and 30% for validation.
    # The fixed seed makes the split deterministic.
    df_train, df_validation = df_clean.randomSplit([0.7, 0.3], seed=42)
    print("There are %d train examples and %d validation examples." % (df_train.count(), df_validation.count()))

    # Write out the data. The save mode must be set with .mode(); passing it as
    # .option("mode", ...) does not change the writer's save mode, so an
    # existing output path would still make the write fail.
    df_train.write.mode("overwrite").parquet(
        _s3_uri(args.s3_output_bucket, args.s3_output_key_prefix, "train")
    )
    df_validation.write.mode("overwrite").parquet(
        _s3_uri(args.s3_output_bucket, args.s3_output_key_prefix, "validation")
    )


# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

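## Output location

Each run writes its train/validation data and Spark event logs under a timestamped prefix in the default SageMaker bucket.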

In [None]:
import time

# Timestamped (UTC) key prefix, so each run gets its own output location.
prefix = "gsml-nyc-taxi-full-script-mode/" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
output_key_prefix = prefix + "/output"
spark_event_logs_s3_uri = "s3://{}/{}/spark_event_logs".format(bucket, prefix)

# Echo every location the processing job will be pointed at.
print(
    f"Output S3: s3://{bucket}/{output_key_prefix}",
    f"spark_event_logs_s3_uri: {spark_event_logs_s3_uri}",
    f"Output train data s3 URI: s3://{bucket}/{output_key_prefix}/train/",
    f"Output validation data s3 URI: s3://{bucket}/{output_key_prefix}/validation/",
    sep="\n",
)

In [None]:
from sagemaker.spark.processing import PySparkProcessor

# Configure the Spark processing cluster: three ml.m5.24xlarge instances with
# 200 GB volumes and a 24-hour (86400 s) runtime limit.
# Alternative kept for reference (disabled):
#   image_uri="173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.2-cpu-py39-v1.0"
processor = PySparkProcessor(
    role=role,
    framework_version="3.2",
    base_job_name="sm-spark-3-2-py39",
    instance_type="ml.m5.24xlarge",
    instance_count=3,
    volume_size_in_gb=200,
    max_runtime_in_seconds=86400,
)

# Command-line arguments forwarded to preprocess-nyctaxi-parquet.py.
script_arguments = [
    "--s3_input_bucket", "dsoaws",
    "--s3_input_key_prefix", "nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files",
    "--s3_output_bucket", bucket,
    "--s3_output_key_prefix", output_key_prefix,
]

# Submit the job with wait=False and logs=False; the cells below link to the
# console pages where its progress and logs can be followed.
processor.run(
    submit_app="./preprocess-nyctaxi-parquet.py",
    arguments=script_arguments,
    spark_event_logs_s3_uri=spark_event_logs_s3_uri,
    logs=False,
    wait=False,
)

In [None]:
# Read the job name from the description of the most recently submitted job.
latest_job = processor.jobs[-1]
processing_job_name = latest_job.describe()["ProcessingJobName"]
print(processing_job_name)

In [None]:
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to this job's page in the SageMaker console.
# target="_blank" (with the underscore) opens a new tab; a bare "blank" names a
# single window that every such link would reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(
            region, processing_job_name
        )
    )
)

In [None]:
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the CloudWatch log streams whose names start with this job's name.
# target="_blank" (with the underscore) opens a new tab; a bare "blank" names a
# single window that every such link would reuse.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(
            region, processing_job_name
        )
    )
)