# Learning Journey: Feature Engineering with SageMaker Processing

### Upload the data to an S3 bucket

In [5]:
# Upload the raw dataset to the SageMaker session's default S3 bucket.
from sagemaker import Session
import sagemaker

# A single Session serves both the bucket lookup and the upload, so the two
# always share one configuration and no second session is built needlessly.
sess = Session()
bucket = sess.default_bucket()

# Key prefix under which every artifact of this activity is stored.
prefix = 'mlops/sagemaker-processing-activity'

# upload_data returns the S3 URI of the uploaded object; later cells pass it
# to the processing job as its input source.
input_source = sess.upload_data(
    './bank-additional-full.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/input_data',
)
input_source

's3://sagemaker-us-east-1-607119565685/mlops/sagemaker-processing-activity/input_data/bank-additional-full.csv'

### Define the IAM role

In [8]:
# Look up the IAM execution role attached to this notebook environment and
# display its ARN as the cell output.
import re
import boto3
from sagemaker import get_execution_role

role = get_execution_role()
role

'arn:aws:iam::607119565685:role/LearnMlOpsSageMakerExecutionRole'

### Fetch Preprocessing Script

In [16]:
!wget --no-check-certificate https://raw.githubusercontent.com/garganshulgarg/learn-mlops-with-sagemaker/refs/heads/main/applications/feature-engineering/feature-engg-script.py

--2024-11-04 15:14:33--  https://raw.githubusercontent.com/garganshulgarg/learn-mlops-with-sagemaker/refs/heads/feature-engg-sagemaker-processing/applications/feature-engineering/feature-engg-script.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 3285 (3.2K) [text/plain]
Saving to: ‘feature-engg-script.py’


2024-11-04 15:14:33 (46.7 MB/s) - ‘feature-engg-script.py’ saved [3285/3285]



In [17]:
# S3 destinations for the three dataset splits produced by the processing job,
# all located under the shared bucket/prefix.
train_path, validation_path, test_path = (
    f"s3://{bucket}/{prefix}/{split}"
    for split in ("train", "validation", "test")
)

In [18]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role


# Processor that runs a script inside the scikit-learn container on a single
# ml.m5.large instance, using this notebook's execution role.
sklearn_processor = SKLearnProcessor(
    base_job_name='mlops-sklearnprocessing',
    framework_version="0.23-1",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
)

# Launch the job. The uploaded CSV is supplied under /opt/ml/processing/input,
# and each split directory under /opt/ml/processing/output is paired with its
# own S3 destination.
# NOTE(review): "ShardedByS3Key" only matters when instance_count > 1; confirm
# it is still the intended distribution type if the job is ever scaled out.
sklearn_processor.run(
    code='feature-engg-script.py',
    inputs=[
        ProcessingInput(
            source=input_source,
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
            s3_data_distribution_type="ShardedByS3Key",
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name=f"{split}_data",
            source=f"/opt/ml/processing/output/{split}",
            destination=s3_destination,
        )
        for split, s3_destination in (
            ("train", train_path),
            ("validation", validation_path),
            ("test", test_path),
        )
    ],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name mlops-sklearnprocessing-2024-11-04-15-14-49-941


...........[34m## Processing completed. Exiting.[0m



### Validate the processed data

In [19]:
!aws s3 ls $train_path/

2024-11-04 15:16:54    3545009 train_script.csv


In [20]:
!aws s3 ls $test_path/

2024-11-04 15:16:54     498229 test_script_x.csv
2024-11-04 15:16:54       8238 test_script_y.csv
