# Prepare your data for training

In this lab you will use a SageMaker Processing job to convert your raw data into a set of train, test, and validation datasets that can be used to train a model.

In [2]:
# Install the pinned SageMaker SDK with the %pip magic so the package lands in
# the environment of the running kernel rather than whatever `pip` the shell finds.
%pip install sagemaker==2.117

Keyring is skipped due to an exception: 'keyring.backends'
Collecting sagemaker==2.117
  Using cached sagemaker-2.117.0-py2.py3-none-any.whl
Collecting importlib-metadata<5.0,>=1.4.0
  Using cached importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata, sagemaker
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 5.1.0
    Uninstalling importlib-metadata-5.1.0:
      Successfully uninstalled importlib-metadata-5.1.0
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.120.0
    Uninstalling sagemaker-2.120.0:
      Successfully uninstalled sagemaker-2.120.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is

In [3]:
import json
import sagemaker
import boto3
import numpy as np                                
import pandas as pd                               
import os                                      
import time
from sagemaker import get_execution_role
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Get user profile name.
# The metadata file is read inside a `with` block so the handle is closed
# as soon as the JSON has been parsed instead of staying open for the
# lifetime of the kernel.
with open('/opt/ml/metadata/resource-metadata.json') as metadataFile:
    metadata = json.load(metadataFile)
userprofileName = metadata['UserProfileName']

# Get default bucket
session = sagemaker.Session()
default_bucket = session.default_bucket()
bucket_prefix = "mlops-workshop/"
# Get SageMaker Execution Role
role = get_execution_role()
region = boto3.Session().region_name

In [4]:
# Build a boto3 session pinned to the notebook's region, then derive the
# low-level SageMaker API client from that session.
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(
    'sagemaker',
    region_name=region,
)

In [5]:
!mkdir -p scripts

In [6]:
%%writefile ./scripts/preprocessing.py
import argparse
import os

import numpy as np
import pandas as pd

def process(df, random_state=42):
    """Convert the raw frame into a shuffled, fully numeric, label-first table.

    Steps:
      1. Add two 0/1 indicator columns: ``no_previous_contact``
         (``pdays == 999``) and ``not_working`` (``job`` is student,
         retired or unemployed).
      2. Drop the columns listed in ``toremove`` (missing ones are ignored).
      3. One-hot encode every remaining non-numeric column.
      4. Move ``y_yes`` to the first column and drop ``y_no``.
      5. Shuffle the rows and reset the index.

    Args:
        df: Input DataFrame. Must contain ``pdays``, ``job`` and a ``y``
            column whose one-hot encoding yields both ``y_yes`` and ``y_no``.
            The frame passed in is NOT modified.
        random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle
            (and therefore any split made afterwards) is reproducible.
            Pass ``None`` for a different shuffle on every call.

    Returns:
        A new DataFrame with ``y_yes`` as column 0, followed by the numeric
        and one-hot encoded feature columns, in shuffled row order.

    Raises:
        KeyError: if ``pdays``, ``job``, ``y_yes`` or ``y_no`` is unavailable.
    """
    toremove = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    features = df.assign(
        no_previous_contact=(df["pdays"] == 999).astype(int),
        not_working=df["job"].isin(["student", "retired", "unemployed"]).astype(int),
    )

    # Keep only the columns that we need.
    features = features.drop(columns=toremove, errors="ignore")

    # One hot encode. An explicit integer dtype keeps the output 0/1 on every
    # pandas version; newer releases default to bool, which would be written
    # to CSV as True/False.
    encoded = pd.get_dummies(features, dtype=int)

    # Put the label first and drop the redundant complementary label column.
    encoded = pd.concat(
        [encoded["y_yes"], encoded.drop(["y_no", "y_yes"], axis=1)],
        axis=1,
    )

    return encoded.sample(frac=1, random_state=random_state).reset_index(drop=True)

if __name__ == "__main__":
    # Script entry point: read the raw CSV from <input-path>/input, transform
    # it with process(), and write header-less train/validation/test CSVs to
    # <input-path>/train, <input-path>/validation and <input-path>/test.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-path", type=str, default="/opt/ml/processing")
    # parse_known_args: ignore any extra arguments passed to the script.
    args, _ = parser.parse_known_args()

    base_dir = args.input_path

    df = pd.read_csv(
        f"{base_dir}/input/bank-additional-full.csv",
        header=0
    )

    # Call the helper method
    df = process(df)

    # 70% / 15% / 15% split by row position. Plain positional slicing gives
    # the same partitions as np.split without relying on numpy's handling of
    # DataFrames, which is deprecated in recent pandas releases.
    train_end = int(.7 * len(df))
    validation_end = int(.85 * len(df))
    train = df.iloc[:train_end]
    validation = df.iloc[train_end:validation_end]
    test = df.iloc[validation_end:]

    for split_name, split_df in (("train", train), ("validation", validation), ("test", test)):
        # Make sure the destination exists so the script also works when
        # --input-path points at a directory that was not prepared in advance.
        os.makedirs(f"{base_dir}/{split_name}", exist_ok=True)
        split_df.to_csv(f"{base_dir}/{split_name}/{split_name}.csv", header=False, index=False)

Writing ./scripts/preprocessing.py


In [7]:
# Stage the raw CSV in S3 under a per-user "marketing" prefix of the default
# bucket; the returned URI is what the processing job takes as its input.
local_data_path = "bank-additional-full.csv"
base_uri = f"s3://{default_bucket}/{userprofileName}/marketing"

input_data_uri = sagemaker.s3.S3Uploader.upload(local_data_path, base_uri)

In [16]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn container version used by the processing job.
framework_version = "0.23-1"

# Single-instance processor; job names are prefixed with the user profile
# name so that runs from different users are easy to tell apart.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=session,
    framework_version=framework_version,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    base_job_name=f"sklearn-marketing-process-{userprofileName}",
)

In [17]:
# Run scripts/preprocessing.py in the processing container. The raw CSV is
# mounted at /opt/ml/processing/input, and the three directories the script
# writes to are declared as outputs in train, validation, test order.
sklearn_processor.run(
    code="scripts/preprocessing.py",
    inputs=[
        ProcessingInput(
            source=input_data_uri,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(source="/opt/ml/processing/train", output_name="train"),
        ProcessingOutput(source="/opt/ml/processing/validation", output_name="validation"),
        ProcessingOutput(source="/opt/ml/processing/test", output_name="test"),
    ],
)


Job Name:  sklearn-marketing-process-pplhy997-inta-2022-12-15-16-23-27-984
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-222848388999/pplhy997-intact-net-bc8/marketing/bank-additional-full.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-222848388999/sklearn-marketing-process-pplhy997-inta-2022-12-15-16-23-27-984/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ca-central-1-222848388999/sklearn-marketing-process-pplhy997-inta-2022-12-15-16-23-27-984/output/train', 'LocalPa

In [18]:
# Describe the last job recorded in the processor's `jobs` list; the returned
# description is used afterwards to locate the job's S3 outputs.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

In [19]:
# Index the job's outputs by their declared name so each lookup is explicit
# and does not depend on the position of an output within the description.
output_uris = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in preprocessing_job_description['ProcessingOutputConfig']['Outputs']
}
train_uri = output_uris['train']
val_uri = output_uris['validation']
test_uri = output_uris['test']

In [20]:
%store train_uri
%store val_uri
%store test_uri

Stored 'train_uri' (str)
Stored 'val_uri' (str)
Stored 'test_uri' (str)


### Let's view the processed data

Here we download the training dataset and view its first five rows.

In [21]:
!aws s3 cp {train_uri}/train.csv /tmp/train.csv

download: s3://sagemaker-ca-central-1-222848388999/sklearn-marketing-process-pplhy997-inta-2022-12-15-16-23-27-984/output/train/train.csv to ../../../../tmp/train.csv


In [22]:
# The preprocessing script writes its CSVs without a header row, so tell
# pandas not to treat the first data row as column names.
train_df = pd.read_csv('/tmp/train.csv', header = None)

In [23]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1,54,178,4,3,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
1,0,29,69,1,999,1,1,0,1,0,...,0,0,0,0,1,0,0,1,0,0
2,0,57,97,4,999,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,0
3,0,49,136,3,999,0,1,0,1,0,...,0,0,0,0,1,0,0,0,1,0
4,0,37,97,2,999,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


#### You can now move to the next section of the module `Train a model & track your experiments`

The notebook used in that section is `sagemaker-train.ipynb`