This will only run in Sagemaker Studio

### Downloading csv file

In [5]:
# reading data from URL
import pandas as pd
url = "https://github.com/h2oai/h2o-2/raw/master/smalldata/bank-additional-full.csv"
data = pd.read_csv(url, sep=';')
#display(data.head(3))

# Randomly sample 70% of your dataframe
df = data.sample(frac=0.7)
display(df.head(3))

# saving file in current path
df.to_csv('bank-additional-full.csv')

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
20910,30,self-employed,married,university.degree,no,yes,yes,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,no
16612,29,blue-collar,married,basic.9y,no,no,no,cellular,jul,wed,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
24287,42,blue-collar,married,basic.4y,unknown,yes,no,cellular,nov,mon,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


### Reading CSV file

In [6]:
import os
import pandas as pd
df = pd.read_csv('bank-additional-full.csv')#,sep = ';')
df.head(2)

Unnamed: 0.1,Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,20910,30,self-employed,married,university.degree,no,yes,yes,cellular,aug,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,no
1,16612,29,blue-collar,married,basic.9y,no,no,no,cellular,jul,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


### Uploading dataset to Amazon S3 bucket.
We'll use a default bucket automatically created by SageMaker in the region we're running in.

We'll just add a prefix to keep things nice and tidy

In [7]:
## No need to upload it in bucket
## If you face some error than uncomment and run this cell also

#import sagemaker
#prefix = 'sagemaker/DEMO-smprocessing/input'
#input_data = sagemaker.Session().upload_data(path='./bank-additional-full.csv',
#                                             key_prefix=prefix)

### One extra step here
for some reason get_execution_role() don't work locally for that we have few other ways to do that 

In [8]:
# 1st way
# Open terminal
# Activate sagemaker envoirnment
# and paste below command

# aws iam list-roles|grep SageMaker-Execution

# using cli it will list all the role you are using
# copy paste any one of them like this

# role = 'arn:aws:iam::603012210694:role/service-role/AmazonSageMaker-ExecutionRole-20210304T123661'

In [9]:
# 2nd way
# Automatically select Current role and seleting it
# Code link :
# https://github.com/aws/sagemaker-python-sdk/issues/300

import boto3
region = boto3.Session().region_name

def resolve_sm_role():
    client = boto3.client('iam', region_name=region)
    response_roles = client.list_roles(
        PathPrefix='/',
        # Marker='string',
        MaxItems=999
    )
    for role in response_roles['Roles']:
        if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
            #print('Resolved SageMaker IAM Role to: ' + str(role))
            return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')

#resolve_sm_role()
#role = get_execution_role()
role = resolve_sm_role()
print(role)

'arn:aws:iam::603012210694:role/service-role/AmazonSageMaker-ExecutionRole-20210304T123661'

### Running a processing script:
We use the SKLearnProcessor object from the SageMaker SDK to configure the processing job:

In [10]:
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
sklearn_processor = SKLearnProcessor(
                    framework_version='0.20.0', # version of scikit-learn we want to use
                    role=role,#sagemaker.get_execution_role(),
                    instance_type='ml.t3.medium', # select instance of your choice
                    instance_count=1) # run instance 1 time

Then, we simply launch the job, passing the name of the script(preprocessing.py),

the dataset input path in S3,

the user-defined dataset paths inside the SageMaker Processing environment, 

and the command-line arguments:

In [11]:
from sagemaker.processing import ProcessingInput,ProcessingOutput
sklearn_processor.run(
    code='preprocessing.py',
    inputs=[ProcessingInput(
    source='bank-additional-full.csv',
    # Our data in Container
    destination='/opt/ml/processing/input')],
    outputs=[ProcessingOutput(
    source='/opt/ml/processing/train',
    output_name='train_data'),ProcessingOutput(
    source='/opt/ml/processing/test',
    output_name='test_data')],
    arguments=['--train-test-split-ratio', '0.2'])


Job Name:  sagemaker-scikit-learn-2021-03-09-12-16-08-917
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-603012210694/sagemaker-scikit-learn-2021-03-09-12-16-08-917/input/input-1/bank-additional-full.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-603012210694/sagemaker-scikit-learn-2021-03-09-12-16-08-917/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-603012210694/sagemaker-scikit-learn-2021-03-09-12-16-08-917/output/train_data', 'LocalPath': '/opt/ml/processin