https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

In [1]:
import os
import boto3
import re
import sagemaker
import pandas as pd
import json
import numpy as np
from sagemaker import get_execution_role
from sklearn.metrics import r2_score
import pprint

# Execution role, region, low-level SageMaker client and SDK session used by the rest of the notebook.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()

# Reuse the session created above instead of constructing a second one just to look up the bucket.
bucket = sess.default_bucket()
prefix = 'sagemaker/bmeajg31'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [2]:
# Read the ';'-separated input CSV from S3 and index the frame by its 'date_ws' column.
data = pd.read_csv(
    's3://sagemakerbmeironia/sagemaker_input_data/data.csv',
    sep=';',
).set_index('date_ws')

In [3]:
# Fetch names.json from S3 and parse it into a dict.
s3 = boto3.resource('s3')
names_body = s3.Object('sagemakerbmeironia', 'sagemaker_input_data/names.json').get()['Body']
names_dict = json.loads(names_body.read().decode('utf-8'))

# Filtrado de fondos

In [4]:
# Number of leading columns kept for the experiment.
n = 10
filt_columns = data.columns.tolist()[:n]
# Invert names_dict (value -> key), keeping only the entries whose value names a kept column.
filt_names = {v: k for k, v in names_dict.items() if str(v) in filt_columns}
# Explicit copy: later cleaning steps then work on an independent frame, not on a slice of `data`.
data_filt = data[filt_columns].copy()

In [5]:
# Forward-fill gaps with the last known value, then back-fill whatever is still
# missing at the start of each column. Both operations are idempotent, so one
# pass of each is enough. Rebinding the name (rather than filling in place on a
# column slice of `data`) avoids pandas' SettingWithCopyWarning.
data_filt = data_filt.ffill().bfill()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [6]:
# Log returns: row-to-row difference of the natural log of each column.
log_levels = np.log(data_filt)
returns = log_levels.diff()

In [7]:
# Next-row return aligned with the current row; this is what the model is trained to predict.
ret_f1 = returns.shift(periods=-1)

In [8]:
def _lagged_returns(frame, lag):
    """Return `frame` shifted down by `lag` rows, with '_<lag>' appended to every column name."""
    shifted = frame.shift(lag)
    shifted.columns = [elem + '_' + str(lag) for elem in frame.columns.tolist()]
    return shifted


# Returns observed 1 to 5 rows earlier, used as additional features.
ret_p1, ret_p2, ret_p3, ret_p4, ret_p5 = (
    _lagged_returns(returns, lag) for lag in range(1, 6)
)

In [9]:
# Drop the first 5+1 rows (NaN from diff() plus the five lags) and the last row
# (its shift(-1) target is NaN) so features and target cover the same rows.
valid_rows = slice(5 + 1, -1)
lagged_frames = [returns, ret_p1, ret_p2, ret_p3, ret_p4, ret_p5]
inputs = pd.concat(lagged_frames, axis=1).iloc[valid_rows]
target = ret_f1.iloc[valid_rows]

# División en train, test y validación

In [10]:
# Fractions of the rows assigned to the train and test splits; the remainder is the validation split.
perc_train, perc_test = 0.7, 0.2

In [11]:
# Ordered (unshuffled) split: the first rows go to train, the next to test, the rest to validation.
n_samples = len(inputs)
train_end = int(n_samples * perc_train)
test_end = int(n_samples * (perc_train + perc_test))

train_x, train_y = inputs.iloc[:train_end], target.iloc[:train_end]
test_x, test_y = inputs.iloc[train_end:test_end], target.iloc[train_end:test_end]
val_x, val_y = inputs.iloc[test_end:], target.iloc[test_end:]

# Persistimos los datos

In [12]:
# Persist the column -> name mapping next to the CSVs so the training script can label its metrics.
with open('fund_names.json', 'w') as names_out:
    names_out.write(json.dumps(filt_names))

In [13]:
# Write the train and test splits to local CSV files without the index column.
for split_frame, csv_name in (
    (train_x, 'train_x.csv'),
    (train_y, 'train_y.csv'),
    (test_x, 'test_x.csv'),
    (test_y, 'test_y.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [14]:
def _upload_training_input(local_path):
    """Upload one local file under the 'sagemaker/sklearncontainer' prefix and return what upload_data returns."""
    return sess.upload_data(
        path=local_path,
        bucket=bucket,
        key_prefix='sagemaker/sklearncontainer')


# Each returned value is later passed to the training script as a hyperparameter.
names = _upload_training_input('fund_names.json')
train_x_path = _upload_training_input('train_x.csv')
train_y_path = _upload_training_input('train_y.csv')
test_x_path = _upload_training_input('test_x.csv')
test_y_path = _upload_training_input('test_y.csv')

# Generamos script

In [15]:
%%writefile script.py

import argparse
import joblib
import json
import os
import pprint

import boto3
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import subprocess
import sys



# inference functions ---------------
def model_fn(model_dir):
    """Load the persisted model for inference.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        The object deserialised by ``joblib.load``.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)



if __name__ == '__main__':
    # Training entry point: parse arguments, load the train/test CSVs and the
    # column -> name mapping, fit a RandomForestRegressor, print one R^2 score
    # per target column and persist the fitted model as model.joblib.

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument('--n-estimators', type=int, default=10)

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--x_train_file', type=str, default='train_x.csv')
    parser.add_argument('--y_train_file', type=str, default='train_y.csv')
    parser.add_argument('--x_test_file', type=str, default='test_x.csv')
    parser.add_argument('--y_test_file', type=str, default='test_y.csv')
    parser.add_argument('--names_file', type=str, default='fund_names.json')

    args, _ = parser.parse_known_args()

    if args.train == 'unused':
        # Sentinel value: the *_file arguments are complete locations (S3 URIs)
        # read directly, not file names inside a channel directory.
        # pandas needs s3fs/fsspec to open s3:// paths, so install them only on
        # this code path instead of on every run.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "s3fs", "fsspec"])

        print('reading data')
        x_train = pd.read_csv(args.x_train_file).values
        y_train = pd.read_csv(args.y_train_file).values
        x_test = pd.read_csv(args.x_test_file).values
        y_test_df = pd.read_csv(args.y_test_file)
        y_test = y_test_df.values
        columns = y_test_df.columns.tolist()

        print('reading fund names')
        # Split "<scheme>//<bucket>/<key>" positionally. Splitting on the
        # bucket name would return the wrong key whenever that name also
        # appears inside the key.
        _scheme, _sep, location = args.names_file.partition('//')
        bucket, _sep, key = location.partition('/')
        if not bucket or not key:
            raise ValueError(
                'names_file must look like s3://bucket/key, got: ' + args.names_file)
        s3 = boto3.resource('s3')
        content_object = s3.Object(bucket, key)
        file_content = content_object.get()['Body'].read().decode('utf-8')
        names_dict = json.loads(file_content)

    else:
        print('reading data')
        x_train = pd.read_csv(os.path.join(args.train, args.x_train_file)).values
        y_train = pd.read_csv(os.path.join(args.train, args.y_train_file)).values
        x_test = pd.read_csv(os.path.join(args.test, args.x_test_file)).values
        y_test_df = pd.read_csv(os.path.join(args.test, args.y_test_file))
        y_test = y_test_df.values
        columns = y_test_df.columns.tolist()

        print('reading fund names')
        # Context manager so the file handle is closed once the JSON is parsed.
        with open(os.path.join(args.train, args.names_file)) as json_file:
            names_dict = json.load(json_file)

    print('training model')
    model = RandomForestRegressor(
        n_estimators=args.n_estimators)

    model.fit(x_train, y_train)

    # One R^2 score per target column, keyed by the name mapped to that column.
    print('validating model')
    pred = model.predict(x_test)
    results_dict = {}
    for index, elem in enumerate(columns):
        score = r2_score(y_test[:, index], pred[:, index])
        results_dict[names_dict[elem]] = score

    pprint.pprint(results_dict)

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)

Writing script.py


# Test local

In [16]:
! python script.py --n-estimators 100 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
extracting arguments
reading data
reading fund names
training model
validating model
{'"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""': -0.04987917634918415,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) ACC"': 0.04963080896920735,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) INC"': -0.07302500118418909,
 '"FRANKLIN EUROPEAN GROWTH ""I"""': 0.01302759380208618,
 '"FRANKLIN GLOBAL REAL ESTATE ""I"" ACC"': -0.04477176260211735,
 '"FRANKLIN INCOME ""I"""': -0.004758254547868601,
 '"FRANKLIN INDIA ""I"" (USD)"': -0.04610736340032506,
 '"JAN HEN GLOBAL EQUITY ""R"" (GBP) ACC"': -0.038341412148969356,
 '"JAN HEN GLOBAL EQUITY ""R"" (USD) ACC"': -0.019338519139112886,
 '"JAN HEN UK ABSOLUTE RETURN ""R"" (GBP) ACC"': -0.00040235172699620847}
model persi

# Entrenamos en SageMaker

In [17]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'

# These entries reach script.py as command-line arguments. 'train'/'test' are
# set to the sentinel 'unused' so the script reads the uploaded files directly
# from the locations given in the *_file entries.
script_arguments = {
    'n-estimators': 100,
    'train': 'unused',
    'test': 'unused',
    'x_train_file': train_x_path,
    'y_train_file': train_y_path,
    'x_test_file': test_x_path,
    'y_test_file': test_y_path,
    'names_file': names,
}

sklearn_estimator = SKLearn(
    entry_point='script.py',
    role=get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='rf-scikit',
    hyperparameters=script_arguments)

In [18]:
# Start the training job without blocking the notebook (wait=False); a later cell waits for it.
sklearn_estimator.fit( wait=False)

# Desplegar un endpoint

In [19]:
# Block until the training job finishes, then look up where its model artifact was stored.
training_job = sklearn_estimator.latest_training_job
training_job.wait()

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)

2020-10-31 12:31:39 Starting - Starting the training job...
2020-10-31 12:31:41 Starting - Launching requested ML instances......
2020-10-31 12:32:44 Starting - Preparing the instances for training...
2020-10-31 12:33:25 Downloading - Downloading input data...
2020-10-31 12:33:41 Training - Downloading the training image...
2020-10-31 12:34:32 Training - Training image download completed. Training in progress..[34m2020-10-31 12:34:32,090 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-10-31 12:34:32,092 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-31 12:34:32,101 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-10-31 12:34:32,375 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-31 12:34:33,831 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[3

In [20]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the trained artifact for hosting; script.py provides the model_fn that loads it.
model = SKLearnModel(
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION,
    model_data=artifact,
    role=get_execution_role())

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [21]:
# Deploy the model to a single-instance endpoint.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium')

---------------------!

# Predicciones

In [22]:
# boto3 client for the 'sagemaker-runtime' service, used in the next cell to invoke the endpoint.
runtime = boto3.client('sagemaker-runtime')

In [23]:
# Send the validation features to the endpoint as header-less, index-less CSV.
csv_payload = val_x.to_csv(header=False, index=False).encode('utf-8')
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint,
    ContentType='text/csv',
    Body=csv_payload)

In [24]:
# Parse the JSON response body into a NumPy array of predictions.
response_text = response['Body'].read().decode()
pred = np.asarray(json.loads(response_text))

In [26]:
# For every predicted column, store the predictions and their R^2 against the
# validation target, keyed by the name mapped to that column.
columns = returns.columns.tolist()
dict_results = {}
dict_metrics = {}
for col_idx in range(pred.shape[1]):
    fund_name = filt_names[int(columns[col_idx])]
    dict_results[fund_name] = pred[:, col_idx]
    dict_metrics[fund_name] = r2_score(val_y.values[:, col_idx], pred[:, col_idx])

In [27]:
# Pretty-print the per-column R^2 scores computed on the validation split.
pprint.pprint(dict_metrics)

{'"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""': -0.01280736176890862,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) ACC"': 0.23036996498375562,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) INC"': 0.07674783599098578,
 '"FRANKLIN EUROPEAN GROWTH ""I"""': 0.054970468441639886,
 '"FRANKLIN GLOBAL REAL ESTATE ""I"" ACC"': -0.033814530076192506,
 '"FRANKLIN INCOME ""I"""': 0.06471463291704915,
 '"FRANKLIN INDIA ""I"" (USD)"': 0.01670012172389801,
 '"JAN HEN GLOBAL EQUITY ""R"" (GBP) ACC"': 0.000636133245301651,
 '"JAN HEN GLOBAL EQUITY ""R"" (USD) ACC"': -0.001286615885380149,
 '"JAN HEN UK ABSOLUTE RETURN ""R"" (GBP) ACC"': -0.1590348695713526}


In [28]:
# Display the array of predictions stored under one key of dict_results.
dict_results['"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""']

array([ 5.98545981e-03, -2.06560987e-03, -1.75772289e-03,  4.66344078e-04,
        2.01076206e-03, -1.50827680e-04,  5.11689174e-04,  2.26000492e-03,
       -1.98446696e-04,  2.83056315e-03,  8.08962142e-03,  1.44598122e-03,
        1.01876094e-03,  3.96069822e-03,  1.97694840e-03,  4.24837211e-03,
        2.51251127e-03,  1.31869668e-03, -9.13528199e-04, -2.41273560e-03,
        2.87619330e-03,  6.03999760e-04, -1.18147304e-03, -4.48268478e-04,
       -1.67327978e-03,  2.82515139e-03,  1.36658792e-03,  2.37096975e-03,
        8.63296570e-05, -4.01784641e-03, -3.00708625e-03, -3.94984065e-04,
        1.47440221e-03,  3.61872443e-04, -1.30471497e-03,  9.69871303e-04,
       -1.16060215e-03,  2.06084712e-03, -1.47244691e-03,  2.38671534e-03,
       -7.62066956e-04,  7.17562497e-04,  1.73357711e-03, -7.12408134e-04,
       -3.02554975e-04,  2.23746959e-03, -3.69154979e-03, -1.39809204e-04,
       -2.81844488e-04,  7.34745883e-04, -1.16792085e-03,  1.11070109e-03,
        9.64005637e-04,  

# Borrar el endpoint

In [29]:
# Delete the endpoint created by model.deploy() now that the predictions have been collected.
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint)

{'ResponseMetadata': {'RequestId': '63d837cb-cf97-419a-9fd9-bde3b4894b50',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '63d837cb-cf97-419a-9fd9-bde3b4894b50',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 31 Oct 2020 12:47:47 GMT'},
  'RetryAttempts': 0}}