https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

In [2]:
import os
import boto3
import re
import sagemaker
import pandas as pd
import json
import numpy as np
from sagemaker import get_execution_role
from sklearn.metrics import r2_score
import pprint

role = sagemaker.get_execution_role()
region = boto3.Session().region_name
sm_boto3 = boto3.client('sagemaker')
sess = sagemaker.Session()

bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/bmeajg31'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [3]:
data = pd.read_csv('s3://sagemakerbmeironia/sagemaker_input_data/data.csv', sep=';')
data.set_index('date_ws', inplace=True)

In [4]:
s3 = boto3.resource('s3')
content_object = s3.Object('sagemakerbmeironia', 'sagemaker_input_data/names.json')
file_content = content_object.get()['Body'].read().decode('utf-8')
names_dict = json.loads(file_content)

# Filtrado de fondos

In [5]:
n = 10
filt_columns = data.columns.tolist()[:10]
filt_names = {v:k for k,v in names_dict.items() if str(v) in filt_columns}
data_filt = data[filt_columns]

In [6]:
check_equal_df = False
while not check_equal_df:
    data_pre = data_filt.copy()
    data_filt.fillna(method='ffill', inplace=True)
    check_equal_df = data_filt.equals(data_pre)

check_equal_df = False
while not check_equal_df:
    data_pre = data_filt.copy()
    data_filt.fillna(method='bfill', inplace=True)
    check_equal_df = data_filt.equals(data_pre)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [7]:
returns = np.log(data_filt).diff()

In [8]:
ret_f1 = returns.shift(-1)

In [9]:
ret_p1 = returns.shift(1)
ret_p1.columns = [elem+'_1' for elem in returns.columns.tolist()]
ret_p2 = returns.shift(2)
ret_p2.columns = [elem+'_2' for elem in returns.columns.tolist()]
ret_p3 = returns.shift(3)
ret_p3.columns = [elem+'_3' for elem in returns.columns.tolist()]
ret_p4 = returns.shift(4)
ret_p4.columns = [elem+'_4' for elem in returns.columns.tolist()]
ret_p5 = returns.shift(5)
ret_p5.columns = [elem+'_5' for elem in returns.columns.tolist()]

In [10]:
inputs = pd.concat([returns, ret_p1, ret_p2, ret_p3, ret_p4, ret_p5], axis=1).iloc[5+1:-1]
target = ret_f1.iloc[5+1:-1]

# División en train, test y split

In [11]:
perc_train = 0.7
perc_test = 0.2

In [12]:
train_x, train_y = inputs.iloc[:int(len(inputs)*perc_train)] , target.iloc[:int(len(inputs)*perc_train)]
test_x, test_y = inputs.iloc[int(len(inputs)*perc_train):int(len(inputs)*(perc_train+perc_test))] , target.iloc[int(len(inputs)*perc_train):int(len(inputs)*(perc_train+perc_test))]
val_x, val_y = inputs.iloc[int(len(inputs)*(perc_train+perc_test)):] , target.iloc[int(len(inputs)*(perc_train+perc_test)):]

# Persisitmos datos

In [13]:
with open('fund_names.json', 'w') as fp:
    json.dump(filt_names, fp)

In [14]:
train_x.to_csv('train_x.csv', index=False)
train_y.to_csv('train_y.csv', index=False)
test_x.to_csv('test_x.csv', index=False)
test_y.to_csv('test_y.csv', index=False)

In [15]:
names = sess.upload_data(
    path='fund_names.json', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

train_x_path = sess.upload_data(
    path='train_x.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

train_y_path = sess.upload_data(
    path='train_y.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

test_x_path = sess.upload_data(
    path='test_x.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

test_y_path = sess.upload_data(
    path='test_y.csv', bucket=bucket,
    key_prefix='sagemaker/sklearncontainer')

# Generamos script

In [16]:
%%writefile script.py

import argparse
import joblib
import json
import os
import pprint

import boto3
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import subprocess
import sys



# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf



if __name__ =='__main__':
    
    subprocess.check_call([sys.executable, "-m", "pip", "install", "s3fs"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fsspec"])
    

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument('--n-estimators', type=int, default=10)

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--x_train_file', type=str, default='train_x.csv')
    parser.add_argument('--y_train_file', type=str, default='train_y.csv')
    parser.add_argument('--x_test_file', type=str, default='test_x.csv')
    parser.add_argument('--y_test_file', type=str, default='test_y.csv')
    parser.add_argument('--names_file', type=str, default='fund_names.json')

    args, _ = parser.parse_known_args()


    if args.train == 'unused':
        print('reading data')
        x_train = pd.read_csv(args.x_train_file).values
        y_train = pd.read_csv(args.y_train_file).values
        x_test = pd.read_csv(args.x_test_file).values
        y_test_df = pd.read_csv(args.y_test_file)
        y_test = y_test_df.values
        columns = y_test_df.columns.tolist()
        
        print('reading fund names')
        s3 = boto3.resource('s3')
        a = args.names_file
        bucket = a.split('//')[1].split('/')[0]
        file = a.split(bucket)[1][1:]
        content_object = s3.Object(bucket, file)
        file_content = content_object.get()['Body'].read().decode('utf-8')
        names_dict = json.loads(file_content)
        
    else:
        print('reading data')
        x_train = pd.read_csv(os.path.join(args.train, args.x_train_file)).values
        y_train = pd.read_csv(os.path.join(args.train, args.y_train_file)).values
        x_test = pd.read_csv(os.path.join(args.test, args.x_test_file)).values
        y_test_df = pd.read_csv(os.path.join(args.test, args.y_test_file))
        y_test = y_test_df.values
        columns = y_test_df.columns.tolist()

        print('reading fund names')
        json_file = open(os.path.join(args.train, args.names_file))
        json_str = json_file.read()
        names_dict = json.loads(json_str)


    print('training model')
    model = RandomForestRegressor(
        n_estimators=args.n_estimators)
    
    model.fit(x_train, y_train)

    # print abs error
    print('validating model')
    pred = model.predict(x_test)
    results_dict = {}
    for index, elem in enumerate(columns):
        score = r2_score(y_test[:,index], pred[:,index])
        results_dict[names_dict[elem]] = score

    pprint.pprint(results_dict)

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)

Writing script.py


# Test local

In [17]:
! python script.py --n-estimators 100 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
extracting arguments
reading data
reading fund names
training model
validating model
{'"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""': -0.027675129507533702,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) ACC"': 0.086387143101197,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) INC"': -0.058639555228167106,
 '"FRANKLIN EUROPEAN GROWTH ""I"""': 0.009186029419416819,
 '"FRANKLIN GLOBAL REAL ESTATE ""I"" ACC"': -0.016463225406092574,
 '"FRANKLIN INCOME ""I"""': -0.0050965522329047275,
 '"FRANKLIN INDIA ""I"" (USD)"': -0.023214526436507343,
 '"JAN HEN GLOBAL EQUITY ""R"" (GBP) ACC"': -0.02883038222917289,
 '"JAN HEN GLOBAL EQUITY ""R"" (USD) ACC"': -0.0018103041294308397,
 '"JAN HEN UK ABSOLUTE RETURN ""R"" (GBP) ACC"': -0.011029781479804823}
model per

# Entranamos en Sagemaker

In [19]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = '0.23-1'

sklearn_estimator = SKLearn(
    entry_point='script.py',
    role = get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    framework_version=FRAMEWORK_VERSION,
    base_job_name='rf-scikit',
    hyperparameters = {'n-estimators': 100,
                       'train':'unused',
                       'test':'unused',
                       'x_train_file': train_x_path,
                       'y_train_file': train_y_path,
                       'x_test_file': test_x_path,
                       'y_test_file': test_y_path,
                       'names_file': names})

In [20]:
sklearn_estimator.fit( wait=False)

# Despliegue de un endpoint

In [22]:
sklearn_estimator.latest_training_job.wait()
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)

2020-10-30 11:55:43 Starting - Launching requested ML instances......
2020-10-30 11:56:49 Starting - Preparing the instances for training...
2020-10-30 11:57:28 Downloading - Downloading input data...
2020-10-30 11:57:47 Training - Downloading the training image.....[34m2020-10-30 11:58:45,515 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-10-30 11:58:45,517 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-30 11:58:45,526 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-10-30 11:58:45,915 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-30 11:58:48,949 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-30 11:58:48,960 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-30 11:58:48,969 sagem

In [23]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [24]:
predictor = model.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1)

-----------------------------!

# Predicciones

In [30]:
runtime = boto3.client('sagemaker-runtime')

In [31]:
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint,
    Body=val_x.to_csv(header=False, index=False).encode('utf-8'),
    ContentType='text/csv')

In [32]:
pred = np.asarray(json.loads(response['Body'].read().decode()))

In [47]:
columns = returns.columns.tolist()
dict_results = {}
dict_metrics = {}
for elem in range(pred.shape[1]):
    dict_results[filt_names[int(columns[elem])]] = pred[:,elem]
    dict_metrics[filt_names[int(columns[elem])]] = r2_score(val_x.values[:,elem], pred[:,elem])

In [48]:
pprint.pprint(dict_metrics)

{'"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""': -0.0730260807461196,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) ACC"': 0.22670586671819626,
 '"FRANKLIN EURO HIGH YIELD ""I"" (EUR) INC"': 0.09879536720594773,
 '"FRANKLIN EUROPEAN GROWTH ""I"""': 0.04271303305510421,
 '"FRANKLIN GLOBAL REAL ESTATE ""I"" ACC"': 0.043264228954850936,
 '"FRANKLIN INCOME ""I"""': 0.006422644004763445,
 '"FRANKLIN INDIA ""I"" (USD)"': 0.05725437069898487,
 '"JAN HEN GLOBAL EQUITY ""R"" (GBP) ACC"': -0.07907399727912923,
 '"JAN HEN GLOBAL EQUITY ""R"" (USD) ACC"': -0.013093707748482286,
 '"JAN HEN UK ABSOLUTE RETURN ""R"" (GBP) ACC"': 0.07868525294518525}


In [53]:
dict_results['"FRANKLIN BIOTECHNOLOGY DISCOVERY ""I"""']

array([ 2.24523398e-03, -4.42569920e-03, -4.80388497e-03,  2.23791403e-03,
       -3.56618008e-04, -1.78467827e-03, -1.82429894e-03,  1.78019384e-03,
        1.53473108e-03,  3.03456772e-03,  7.52677786e-03,  1.77735641e-03,
        1.47628425e-03, -6.01861402e-04,  5.37259501e-04,  7.15102234e-04,
        3.69284590e-03,  3.51444661e-03, -2.68548443e-03, -3.32714110e-03,
        1.09566216e-03,  1.35656267e-03, -2.00676765e-03, -5.42755057e-04,
        4.47618616e-04,  4.99372212e-03,  1.16215859e-03,  3.56782600e-03,
       -1.53350052e-03,  5.59619252e-04, -1.07876396e-03,  4.78885448e-04,
       -3.33849075e-03,  1.48610591e-03,  7.23051392e-04,  2.44878498e-03,
        6.67236755e-04,  2.61044368e-03, -5.06042999e-05, -1.19572093e-04,
        2.80754396e-03,  1.48982116e-03,  3.90905117e-03, -2.14024204e-03,
        1.00691215e-03,  2.04481894e-03, -1.36277796e-03, -2.59711283e-03,
       -8.94115497e-04, -1.56925152e-03, -7.45117640e-04,  5.88300280e-04,
        3.54257648e-04,  

# Borrar endpoint

In [None]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint)