# Create a S3 Bucket

In [1]:
import boto3

bucket_name = 'yahoofinancestockprice-gpk-1995'
region = 'eu-central-1'  

In [None]:
try:
    s3 = boto3.resource('s3', region_name=region)
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': region}
    )
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error:', e)

# Create train and validation csv

In [3]:
# !pip install yfinance
import pandas as pd
from datetime import datetime
import yfinance as yf

# initialize parameters
start_date = datetime(2019, 1, 1)
end_date = datetime(2024, 1, 1)

# get the data
df_data = yf.download('AAPL', start = start_date, end = end_date)

df_data.reset_index(inplace=True)

df_data

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
0,2019-01-02 00:00:00+00:00,37.708607,39.480000,39.712502,38.557499,38.722500,148158800
1,2019-01-03 00:00:00+00:00,33.952549,35.547501,36.430000,35.500000,35.994999,365248800
2,2019-01-04 00:00:00+00:00,35.401951,37.064999,37.137501,35.950001,36.132500,234428400
3,2019-01-07 00:00:00+00:00,35.323151,36.982498,37.207500,36.474998,37.174999,219111200
4,2019-01-08 00:00:00+00:00,35.996529,37.687500,37.955002,37.130001,37.389999,164101200
...,...,...,...,...,...,...,...
1253,2023-12-22 00:00:00+00:00,192.656174,193.600006,195.410004,192.970001,195.179993,37122800
1254,2023-12-26 00:00:00+00:00,192.108856,193.050003,193.889999,192.830002,193.610001,28919300
1255,2023-12-27 00:00:00+00:00,192.208359,193.149994,193.500000,191.089996,192.490005,48087700
1256,2023-12-28 00:00:00+00:00,192.636276,193.580002,194.660004,193.169998,194.139999,34049900


## Extract, Load & Transform

In [4]:
# df_data.columns
df_data.drop(axis=1, columns=[('Adj Close', 'AAPL')], inplace=True)
df_data.drop(axis=1, columns=[(     'Date',     '')], inplace=True)

In [5]:
# re-arrange some columns
# df_data.columns
col_order =[(  'Open', 'AAPL'), (  'High', 'AAPL'), (   'Low', 'AAPL'), ( 'Close', 'AAPL'),
            ('Volume', 'AAPL')]
df_data = df_data[col_order]
df_data

Price,Open,High,Low,Close,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
0,38.722500,39.712502,38.557499,39.480000,148158800
1,35.994999,36.430000,35.500000,35.547501,365248800
2,36.132500,37.137501,35.950001,37.064999,234428400
3,37.174999,37.207500,36.474998,36.982498,219111200
4,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...
1253,195.179993,195.410004,192.970001,193.600006,37122800
1254,193.610001,193.889999,192.830002,193.050003,28919300
1255,192.490005,193.500000,191.089996,193.149994,48087700
1256,194.139999,194.660004,193.169998,193.580002,34049900


In [6]:
# take just the last row out
df_data_features = df_data.iloc[:-1, :]
df_data_features

Price,Open,High,Low,Close,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
0,38.722500,39.712502,38.557499,39.480000,148158800
1,35.994999,36.430000,35.500000,35.547501,365248800
2,36.132500,37.137501,35.950001,37.064999,234428400
3,37.174999,37.207500,36.474998,36.982498,219111200
4,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...
1252,196.100006,197.080002,193.500000,194.679993,46482500
1253,195.179993,195.410004,192.970001,193.600006,37122800
1254,193.610001,193.889999,192.830002,193.050003,28919300
1255,192.490005,193.500000,191.089996,193.149994,48087700


In [7]:
df_data_targets = df_data.iloc[1:, 0].rename("Targets")
df_data_targets

1        35.994999
2        36.132500
3        37.174999
4        37.389999
5        37.822498
           ...    
1253    195.179993
1254    193.610001
1255    192.490005
1256    194.139999
1257    193.899994
Name: Targets, Length: 1257, dtype: float64

In [8]:
# combine together to get final data
df_data_features['Target'] = list(df_data_targets)

first_column = df_data_features.pop('Target')
df_data_features.insert(0, 'Target', first_column)

df_data_final = df_data_features
df_data_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_features['Target'] = list(df_data_targets)


Price,Target,Open,High,Low,Close,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL
0,35.994999,38.722500,39.712502,38.557499,39.480000,148158800
1,36.132500,35.994999,36.430000,35.500000,35.547501,365248800
2,37.174999,36.132500,37.137501,35.950001,37.064999,234428400
3,37.389999,37.174999,37.207500,36.474998,36.982498,219111200
4,37.822498,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...,...
1252,195.179993,196.100006,197.080002,193.500000,194.679993,46482500
1253,193.610001,195.179993,195.410004,192.970001,193.600006,37122800
1254,192.490005,193.610001,193.889999,192.830002,193.050003,28919300
1255,194.139999,192.490005,193.500000,191.089996,193.149994,48087700


## Train Test Split

In [9]:
import numpy as np

df_randomized = df_data_final.sample(frac = 1, random_state = 123)
df_randomized

Price,Target,Open,High,Low,Close,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL
870,132.080002,134.289993,137.339996,132.160004,135.429993,91533000
367,87.852501,88.787498,88.849998,87.772499,87.897499,114406400
875,139.899994,136.820007,138.589996,135.630005,138.270004,72433800
525,135.759995,135.729996,136.309998,134.610001,134.990005,83305400
1256,193.899994,194.139999,194.660004,193.169998,193.580002,34049900
...,...,...,...,...,...,...
1238,189.979996,190.330002,191.559998,189.229996,191.240005,45679300
1147,193.669998,193.330002,194.440002,192.919998,193.619995,37283200
106,45.770000,46.070000,46.247501,45.285000,45.634998,119093600
1041,148.869995,150.199997,151.300003,148.410004,148.479996,58867200


In [10]:
train_data, test_data = np.split(df_randomized, [int(0.8*len(df_randomized))])

print(train_data.shape, test_data.shape)

(1005, 6) (252, 6)


## Set a path and upload dataset to S3 bucket

In [12]:
import os

prefix = 'xgboost-as-a-built-in-algo'

train_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, prefix, 'train', 'train.csv')
test_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, prefix, 'test', 'test.csv')

print(train_csv_path)
print(test_csv_path)

s3://yahoofinancestockprice-gpk-1995/xgboost-as-a-built-in-algo/train/train.csv
s3://yahoofinancestockprice-gpk-1995/xgboost-as-a-built-in-algo/test/test.csv


In [44]:
train_data.to_csv(train_csv_path, index = False, header=False)
test_data.to_csv(test_csv_path, index = False, header=False)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



# Build XGBoost Model

Use either XGBoost as a framework or a built-in algorithm
I will use built-in algorithm.

In [13]:
import sagemaker

# import container image through image_uris
from sagemaker import image_uris

# kick-start a session for a training
from sagemaker.session import Session

# prepare training inputs for the XGBoost algorithm
from sagemaker.inputs import TrainingInput

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Find an XGBoost image URI and build an XGBoost container

In [14]:
# version 1.2-2 (the last argument)
xgboost_container = image_uris.retrieve("xgboost", boto3.Session().region_name, "1.2-2")

display(xgboost_container)

'492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:1.2-2'

## Initialize hyperparameters

In [15]:
hyperparameters = {"max_depth":"5",
                  "eta":"0.2",
                  "gamma":"4",
                  "min_child_weight":"6",
                  "subsample":"0.7",
                  "objective":"reg:squarederror",
                  "early_stopping_rounds":10,
                  "num_round":1000}

## Set an output path where the trained model will be saved

In [16]:
# first {} is bucket name
# second {} is prefix
# output folder
output_path = 's3://{}/{}/{}/'.format(bucket_name, prefix, 'output')

print(output_path)

s3://yahoofinancestockprice-gpk-1995/xgboost-as-a-built-in-algo/output/


## Construct a SageMaker estimator that calls the xgboost-container

In [17]:
estimator = sagemaker.estimator.Estimator(image_uri = xgboost_container,
                                         hyperparameters = hyperparameters,
                                         role = sagemaker.get_execution_role(),
                                         instance_count = 1,
                                         instance_type = 'ml.m4.xlarge',
                                         volume_size = 5, # 5 GB
                                         output_path = output_path,
                                         use_spot_instances = True,
                                         max_run = 300,
                                         max_wait = 600)

## Define the data types and paths to the training and validation datasets

In [18]:
content_type = "csv"
train_input = TrainingInput("s3://{}/{}/{}/".format(bucket_name, prefix, 'train'), content_type = content_type)
test_input = TrainingInput("s3://{}/{}/{}".format(bucket_name, prefix, 'test'), content_type = content_type)


## Execute the XGBoost training job

In [19]:
estimator.fit({'train': train_input, 'validation': test_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-11-12-11-42-07-500


2024-11-12 11:42:08 Starting - Starting the training job...
2024-11-12 11:42:33 Starting - Preparing the instances for training...
2024-11-12 11:43:07 Downloading - Downloading input data...
2024-11-12 11:43:32 Downloading - Downloading the training image......
2024-11-12 11:44:43 Training - Training image download completed. Training in progress...[34m[2024-11-12 11:44:54.765 ip-10-0-194-76.eu-central-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-11-12:11:44:54:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-11-12:11:44:54:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-11-12:11:44:54:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-11-12:11:44:54:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-11-12:11:44:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[2024-11-12:11:44:54:INFO] Deter

## Deploy trained xgb model as Endpoints

1. Environment:
> Within SageMaker - Serialization by User<br>
> **Outside SageMaker - Serialization by Endpoint**
  
2. Method to invoke the endpoint
> **API - Single Prediction**<br>
> s3 Bucket - Batch Prediction

3. Data type based on method
> **API - Json**<br>
>s3 Bucket - CSV

In [20]:
from sagemaker.serializers import CSVSerializer

xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=CSVSerializer())

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-11-12-11-48-01-899
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-11-12-11-48-01-899
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-11-12-11-48-01-899


------!

In [21]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2024-11-12-11-48-01-899'

## Make prediction with the use of Endpoints

In [22]:
# initialize parameters
start_date = datetime(2024, 1, 4)
end_date = datetime(2024, 1, 5)

# get the data
df_data = yf.download('AAPL', start = start_date, end = end_date)
df_data.reset_index(inplace=True)
df_data

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
0,2024-01-04 00:00:00+00:00,181.023178,181.910004,183.089996,180.880005,182.149994,71983600


In [23]:
# df_data.columns
df_data.drop(axis=1, columns=[('Adj Close', 'AAPL')], inplace=True)
df_data.drop(axis=1, columns=[(     'Date',     '')], inplace=True)

# re-arrange some columns
# df_data.columns
col_order =[(  'Open', 'AAPL'), (  'High', 'AAPL'), (   'Low', 'AAPL'), ( 'Close', 'AAPL'),
            ('Volume', 'AAPL')]
df_data = df_data[col_order]
df_data

Price,Open,High,Low,Close,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
0,182.149994,183.089996,180.880005,181.910004,71983600


In [24]:
data_features_array = df_data.values
data_features_array

array([[1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02,
        7.19836000e+07]])

## Serialize data 

## Inference - Serialized Input by SageMaker Function

In [26]:
Y_pred_Fcn = xgb_predictor.predict(data_features_array).decode('utf-8')
print(type(Y_pred_Fcn), Y_pred_Fcn)

<class 'str'> 180.4252471923828



In [28]:
from sagemaker.serializers import CSVSerializer

Serialized_Input_Fcn = CSVSerializer().serialize([[1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 
                                                   7.19836000e+07]])
print(type(Serialized_Input_Fcn), Serialized_Input_Fcn)

Y_pred_Fcn = xgb_predictor.predict(Serialized_Input_Fcn).decode('utf-8')
Y_pred_Fcn

<class 'str'> 182.149994,183.089996,180.880005,181.910004,71983600.0


'180.4252471923828\n'

## Inference - Serialized Input by built-in Function (Lambda function friendly)

In [30]:
# 2nd and 3rd row are just to show the advantage of list of list. Only Input[0] is taken as input
Input = [[1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07]]

Serialized_Input = ','.join(map(str, Input[0]))
print(Serialized_Input, type(Serialized_Input))

Y_pred = xgb_predictor.predict(Serialized_Input).decode('utf-8')
Y_pred

182.149994,183.089996,180.880005,181.910004,71983600.0 <class 'str'>


'180.4252471923828\n'

# Lambda function handler
### Inference - Lambda function (Base) 

In [39]:
import boto3

ENDPOINT_NAME = 'sagemaker-xgboost-2024-11-12-11-48-01-899'
runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    inputs = event['data']
    # for loop for multiple inference
    result = []
    for input in inputs:
        serialized_input = ','.join(map(str, input))
    
        response = runtime.invoke_endpoint(EndpointName = ENDPOINT_NAME, 
                                       ContentType = 'text/csv',
                                      Body = serialized_input)
        response = response['Body'].read().strip()
        result.append(response.decode())
    return result

In [40]:
Input_json = {'data':
        [[1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07]]
             }

result = lambda_handler(Input_json, __)
result

['180.4252471923828', '180.4252471923828', '180.4252471923828']

# Send results via email

## Using API Gateway to make a POST request

In [42]:
# importing the requests library
import requests

# defining the api-endpoint
API_ENDPOINT = "https://g21946ocwg.execute-api.eu-central-1.amazonaws.com/xgbmodel"

# data to be sent to api
json = {"data": [[1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07],
        [1.82149994e+02, 1.83089996e+02, 1.80880005e+02, 1.81910004e+02, 7.19836000e+07]]
       }

# sending post request and saving response as response object
r = requests.post(url = API_ENDPOINT, json = json)

In [44]:
# Status code 200 means everything is OK
print(f"Status Code: {r.status_code}, Response: {r.json()}")

Status Code: 200, Response: ['180.4252471923828', '180.4252471923828', '180.4252471923828']


# Close and Terminate

In [45]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-11-12-11-48-01-899


In [46]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'QPW0NKW4QTBTJ98T',
   'HostId': '0fDezRdJe1yzsZiSuBGIrKglv/SbmFNo1Sw91ewrdjT2PcbvDta3QqCQh6n8LiifLesDMI1aDZSYVvA7c8wmjGFagmOBiAIcDh/2rf0WD3c=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '0fDezRdJe1yzsZiSuBGIrKglv/SbmFNo1Sw91ewrdjT2PcbvDta3QqCQh6n8LiifLesDMI1aDZSYVvA7c8wmjGFagmOBiAIcDh/2rf0WD3c=',
    'x-amz-request-id': 'QPW0NKW4QTBTJ98T',
    'date': 'Tue, 12 Nov 2024 14:36:33 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-11-12-11-42-07-500/debug-output/index/000000000/000000000000_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-11-12-11-42-07-500/debug-output/events/000000000010/000000000010_worker_0.tfevents'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-11-12-11-42-07-500/profi

# Extra code: Function wrote in Amazon Lambda

In [None]:
import boto3

ENDPOINT_NAME = 'sagemaker-xgboost-2024-11-12-11-48-01-899'
runtime = boto3.client('runtime.sagemaker')
email_client = boto3.client('sns')

def lambda_handler(event, context):
    inputs = event['data']
    # for loop for multiple inference
    result = []
    for input in inputs:
        serialized_input = ','.join(map(str, input))
    
        response = runtime.invoke_endpoint(EndpointName = ENDPOINT_NAME, 
                                       ContentType = 'text/csv',
                                      Body = serialized_input)
        response = response['Body'].read().strip()
        result.append(response.decode())

    response_sns = email_client.publish(
        TopicArn = 'arn:aws:sns:eu-central-1:010438479781:TopicStockPricePredictionGPK',
        Message = 'Good day! The prediction/s is/are ' + str(result),
        Subject = 'Finance - Daily Prediction for Apple Inc.')
    
    return result