# Predicting Stock Market Trends with Amazon SageMaker Autopilot

**Paper:** https://doi.org/doi:10.1016/j.eswa.2010.10.027

**Blog post:** https://fg-research.com/blog/general/posts/equity-trend-prediction-automl.html

## Environment Set-Up

In [1]:
!pip install yfinance pyti



In [2]:
import warnings
import io
import json
import sagemaker
import yfinance as yf
import pandas as pd
import numpy as np
from pyti.simple_moving_average import simple_moving_average
from pyti.weighted_moving_average import weighted_moving_average
from pyti.momentum import momentum
from pyti.stochastic import percent_k, percent_d
from pyti.williams_percent_r import williams_percent_r
from pyti.accumulation_distribution import accumulation_distribution
from pyti.moving_average_convergence_divergence import moving_average_convergence_divergence
from pyti.relative_strength_index import relative_strength_index
from pyti.commodity_channel_index import commodity_channel_index
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
warnings.filterwarnings(action="ignore")

# SageMaker session
session = sagemaker.Session()

# SageMaker role
role = sagemaker.get_execution_role()

# S3 bucket
bucket = session.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Data Preparation

Download the data.

In [3]:
dataset = yf.download(tickers="^SPX", start="2021-08-01", end="2024-08-01")

[*********************100%%**********************]  1 of 1 completed


In [4]:
dataset.shape

(754, 6)

In [5]:
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-02,4406.859863,4422.180176,4384.810059,4387.160156,4387.160156,3724090000
2021-08-03,4392.740234,4423.790039,4373.0,4423.149902,4423.149902,3965190000
2021-08-04,4415.950195,4416.169922,4400.22998,4402.660156,4402.660156,4260760000
2021-08-05,4408.859863,4429.759766,4408.859863,4429.100098,4429.100098,3769410000
2021-08-06,4429.069824,4440.819824,4429.069824,4436.52002,4436.52002,3451870000


In [6]:
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-25,5428.700195,5491.589844,5390.950195,5399.220215,5399.220215,4592120000
2024-07-26,5433.669922,5488.319824,5430.700195,5459.100098,5459.100098,3638770000
2024-07-29,5476.549805,5487.740234,5444.439941,5463.540039,5463.540039,3379970000
2024-07-30,5478.72998,5489.459961,5401.700195,5436.439941,5436.439941,3777740000
2024-07-31,5505.589844,5551.509766,5493.75,5522.299805,5522.299805,4546910000


Calculate the technical indicators.

In [7]:
# simple moving average
dataset["Simple MA"] = simple_moving_average(
    data=dataset["Close"],
    period=10
)

# weighted moving average
dataset["Weighted MA"] = weighted_moving_average(
    data=dataset["Close"],
    period=10
)

# momentum
dataset["Momentum"] = momentum(
    data=dataset["Close"],
    period=10
)

# stochastic K%
dataset["Stochastic K%"] = percent_k(
    data=dataset["Close"],
    period=10
)

# stochastic D%
dataset["Stochastic D%"] = percent_d(
    data=dataset["Close"],
    period=10
)

# relative strength index
dataset["RSI"] = relative_strength_index(
    data=dataset["Close"],
    period=10
)

# moving average convergence divergence
dataset["MACD"] = moving_average_convergence_divergence(
    data=dataset["Close"],
    short_period=12,
    long_period=26
)

# Larry William’s R%
dataset["LW R%"] = williams_percent_r(
    close_data=dataset["Close"],
)

# accumulation / distribution oscillator
dataset["A/D Oscillator"] = accumulation_distribution(
    close_data=dataset["Close"],
    low_data=dataset["Low"],
    high_data=dataset["High"],
    volume=dataset["Volume"]
)

# commodity channel index
dataset["CCI"] = commodity_channel_index(
    close_data=dataset["Close"],
    low_data=dataset["Low"],
    high_data=dataset["High"],
    period=10
)

Derive the class labels (up = `1`, down = `0`).

In [8]:
dataset.insert(0, "Trend", (dataset["Close"].shift(periods=-1) > dataset["Close"]).mask(dataset["Close"].shift(periods=-1).isna()).astype(float))

Drop the unnecessary columns.

In [9]:
dataset.drop(labels=["Close", "Open", "High", "Low", "Volume", "Adj Close"], axis=1, inplace=True)

Drop the missing values.

In [10]:
dataset.dropna(inplace=True)

In [11]:
dataset.shape

(728, 11)

In [12]:
dataset.head()

Unnamed: 0_level_0,Trend,Simple MA,Weighted MA,Momentum,Stochastic K%,Stochastic D%,RSI,MACD,LW R%,A/D Oscillator,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-09-07,0.0,4512.976025,4521.627663,33.799805,0.747269,0.908188,63.584309,29.01935,-54.884068,16538190000.0,1.835139
2021-09-08,0.0,4515.76001,4521.826536,17.879883,0.658248,0.794271,61.036421,26.950266,-55.16921,18208770000.0,-0.940604
2021-09-09,0.0,4515.468994,4517.739222,23.279785,0.347718,0.584412,52.831201,22.518786,-56.163868,14712320000.0,-2.02342
2021-09-10,1.0,4514.327002,4507.395783,-50.790039,0.0,0.335322,42.288508,15.094595,-57.824006,11195270000.0,-6.831255
2021-09-13,0.0,4510.262988,4499.105415,-60.060059,0.129512,0.159077,45.803512,9.960096,-57.338404,11093430000.0,-8.074242


In [13]:
dataset.tail()

Unnamed: 0_level_0,Trend,Simple MA,Weighted MA,Momentum,Stochastic K%,Stochastic D%,RSI,MACD,LW R%,A/D Oscillator,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-07-24,0.0,5568.345068,5543.815146,-157.410156,0.0,0.226367,36.184891,4.0354,-11.485683,264865100000.0,-22.900853
2024-07-25,1.0,5549.813086,5513.065172,-216.129883,0.0,0.104275,33.716938,-11.579047,-12.820965,261027700000.0,-24.196463
2024-07-26,1.0,5534.188086,5496.571902,-172.120117,0.223449,0.074483,42.986717,-19.550389,-9.956132,260975900000.0,-15.36006
2024-07-29,0.0,5517.420068,5483.726802,-203.660156,0.240017,0.154489,43.636131,-24.402278,-9.743712,260577800000.0,-11.126445
2024-07-30,1.0,5494.344043,5469.003143,-151.830078,0.196878,0.220115,40.506981,-31.457025,-11.040262,259790900000.0,-11.327447


Split the data.

In [14]:
test_size = 30

In [15]:
training_dataset = dataset.iloc[:- 2 * test_size]

In [16]:
print(f"training samples: {len(training_dataset)}")
print(f"training dates: {training_dataset.index.min().date()} / {training_dataset.index.max().date()}")

training samples: 668
training dates: 2021-09-07 / 2024-05-02


In [17]:
validation_dataset = dataset.iloc[- 2 * test_size: - test_size]

In [18]:
print(f"validation samples: {len(validation_dataset)}")
print(f"validation dates: {validation_dataset.index.min().date()} / {validation_dataset.index.max().date()}")

validation samples: 30
validation dates: 2024-05-03 / 2024-06-14


In [19]:
test_dataset = dataset.iloc[- test_size:]

In [20]:
print(f"test samples: {len(test_dataset)}")
print(f"test dates: {test_dataset.index.min().date()} / {test_dataset.index.max().date()}")

test samples: 30
test dates: 2024-06-17 / 2024-07-30


Save the data to S3.

In [21]:
training_data = session.upload_string_as_file_body(
    body=training_dataset.to_csv(index=False),
    bucket=bucket,
    key="data/train.csv"
)

In [22]:
training_data

's3://sagemaker-eu-west-1-661670223746/data/train.csv'

In [23]:
validation_data = session.upload_string_as_file_body(
    body=validation_dataset.to_csv(index=False),
    bucket=bucket,
    key="data/valid.csv"
)

In [24]:
validation_data

's3://sagemaker-eu-west-1-661670223746/data/valid.csv'

In [25]:
test_data = session.upload_string_as_file_body(
    body=test_dataset.drop(labels=["Trend"], axis=1).to_csv(index=False, header=False),
    bucket=bucket,
    key="data/test.csv"
)

In [26]:
test_data

's3://sagemaker-eu-west-1-661670223746/data/test.csv'

## Model Selection

Find the best model.

In [27]:
# define the AutoML job configuration
automl = sagemaker.automl.automlv2.AutoMLV2(
    problem_config=sagemaker.automl.automlv2.AutoMLTabularConfig(
        target_attribute_name="Trend",
        algorithms_config=["xgboost", "lightgbm", "catboost"],
        mode="ENSEMBLING",
        problem_type="BinaryClassification",
        max_candidates=10,
    ),
    output_path=f"s3://{bucket}/output/",
    job_objective={"MetricName": "Accuracy"},
    base_job_name="equity-trend-automl",
    role=role,
    sagemaker_session=session,
)

# run the AutoML job
automl.fit(
    inputs=[
        sagemaker.automl.automlv2.AutoMLDataChannel(
            s3_data_type="S3Prefix",
            s3_uri=training_data,
            channel_type="training",
            compression_type=None,
            content_type="text/csv;header=present"
        ),
        sagemaker.automl.automlv2.AutoMLDataChannel(
            s3_data_type="S3Prefix",
            s3_uri=validation_data,
            channel_type="validation",
            compression_type=None,
            content_type="text/csv;header=present"
        ),
    ]
)

{"AutoMLJobName": "equity-t-2024-08-19-11-30-33-057", "AutoMLJobInputDataConfig": [{"DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": "s3://sagemaker-eu-west-1-661670223746/data/train.csv"}}, "ChannelType": "training", "ContentType": "text/csv;header=present"}, {"DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": "s3://sagemaker-eu-west-1-661670223746/data/valid.csv"}}, "ChannelType": "validation", "ContentType": "text/csv;header=present"}], "OutputDataConfig": {"S3OutputPath": "s3://sagemaker-eu-west-1-661670223746/output/"}, "AutoMLProblemTypeConfig": {"TabularJobConfig": {"CompletionCriteria": {"MaxCandidates": 10}, "TargetAttributeName": "Trend", "ProblemType": "BinaryClassification", "Mode": "ENSEMBLING", "CandidateGenerationConfig": {"AlgorithmsConfig": [{"AutoMLAlgorithms": ["xgboost", "lightgbm", "catboost"]}]}}}, "RoleArn": "arn:aws:iam::661670223746:role/service-role/AmazonSageMaker-ExecutionRole-20191030T162743", "AutoMLJobObjective": {"Metric

## Model Evaluation

Generate the test set predictions using the best model.

In [28]:
# create the model
model = automl.create_model(
    name="equity-trend-model",
    sagemaker_session=session,
    inference_response_keys=["probabilities", "labels", "predicted_label", "probability"]
)

# create the transformer
transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
)

# run the transform job
transformer.transform(
    data=test_data,
    content_type="text/csv",
)

Using already existing model: equity-trend-model
INFO:sagemaker:Creating transform job with name: equity-trend-model-2024-08-19-11-38-57-184


[34m2024-08-19T11:44:03,671 [INFO ] main com.amazonaws.ml.mms.ModelServer - [0m
[34mMMS Home: /usr/local/lib/python3.8/dist-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 8[0m
[34mMax heap size: 7044 M[0m
[34mPython executable: /usr/bin/python3[0m
[34mConfig file: /etc/sagemaker-mms.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mModel Store: /.sagemaker/mms/models[0m
[34mInitial Models: ALL[0m
[34mLog dir: null[0m
[34mMetrics dir: null[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 8[0m
[35m2024-08-19T11:44:03,671 [INFO ] main com.amazonaws.ml.mms.ModelServer - [0m
[35mMMS Home: /usr/local/lib/python3.8/dist-packages[0m
[35mCurrent directory: /[0m
[35mTemp directory: /home/model-server/tmp[0m
[35mNumber of GPUs: 0[0m
[35mNumber of CPUs: 8[0m
[35mMax h

Get the test set predictions from S3.

In [29]:
# download the predictions from S3
predictions = session.read_s3_file(
    bucket=bucket,
    key_prefix=f"{transformer.latest_transform_job.name}/test.csv.out"
)

# cast the predictions to data frame
predictions = pd.read_csv(io.StringIO(predictions), header=None)
predictions.columns = ["probabilities", "labels", "predicted_label", "probability"]

In [30]:
predictions.shape

(30, 4)

In [31]:
predictions.head()

Unnamed: 0,probabilities,labels,predicted_label,probability
0,"[0.44339269399642944, 0.5566073060035706]","['1.0', '0.0']",0.0,0.556607
1,"[0.3947669267654419, 0.6052330732345581]","['1.0', '0.0']",0.0,0.605233
2,"[0.35636866092681885, 0.6436313390731812]","['1.0', '0.0']",0.0,0.643631
3,"[0.4925159811973572, 0.5074840188026428]","['1.0', '0.0']",0.0,0.507484
4,"[0.4931630492210388, 0.5068369507789612]","['1.0', '0.0']",0.0,0.506837


In [32]:
predictions.tail()

Unnamed: 0,probabilities,labels,predicted_label,probability
25,"[0.4145863652229309, 0.5854136347770691]","['1.0', '0.0']",0.0,0.585414
26,"[0.46709680557250977, 0.5329031944274902]","['1.0', '0.0']",0.0,0.532903
27,"[0.4988706707954407, 0.5011293292045593]","['1.0', '0.0']",0.0,0.501129
28,"[0.3520074486732483, 0.6479925513267517]","['1.0', '0.0']",0.0,0.647993
29,"[0.5043209195137024, 0.4956790804862976]","['1.0', '0.0']",1.0,0.504321


Process the test set predictions.

In [33]:
# extract the predicted probabilities
predictions["Class 0 Probability"] = predictions["probabilities"].apply(lambda x: json.loads(x)[1])
predictions["Class 1 Probability"] = predictions["probabilities"].apply(lambda x: json.loads(x)[0])
predictions["Predicted Trend"] = predictions[["Class 0 Probability", "Class 1 Probability"]].apply(lambda x: np.argmax(x), axis=1)

# add the dates
predictions.index = test_dataset.index

# add the ground truth labels
predictions["Actual Trend"] = test_dataset["Trend"].astype(int)

# drop the unnecessary columns
predictions = predictions[["Class 0 Probability", "Class 1 Probability", "Predicted Trend", "Actual Trend"]]

In [34]:
predictions.shape

(30, 4)

In [35]:
predictions.head()

Unnamed: 0_level_0,Class 0 Probability,Class 1 Probability,Predicted Trend,Actual Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-06-17,0.556607,0.443393,0,1
2024-06-18,0.605233,0.394767,0,0
2024-06-20,0.643631,0.356369,0,0
2024-06-21,0.507484,0.492516,0,0
2024-06-24,0.506837,0.493163,0,1


In [36]:
predictions.tail()

Unnamed: 0_level_0,Class 0 Probability,Class 1 Probability,Predicted Trend,Actual Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-07-24,0.585414,0.414586,0,0
2024-07-25,0.532903,0.467097,0,1
2024-07-26,0.501129,0.498871,0,1
2024-07-29,0.647993,0.352007,0,0
2024-07-30,0.495679,0.504321,1,1


Calculate the classification metrics.

In [37]:
metrics = pd.DataFrame(
    data={
        "Accuracy": accuracy_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"]),
        "ROC-AUC": roc_auc_score(y_true=predictions["Actual Trend"], y_score=predictions["Class 1 Probability"]),
        "Precision": precision_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"]),
        "Recall": recall_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"]),
        "F1": f1_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"]),
    },
    index=["Value"]
).transpose().reset_index().rename(columns={"index": "Metric"})

In [38]:
metrics

Unnamed: 0,Metric,Value
0,Accuracy,0.633333
1,ROC-AUC,0.800926
2,Precision,0.818182
3,Recall,0.5
4,F1,0.62069


Calculate the confusion matrix.

In [39]:
matrix = pd.crosstab(
    index=predictions["Actual Trend"],
    columns=predictions["Predicted Trend"],
)

In [40]:
matrix

Predicted Trend,0,1
Actual Trend,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,2
1,9,9


Delete the model.

In [41]:
transformer.delete_model()

INFO:sagemaker:Deleting model with name: equity-trend-model
