# Predicting Stock Market Trends with Amazon SageMaker Autopilot

**Blog post:** https://fg-research.com/blog/general/posts/equity-trend-prediction-automl.html

## Environment Set-Up

In [1]:
!pip install yfinance pyti



In [2]:
import warnings
import io
import boto3
import json
import sagemaker
import yfinance as yf
import pandas as pd
import numpy as np
from pyti.simple_moving_average import simple_moving_average
from pyti.weighted_moving_average import weighted_moving_average
from pyti.momentum import momentum
from pyti.stochastic import percent_k, percent_d
from pyti.williams_percent_r import williams_percent_r
from pyti.accumulation_distribution import accumulation_distribution
from pyti.moving_average_convergence_divergence import moving_average_convergence_divergence
from pyti.relative_strength_index import relative_strength_index
from pyti.commodity_channel_index import commodity_channel_index
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings(action="ignore")

# SageMaker session, used below for the S3 uploads, the AutoML job and the model
session = sagemaker.Session()

# SageMaker execution role, passed to the AutoML job
role = sagemaker.get_execution_role()

# Default S3 bucket of the session; holds the datasets and the job outputs
bucket = session.default_bucket()

# Boto3 SageMaker runtime client
# NOTE(review): not referenced by any later cell visible here - confirm it is still needed
client = boto3.client("sagemaker-runtime")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Data Preparation

Download the daily `^SPX` (S&P 500 index) price data from Yahoo! Finance with `yfinance`.

In [3]:
# Download the daily price bars of the S&P 500 index from Yahoo! Finance.
# `auto_adjust=False` is passed explicitly because newer yfinance releases
# default to `auto_adjust=True`, which removes the "Adj Close" column that is
# dropped by name later in the notebook.
dataset = yf.download(
    tickers="^SPX",
    start="2021-08-01",
    end="2024-08-01",
    auto_adjust=False,
)

# Newer yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker; flatten them so that dataset["Close"] is a one-dimensional
# Series, which is what the indicator functions below are given.
if isinstance(dataset.columns, pd.MultiIndex):
    dataset.columns = dataset.columns.get_level_values(0)

[*********************100%%**********************]  1 of 1 completed


Calculate the technical indicators.

In [4]:
# Indicators that share the (data, period) signature, listed in the order in
# which their columns are added. Each is computed on the closing price with a
# period of 10 observations.
close_price_indicators = {
    "Simple MA": simple_moving_average,
    "Weighted MA": weighted_moving_average,
    "Momentum": momentum,
    "Stochastic K%": percent_k,
    "Stochastic D%": percent_d,
    "RSI": relative_strength_index,
}
for column, indicator in close_price_indicators.items():
    dataset[column] = indicator(data=dataset["Close"], period=10)

# moving average convergence divergence (12- and 26-period settings)
dataset["MACD"] = moving_average_convergence_divergence(
    data=dataset["Close"], short_period=12, long_period=26
)

# Larry Williams' %R
# NOTE(review): the call takes no period argument, so it presumably scales each
# close against the high/low of the whole series, which would use future data -
# verify against the pyti implementation.
dataset["LW R%"] = williams_percent_r(close_data=dataset["Close"])

# accumulation / distribution oscillator
dataset["A/D Oscillator"] = accumulation_distribution(
    close_data=dataset["Close"],
    low_data=dataset["Low"],
    high_data=dataset["High"],
    volume=dataset["Volume"],
)

# commodity channel index (period of 10 observations)
dataset["CCI"] = commodity_channel_index(
    close_data=dataset["Close"],
    low_data=dataset["Low"],
    high_data=dataset["High"],
    period=10,
)

Derive the class labels: a day is labelled up (`1`) when its close is higher than the previous day's close, and down (`0`) otherwise.

In [5]:
# Insert the label as the first column: 1 when the close is above the previous
# row's close, 0 otherwise (the first row has no previous close and gets 0).
# NOTE(review): the label refers to the same day as the indicators, which are
# themselves computed from that day's close - confirm that a contemporaneous
# target (rather than the next day's trend) is intended.
dataset.insert(
    0,
    "Trend",
    dataset["Close"].gt(dataset["Close"].shift(periods=1)).astype(int),
)

Drop the unnecessary columns.

In [6]:
# Keep only the label and the indicators: remove the raw price and volume columns.
dataset = dataset.drop(
    columns=["Close", "Open", "High", "Low", "Volume", "Adj Close"]
)

Drop the rows with missing values (the initial rows for which the indicators are not yet available).

In [7]:
# Remove every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how="any")

In [8]:
# (rows, columns) remaining after the missing values were dropped
dataset.shape

(729, 11)

In [9]:
# first rows of the prepared dataset: the label followed by the indicators
dataset.head()

Unnamed: 0_level_0,Trend,Simple MA,Weighted MA,Momentum,Stochastic K%,Stochastic D%,RSI,MACD,LW R%,A/D Oscillator,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-09-07,0,4512.976025,4521.627663,33.799805,0.747269,0.908188,63.584309,29.01935,-54.884068,16538190000.0,1.835139
2021-09-08,0,4515.76001,4521.826536,17.879883,0.658248,0.794271,61.036421,26.950266,-55.16921,18208770000.0,-0.940604
2021-09-09,0,4515.468994,4517.739222,23.279785,0.347718,0.584412,52.831201,22.518786,-56.163868,14712320000.0,-2.02342
2021-09-10,0,4514.327002,4507.395783,-50.790039,0.0,0.335322,42.288508,15.094595,-57.824006,11195270000.0,-6.831255
2021-09-13,1,4510.262988,4499.105415,-60.060059,0.129512,0.159077,45.803512,9.960096,-57.338404,11093430000.0,-8.074242


In [10]:
# last rows of the prepared dataset
dataset.tail()

Unnamed: 0_level_0,Trend,Simple MA,Weighted MA,Momentum,Stochastic K%,Stochastic D%,RSI,MACD,LW R%,A/D Oscillator,CCI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-07-25,0,5549.813086,5513.065172,-216.129883,0.0,0.104275,33.716938,-11.579047,-12.820965,261027700000.0,-24.196463
2024-07-26,1,5534.188086,5496.571902,-172.120117,0.223449,0.074483,42.986717,-19.550389,-9.956132,260975900000.0,-15.36006
2024-07-29,1,5517.420068,5483.726802,-203.660156,0.240017,0.154489,43.636131,-24.402278,-9.743712,260577800000.0,-11.126445
2024-07-30,0,5494.344043,5469.003143,-151.830078,0.196878,0.220115,40.506981,-31.457025,-11.040262,259790900000.0,-11.327447
2024-07-31,1,5487.747021,5474.086009,-22.290039,0.745079,0.393991,52.498281,-29.555742,-6.932469,259739000000.0,4.720467


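Split the data chronologically: the last 30 rows form the test set, the 30 rows before them the validation set, and all earlier rows the training set.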

In [11]:
# number of rows held out for the test set; the validation set uses the same size
test_size = 30

In [12]:
# training set: every row except the last 2 * test_size rows
training_dataset = dataset.iloc[:- 2 * test_size]

In [13]:
# report the size and the date range of the training set
print(
    f"training samples: {len(training_dataset)}",
    f"training dates: {training_dataset.index.min().date()} / {training_dataset.index.max().date()}",
    sep="\n",
)

training samples: 669
training dates: 2021-09-07 / 2024-05-03


In [14]:
# validation set: the test_size rows that immediately precede the test set
validation_dataset = dataset.iloc[- 2 * test_size: - test_size]

In [15]:
# report the size and the date range of the validation set
print(
    f"validation samples: {len(validation_dataset)}",
    f"validation dates: {validation_dataset.index.min().date()} / {validation_dataset.index.max().date()}",
    sep="\n",
)

validation samples: 30
validation dates: 2024-05-06 / 2024-06-17


In [16]:
# test set: the last test_size rows
test_dataset = dataset.iloc[- test_size:]

In [17]:
# report the size and the date range of the test set
print(
    f"test samples: {len(test_dataset)}",
    f"test dates: {test_dataset.index.min().date()} / {test_dataset.index.max().date()}",
    sep="\n",
)

test samples: 30
test dates: 2024-06-18 / 2024-07-31


Save the data to S3.

In [18]:
# Upload the training set as a CSV file with a header row and without the date
# index; the label is the first column. The returned S3 URI is the input of the
# AutoML training channel.
training_data = session.upload_string_as_file_body(
    body=training_dataset.to_csv(index=False),
    bucket=bucket,
    key="data/train.csv"
)

In [19]:
# S3 URI of the uploaded training file
training_data

's3://sagemaker-eu-west-1-661670223746/data/train.csv'

In [20]:
# Upload the validation set in the same format as the training set (CSV with a
# header row, no date index). The returned S3 URI is the input of the AutoML
# validation channel.
validation_data = session.upload_string_as_file_body(
    body=validation_dataset.to_csv(index=False),
    bucket=bucket,
    key="data/valid.csv"
)

In [21]:
# S3 URI of the uploaded validation file
validation_data

's3://sagemaker-eu-west-1-661670223746/data/valid.csv'

In [22]:
# Upload the test set features only: the "Trend" label is removed and the CSV is
# written without header row and without date index. This file is the input of
# the batch transform job.
test_data = session.upload_string_as_file_body(
    body=test_dataset.drop(labels=["Trend"], axis=1).to_csv(index=False, header=False),
    bucket=bucket,
    key="data/test.csv"
)

In [23]:
# S3 URI of the uploaded test file
test_data

's3://sagemaker-eu-west-1-661670223746/data/test.csv'

## Model Selection

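Find the best model by running a SageMaker Autopilot (AutoML V2) job in ensembling mode with accuracy as the objective metric.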

In [24]:
# Configure the AutoML job: binary classification of the "Trend" column, in
# ensembling mode, restricted to three gradient-boosting algorithms, with
# accuracy as the objective and at most 10 candidates.
automl = sagemaker.automl.automlv2.AutoMLV2(
    problem_config=sagemaker.automl.automlv2.AutoMLTabularConfig(
        target_attribute_name="Trend",
        algorithms_config=["xgboost", "lightgbm", "catboost"],
        mode="ENSEMBLING",
        problem_type="BinaryClassification",
        max_candidates=10,
    ),
    output_path=f"s3://{bucket}/output/",
    job_objective={"MetricName": "Accuracy"},
    base_job_name="equity-trend-automl",
    role=role,
    sagemaker_session=session,
)

# Launch the job with one data channel per dataset (training first, then
# validation); both point to uncompressed CSV files that include a header row.
automl.fit(
    inputs=[
        sagemaker.automl.automlv2.AutoMLDataChannel(
            s3_data_type="S3Prefix",
            s3_uri=s3_uri,
            channel_type=channel_type,
            compression_type=None,
            content_type="text/csv;header=present",
        )
        for channel_type, s3_uri in (
            ("training", training_data),
            ("validation", validation_data),
        )
    ]
)

{"AutoMLJobName": "equity-t-2024-08-18-16-51-30-087", "AutoMLJobInputDataConfig": [{"DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": "s3://sagemaker-eu-west-1-661670223746/data/train.csv"}}, "ChannelType": "training", "ContentType": "text/csv;header=present"}, {"DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": "s3://sagemaker-eu-west-1-661670223746/data/valid.csv"}}, "ChannelType": "validation", "ContentType": "text/csv;header=present"}], "OutputDataConfig": {"S3OutputPath": "s3://sagemaker-eu-west-1-661670223746/output/"}, "AutoMLProblemTypeConfig": {"TabularJobConfig": {"CompletionCriteria": {"MaxCandidates": 10}, "TargetAttributeName": "Trend", "ProblemType": "BinaryClassification", "Mode": "ENSEMBLING", "CandidateGenerationConfig": {"AlgorithmsConfig": [{"AutoMLAlgorithms": ["xgboost", "lightgbm", "catboost"]}]}}}, "RoleArn": "arn:aws:iam::661670223746:role/service-role/AmazonSageMaker-ExecutionRole-20191030T162743", "AutoMLJobObjective": {"Metric

## Model Evaluation

Generate the test set predictions using the best model with a batch transform job.

In [25]:
# create the model from the AutoML job; its responses contain the class
# probabilities and the labels those probabilities refer to
# NOTE(review): the model name is fixed, so re-running this cell presumably
# fails while a model with this name still exists (it is deleted in the last
# cell) - confirm
model = automl.create_model(
    name="equity-trend-model",
    sagemaker_session=session,
    inference_response_keys=["probabilities", "labels"]
)

# create the transformer (a single ml.m5.2xlarge instance)
transformer = model.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
)

# run the transform job on the test file uploaded above (CSV without header)
transformer.transform(
    data=test_data,
    content_type="text/csv",
)

INFO:sagemaker:Creating transform job with name: equity-trend-model-2024-08-18-16-57-22-823


[34m2024-08-18T17:02:17,169 [INFO ] main com.amazonaws.ml.mms.ModelServer - [0m
[34mMMS Home: /usr/local/lib/python3.8/dist-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 8[0m
[34mMax heap size: 7044 M[0m
[34mPython executable: /usr/bin/python3[0m
[34mConfig file: /etc/sagemaker-mms.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mModel Store: /.sagemaker/mms/models[0m
[34mInitial Models: ALL[0m
[34mLog dir: null[0m
[34mMetrics dir: null[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 8[0m
[34mBlacklist Regex: N/A[0m
[34mMaximum Response Size: 6553500[0m
[34mMaximum Request Size: 6553500[0m
[34mPreload model: false[0m
[34mPrefer direct buffer: false[0m
[34m2024-08-18T17:02:17,230 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStrea

Get the test set predictions from S3.

In [26]:
def _probabilities_by_label(probabilities, labels):
    """Map each class label (as a string) to its predicted probability.

    Both arguments are raw string cells of the batch transform output, e.g.
    "[0.93, 0.07]" and "['1', '0']". The labels cell uses single quotes, which
    is not valid JSON, so the quotes are normalised before parsing.
    """
    parsed_probabilities = json.loads(probabilities)
    parsed_labels = json.loads(labels.replace("'", '"'))
    return {
        str(label): probability
        for label, probability in zip(parsed_labels, parsed_probabilities)
    }


# download the raw batch transform output from S3
raw_predictions = session.read_s3_file(
    bucket=bucket,
    key_prefix=f"{transformer.latest_transform_job.name}/test.csv.out"
)

# cast the predictions to data frame: column 0 holds the probabilities and
# column 1 the class labels they refer to (both as string-encoded lists)
predictions = pd.read_csv(io.StringIO(raw_predictions), header=None)

# extract the predicted probabilities, matching each one to its class through
# the returned labels instead of assuming a fixed label order
class_probabilities = [
    _probabilities_by_label(probabilities, labels)
    for probabilities, labels in zip(predictions.iloc[:, 0], predictions.iloc[:, 1])
]
predictions["Class 0 Probability"] = [p["0"] for p in class_probabilities]
predictions["Class 1 Probability"] = [p["1"] for p in class_probabilities]

# predicted class = the more probable one (ties resolve to class 0)
predictions["Predicted Trend"] = (
    predictions[["Class 0 Probability", "Class 1 Probability"]].to_numpy().argmax(axis=1)
)

# add the dates (the output rows are in the same order as the test set rows)
predictions.index = test_dataset.index

# add the ground truth labels
predictions["Actual Trend"] = test_dataset["Trend"]

In [27]:
# (rows, columns) of the predictions frame
predictions.shape

(30, 6)

In [28]:
# first rows: raw output columns, class probabilities, predicted and actual trend
predictions.head()

Unnamed: 0_level_0,0,1,Class 0 Probability,Class 1 Probability,Predicted Trend,Actual Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-06-18,"[0.9377749562263489, 0.06222505867481232]","['1', '0']",0.062225,0.937775,1,1
2024-06-20,"[0.30421024560928345, 0.6957897543907166]","['1', '0']",0.69579,0.30421,0,0
2024-06-21,"[0.20496052503585815, 0.7950394749641418]","['1', '0']",0.795039,0.204961,0,0
2024-06-24,"[0.1875396966934204, 0.8124603033065796]","['1', '0']",0.81246,0.18754,0,0
2024-06-25,"[0.626672625541687, 0.3733274042606354]","['1', '0']",0.373327,0.626673,1,1


In [29]:
# last rows of the predictions frame
predictions.tail()

Unnamed: 0_level_0,0,1,Class 0 Probability,Class 1 Probability,Predicted Trend,Actual Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-25,"[0.064228355884552, 0.935771644115448]","['1', '0']",0.935772,0.064228,0,0
2024-07-26,"[0.8685197830200195, 0.13148021697998047]","['1', '0']",0.13148,0.86852,1,1
2024-07-29,"[0.792151927947998, 0.20784805715084076]","['1', '0']",0.207848,0.792152,1,1
2024-07-30,"[0.42239910364151, 0.57760089635849]","['1', '0']",0.577601,0.422399,0,0
2024-07-31,"[0.8618403673171997, 0.1381596028804779]","['1', '0']",0.13816,0.86184,1,1


Calculate the classification metrics.

In [30]:
# One row per metric. ROC-AUC is scored on the class 1 probability; the other
# metrics compare the predicted class with the actual class.
metrics = pd.DataFrame(
    data=[
        ("Accuracy", accuracy_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"])),
        ("ROC-AUC", roc_auc_score(y_true=predictions["Actual Trend"], y_score=predictions["Class 1 Probability"])),
        ("Precision", precision_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"])),
        ("Recall", recall_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"])),
        ("F1", f1_score(y_true=predictions["Actual Trend"], y_pred=predictions["Predicted Trend"])),
    ],
    columns=["Metric", "Value"],
)

In [31]:
# display the metrics table
metrics

Unnamed: 0,Metric,Value
0,Accuracy,0.866667
1,ROC-AUC,0.953704
2,Precision,0.9375
3,Recall,0.833333
4,F1,0.882353


Calculate the confusion matrix.

In [32]:
# Counts of test observations by actual class (rows) and predicted class (columns).
matrix = pd.crosstab(predictions["Actual Trend"], predictions["Predicted Trend"])

In [33]:
# display the confusion matrix
matrix

Predicted Trend,0,1
Actual Trend,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11,1
1,3,15


Delete the model.

In [34]:
# delete the SageMaker model that was created for the batch transform job
transformer.delete_model()

INFO:sagemaker:Deleting model with name: equity-trend-model
