In [None]:
import os
import urllib.request

# Download the gzipped HIGGS CSV into the local ./data directory.
src_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
dest_file = "./data/higgs.csv.gz"

# urlretrieve() does not create parent directories; without this a fresh
# checkout fails with FileNotFoundError before anything is downloaded.
os.makedirs(os.path.dirname(dest_file), exist_ok=True)
urllib.request.urlretrieve(src_url, dest_file)

In [None]:
import pandas as pd

# The file has no header row. Names are supplied for the full layout
# (label, 21 kinematic columns, 7 high-level columns), but only the first
# 22 columns -- the label plus the kinematic ones -- are actually loaded.
kinematic_cols = [f"kinematic_{idx}" for idx in range(1, 22)]
high_level_cols = [f"high_level_feature_{idx}" for idx in range(1, 8)]

df = pd.read_csv(
    dest_file,
    header=None,
    names=["target", *kinematic_cols, *high_level_cols],
    usecols=range(22),
    float_precision="round_trip",
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set, preserving the class balance of
# "target". random_state is pinned so that re-running the notebook yields the
# same split; without it the test set changes on every run.
train_val_df, test_df = train_test_split(
    df,
    test_size=.1,
    stratify=df["target"],
    random_state=42,
)

In [None]:
# Split the remaining 90% into train and validation (15% of it, i.e. 13.5% of
# all rows, goes to validation), again stratified on "target". random_state is
# pinned so the train/validation membership is reproducible across runs.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=.15,
    stratify=train_val_df["target"],
    random_state=42,
)

In [None]:
# Persist the three splits as headerless, gzipped CSV files.
# index=False is required: to_csv() writes the DataFrame index by default, and
# after the shuffled splits that would prepend an arbitrary row number as the
# first column, pushing "target" out of position 0.
# NOTE(review): assumes train.py expects "target" as the first column -- confirm.
train_df.to_csv("./data/train.csv.gz", header=False, index=False, compression="gzip")
val_df.to_csv("./data/val.csv.gz", header=False, index=False, compression="gzip")
test_df.to_csv("./data/test.csv.gz", header=False, index=False, compression="gzip")

In [None]:
import boto3

# AWS session bound to us-east-1; reused below to build the SageMaker session.
boto3_session = boto3.Session(region_name="us-east-1")

In [None]:
import sagemaker

# SageMaker session built on the boto3 session above, with "sagemaker-acme"
# as its default bucket.
sess = sagemaker.Session(
    boto_session=boto3_session,
    default_bucket="sagemaker-acme",
)

In [None]:
# Upload each split through the SageMaker session (train, then val, then test)
# and keep what upload_data() returns for each; the train and validation
# values are later passed as input channels to the training job.
train_data_location, val_data_location, test_data_location = (
    sess.upload_data(f"./data/{split}.csv.gz") for split in ("train", "val", "test")
)

In [None]:
import tarfile

# Package the training script into a gzip-compressed tarball in the working directory.
with tarfile.open("model.tar.gz", "w:gz") as archive:
    archive.add("train.py")

In [None]:
# Upload the packaged training script; the returned location is passed to the
# estimator as its "sagemaker_submit_directory" hyperparameter.
model_location = sess.upload_data(path="model.tar.gz")

In [None]:
from sagemaker.estimator import Estimator
import os

# Resolve the ECR account id up front. os.getenv() returns None when ACC_ID is
# unset, which would be formatted into an image URI beginning with "None." and
# only surface as a failure later, when the job is launched.
account_id = os.environ.get("ACC_ID")
if not account_id:
    raise RuntimeError(
        "Environment variable ACC_ID must be set to the AWS account id that hosts the training image"
    )

# Single-instance GPU training job on spot capacity, using a custom MXNet image.
estimator = Estimator(
    image_name=f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/mxnet-training:1.6.0-gpu",
    role="<redacted>",
    train_instance_count=1,
    train_instance_type="ml.p3.8xlarge",
    sagemaker_session=sess,
    subnets=["<redacted>", "<redacted>"],
    security_group_ids=["<redacted>"],
    train_use_spot_instances=True,
    # Both limits are 12 hours, expressed in seconds.
    train_max_run=12*60*60,
    train_max_wait=12*60*60,
    # String values carry an extra pair of double quotes -- presumably because
    # the container JSON-decodes each hyperparameter; verify against train.py.
    # NOTE(review): the MLflow secret is passed as a plain hyperparameter;
    # consider sourcing it from a secrets manager instead.
    hyperparameters={
        "sagemaker_submit_directory": f'"{model_location}"',
        "sagemaker_program": '"train.py"',
        "mlflow-secret": '"<redacted>"',
        "mlflow-artifacts-location": '"s3://<redacted>/mlruns"',
        "epochs": "8",
        "learning-rate": "0.01"
    })

In [None]:
# Launch the training job with the uploaded train and validation splits as
# its two input channels.
estimator.fit(
    inputs=dict(
        train=train_data_location,
        validation=val_data_location,
    )
)