### Notebook Env Details
`kernel`: Python 3
`image`: Data Science 3.0
`instance`: ml.t3.medium

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Setup

In [2]:
# Restore every variable previously persisted with `%store` into this kernel.
# `data_bucket` (listed in the output below) is needed later to build the S3 input URIs.
%store -r
# List the stored variables and their values as a quick sanity check.
%store

Stored variables and their in-db values:
data_bucket             -> 'sagemaker-us-east-1-717145514721/nyc-taxi/data/pr


In [3]:
# Ensure updated SageMaker SDK version
%pip install -U -q sagemaker

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.29.63 requires botocore==1.31.63, but you have botocore 1.33.6 which is incompatible.
awscli 1.29.63 requires s3transfer<0.8.0,>=0.7.0, but you have s3transfer 0.8.2 which is incompatible.
distributed 2022.7.0 requires tornado<6.2,>=6.0.3, but you have tornado 6.3.3 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


<div style="background-color: darkgreen; font-size: 20px; color: white;">
Training Script

In [5]:
%%writefile training_scripts/tf_train.py

import os
import argparse

import boto3
import tensorflow as tf
from tensorflow.keras.experimental import LinearModel, WideDeepModel
from tensorflow import keras
from sagemaker.experiments import load_run
from sagemaker.session import Session



class SageMakerExperimentCallback(keras.callbacks.Callback):
    """Keras callback that forwards per-epoch metrics to a SageMaker Experiments run.

    Args:
        run: Object exposing ``log_metric(name=..., value=..., step=...)``; in this
            script it is the run yielded by ``load_run``.
    """

    # Keys of the Keras ``logs`` dict that are reported after every epoch.
    METRIC_NAMES = ("loss", "mse")

    def __init__(self, run):
        super().__init__()
        self.run = run

    def on_epoch_end(self, epoch, logs=None):
        """Log each available metric in ``METRIC_NAMES`` with ``epoch`` as the step.

        ``logs`` is optional in the Keras callback signature, so a missing dict or a
        missing metric is skipped instead of raising ``TypeError`` / ``KeyError`` and
        aborting the training loop.
        """
        logs = logs or {}
        for metric_name in self.METRIC_NAMES:
            if metric_name in logs:
                self.run.log_metric(name=metric_name, value=logs[metric_name], step=epoch)


def parse_args():
    """Parse the training job's command-line arguments.

    Hyperparameters sent by the client arrive as ``--name value`` pairs. Data and
    model directories default to the ``SM_CHANNEL_*`` / ``SM_MODEL_DIR`` environment
    variables when they are set, and may be overridden on the command line so the
    script can also be run outside a SageMaker container.

    Returns:
        tuple: ``(namespace, unknown_args)`` as returned by
        ``ArgumentParser.parse_known_args``; unrecognised arguments are not an error.

    Raises:
        SystemExit: via ``parser.error`` when no training directory is available from
            either ``--training`` or ``SM_CHANNEL_TRAINING``.
    """
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=64)
    # 1e-3 is the Keras Adam default; the former default of 0.1 is far too large for the
    # Adam optimizer this script trains with.
    parser.add_argument("--learning_rate", type=float, default=0.001)

    # data directories
    # Use .get() so a missing environment variable does not raise KeyError while the
    # parser is still being built (which would happen even if the flag was supplied).
    parser.add_argument("--training", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
    parser.add_argument("--testing", type=str, default=os.environ.get("SM_CHANNEL_TESTING"))

    # model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--sagemaker_region", type=str, default='us-east-1')

    args, unknown = parser.parse_known_args()

    # The training channel is the only directory the script actually reads, so fail
    # early with a clear usage message when it is unavailable.
    if not args.training:
        parser.error("no training data directory: pass --training or set SM_CHANNEL_TRAINING")

    return args, unknown

def get_train_data(train_dir, batch_size):
    """Build the batched training dataset from the CSV files in ``train_dir``.

    Args:
        train_dir: Directory whose ``*.csv`` files are read.
        batch_size: Number of records per batch.

    Returns:
        A dataset yielding ``((linear_features, dnn_features), label)`` where the two
        feature tensors are stacked along the last axis and ``label`` is the
        ``fare_amount`` column.
    """
    csv_columns = [
        "day_of_week",
        "month",
        "hour",
        "pickup_location_id",
        "dropoff_location_id",
        "trip_distance",
        "fare_amount",
    ]

    def pack(features, label):
        # Cast the selected column to float32 so it can be stacked with trip_distance.
        # NOTE(review): assumes trip_distance is already parsed as float32 - confirm.
        def as_float(column):
            return tf.cast(features[column], tf.float32)

        # Inputs for the linear ("wide") model.
        wide_inputs = tf.stack(
            [as_float('day_of_week'), as_float('month'), as_float('hour'), features["trip_distance"]],
            axis=-1,
        )
        # Inputs for the DNN ("deep") model.
        deep_inputs = tf.stack(
            [as_float("pickup_location_id"), as_float("dropoff_location_id"), features["trip_distance"]],
            axis=-1,
        )
        return (wide_inputs, deep_inputs), label

    csv_files = tf.io.gfile.glob(train_dir + '/*.csv')
    # NOTE(review): `header` is left at the library default here - verify that it matches
    # whether the CSV files actually start with a header row.
    batches = tf.data.experimental.make_csv_dataset(
        csv_files,
        batch_size=batch_size,
        column_names=csv_columns,
        num_epochs=1,
        shuffle=True,
        label_name="fare_amount",
    )
    return batches.map(pack)


if __name__ == "__main__":
    # Entry point: parse hyperparameters, build the input pipeline, then train a
    # wide & deep regression model while logging metrics to the SageMaker Experiments run.
    args, _ = parse_args()

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    train_dir = args.training
    region = args.sagemaker_region
    ds = get_train_data(train_dir, batch_size)

    # Build the SageMaker session from an explicit region so load_run can reach the
    # Experiments API from inside the training process.
    boto_session = boto3.session.Session(region_name=region)
    sagemaker_session = Session(boto_session=boto_session)

    with load_run(sagemaker_session=sagemaker_session) as run:
        linear_model = LinearModel()
        dnn_model = keras.Sequential([
            keras.layers.Flatten(),
            keras.layers.Dense(128, activation='elu'),
            keras.layers.Dense(64, activation='elu'),
            keras.layers.Dense(32, activation='elu'),
            # Linear output: the target (fare_amount) is an unbounded regression value,
            # and a sigmoid here would cap the deep tower's contribution to (0, 1).
            keras.layers.Dense(1),
        ])
        combined_model = WideDeepModel(linear_model, dnn_model)
        # Use an optimizer instance so the --learning_rate hyperparameter is honoured;
        # passing the string "Adam" ignored the parsed value.
        combined_model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
            loss="mse",
            metrics=["mse"],
        )

        # Keras documents `callbacks` as a list of callback instances.
        combined_model.fit(ds, epochs=epochs, callbacks=[SageMakerExperimentCallback(run)])

        # NOTE(review): the trained model is never written to args.model_dir, so the job
        # produces no model artifact - confirm whether that is intended.

Writing training_scripts/tf_train.py


In [6]:
%%writefile training_scripts/requirements.txt
# tf_train.py imports sagemaker.experiments.load_run; presumably this file is picked up from source_dir by the training container - confirm.
sagemaker >= 2.123.0

Writing training_scripts/requirements.txt


<div style="background-color: darkgreen; font-size: 20px; color: white;">
Run Training Job (within SageMaker Experiments)

In [9]:
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker.experiments import Run

# One session shared by the role lookup, the experiment run and the estimator, so they
# all talk to the same region/account instead of each building its own default session.
sess = sagemaker.Session()
role = sagemaker.get_execution_role(sagemaker_session=sess)
bucket = sess.default_bucket()
# S3 URI prefix under which the training job writes its output.
output_bucket = f"s3://{bucket}/nyc-taxi/model/"


experiment_name = "TaxiFare-Experiment"
run_name = "TrainingJob-Run"

# Launch the job inside a Run context; the training script calls load_run() without
# naming a run, which per the SageMaker Experiments docs resolves to this run.
with Run(experiment_name=experiment_name, run_name=run_name, sagemaker_session=sess) as run:
    tf_estimator = TensorFlow(
        source_dir="training_scripts",
        entry_point="tf_train.py",
        base_job_name="tf2-taxi-wide-deep",
        role=role,
        framework_version="2.6.2",
        py_version="py38",
        input_mode="File",
        output_path=output_bucket,
        instance_count=1,
        instance_type="ml.c4.xlarge",
        sagemaker_session=sess,
        hyperparameters={
            "batch_size": 512,
            "epochs": 5,
            # tf_train.py defaults --sagemaker_region to us-east-1; pass the notebook's
            # actual region so the script's session works in any region.
            "sagemaker_region": sess.boto_region_name,
        },
    )

    # `data_bucket` is restored by `%store -r` in the Setup section.
    tf_estimator.fit(
        {
            "training": f"s3://{data_bucket}/train/",
            "testing": f"s3://{data_bucket}/test/",
        },
        logs=True,
    )

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.experiments.run:The run (trainingjob-run) under experiment (taxifare-experiment) already exists. Loading it.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: tf2-taxi-wide-deep-2023-12-04-19-36-21-458


2023-12-04 19:36:21 Starting - Starting the training job...
2023-12-04 19:36:38 Starting - Preparing the instances for training.........
2023-12-04 19:38:09 Downloading - Downloading input data...
2023-12-04 19:38:44 Training - Downloading the training image...
2023-12-04 19:39:15 Training - Training image download completed. Training in progress..[34m2023-12-04 19:39:21.316894: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2023-12-04 19:39:21.317079: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2023-12-04 19:39:21.345211: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2023-12-04 19:39:22,450 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2023-12-04 19:39: