### Notebook Env Details
`kernel`: Python 3
`image`: TensorFlow 2.6 Python 3.8 CPU
`instance`: ml.t3.medium

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Setup

In [2]:
%store -r
%store

Stored variables and their in-db values:
data_bucket             -> 'sagemaker-us-east-1-717145514721/nyc-taxi/data/pr


In [3]:
# Ensure updated SageMaker SDK version
%pip install -U -q sagemaker

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.6.2 requires typing-extensions~=3.7.4, but you have typing-extensions 4.8.0 which is incompatible.
awscli 1.22.22 requires botocore==1.23.22, but you have botocore 1.33.6 which is incompatible.
awscli 1.22.22 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0.1 which is incompatible.
awscli 1.22.22 requires s3transfer<0.6.0,>=0.5.0, but you have s3transfer 0.8.2 which is incompatible.[0m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


<div style="background-color: darkgreen; font-size: 20px; color: white;">
Download Dataset Sample

In [4]:
import sagemaker

data_bucket_s3_uri = "s3://" + data_bucket

# Filter directory for csv files
csv_files = [
    x for x in sagemaker.s3.S3Downloader.list(data_bucket_s3_uri) if x[-4:] == ".csv"
]

# Download one csv file
sagemaker.s3.S3Downloader.download(csv_files[0], "demo_data")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


['demo_data/part-00000-a58e012b-e1b3-4257-9a85-5afcf3f9c812-c000.csv']

In [5]:
import glob
import pandas as pd

# Find the file that matches the directory
csv_file = glob.glob("demo_data/*.csv")[0]

column_headers = [
    "day_of_week",
    "month",
    "hour",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "fare_amount",
]

raw_dataset = pd.read_csv(csv_file, names=column_headers)
raw_dataset.head()

Unnamed: 0,day_of_week,month,hour,pickup_location_id,dropoff_location_id,trip_distance,fare_amount
0,1,1,0,7,7,0.04,2.5
1,1,1,0,7,7,0.47,4.0
2,1,1,0,7,7,0.51,4.0
3,1,1,0,7,7,0.63,4.5
4,1,1,0,7,7,0.77,5.0


In [6]:
linear_input = raw_dataset[["day_of_week", "month", "hour", "trip_distance"]]
dnn_input = raw_dataset[
    [
        "pickup_location_id",
        "dropoff_location_id",
        "trip_distance",
    ]
]
y = raw_dataset[["fare_amount"]]

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Tensorflow 'Wide and Deep' Model

Architecture Article: https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html

In [8]:
import tensorflow as tf
from tensorflow.keras.experimental import LinearModel, WideDeepModel
from tensorflow import keras

<div style="background-color: teal; font-size: 15px; color: white;">
Pre-Handling Record Creation and Data Loading into Tensorflow Model

In [10]:
def pack(features, label):
    linear_features = [
        tf.cast(features["day_of_week"], tf.float32),
        tf.cast(features["month"], tf.float32),
        tf.cast(features["hour"], tf.float32),
        features["trip_distance"],
    ]

    dnn_features = [
        tf.cast(features["pickup_location_id"], tf.float32),
        tf.cast(features["dropoff_location_id"], tf.float32),
        features["trip_distance"],
    ]

    return (tf.stack(linear_features, axis=-1), tf.stack(dnn_features, axis=-1)), label


ds = tf.data.experimental.make_csv_dataset(
    csv_file,
    batch_size=1,
    column_names=column_headers,
    num_epochs=5,
    shuffle=False,
    label_name="fare_amount",
)
ds = ds.map(pack)

In [11]:
iterator = iter(ds)
(x1, x2), y = next(iterator)

print(x1)
print(x2)
print(y)

tf.Tensor([[1.   1.   0.   0.47]], shape=(1, 4), dtype=float32)
tf.Tensor([[7.   7.   0.47]], shape=(1, 3), dtype=float32)
tf.Tensor([4.], shape=(1,), dtype=float32)


<div style="background-color: darkgreen; font-size: 20px; color: white;">
Building the Regression Model

In [13]:
# Increase Batch Size
ds = tf.data.experimental.make_csv_dataset(
    csv_file,
    batch_size=128,
    column_names=column_headers,
    num_epochs=1,
    shuffle=False,
    label_name="fare_amount",
)
ds = ds.map(pack)

In [14]:
class SageMakerExperimentCallback(keras.callbacks.Callback):
    def __init__(self, run):
        super().__init__()
        self.run = run

    def on_epoch_end(self, epoch, logs=None):
        self.run.log_metric(name="loss", value=logs["loss"], step=epoch)
        self.run.log_metric(name="mse", value=logs["mse"], step=epoch)

<div style="background-color: darkgreen; font-size: 20px; color: white;">
Train Run

In [15]:
from sagemaker.experiments import Run

experiment_name = "TaxiFare-Experiment"
run_name = "Local-Notebook-Run"
optimizer = "Adam"
epochs = 5

with Run(experiment_name=experiment_name, run_name=run_name) as run:
    run.log_parameters({"optimizer": optimizer, "epochs": epochs})

    linear_model = LinearModel()
    dnn_model = keras.Sequential(
        [
            keras.layers.Flatten(),
            keras.layers.Dense(128, activation="elu"),
            keras.layers.Dense(64, activation="elu"),
            keras.layers.Dense(32, activation="elu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    combined_model = WideDeepModel(linear_model, dnn_model)
    combined_model.compile(optimizer=optimizer, loss="mse", metrics=["mse"])

    combined_model.fit(ds, epochs=epochs, callbacks=SageMakerExperimentCallback(run))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.experiments.run:The run (local-notebook-run) under experiment (taxifare-experiment) already exists. Loading it.


Epoch 1/5
Extension horovod.torch has not been built: /usr/local/lib/python3.8/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-38-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2023-12-04 19:14:18.552 tensorflow-2-6-cpu-py-ml-t3-medium-9169b2e75617c45c79c40579f6a8:20 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-12-04 19:14:18.811 tensorflow-2-6-cpu-py-ml-t3-medium-9169b2e75617c45c79c40579f6a8:20 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
