## Overview

The purpose of this notebook is to train a model that predicts the number of cyclists for a given location in Edmonton.

In [None]:
!pip install --upgrade pandas==1.5.3
!pip install --upgrade numpy==1.24.3
!pip install --upgrade matplotlib==3.7.1

import pkg_resources

pkg_resources.require("pandas==1.5.3")
pkg_resources.require("numpy==1.24.3")

import pandas
import matplotlib.pyplot as plt
import numpy as np

# This ensures plots are displayed inline in the Jupyter notebook
%matplotlib inline

## Collect Data

Collect the data from Edmonton's data hub.

### Collect daily forecast

The *Speed of Maximum Wind Gust (km/h)* column from the data source has mixed data types. In the snippet below everything is cleaned and converted to a float. If the value cannot be converted to a float it is considered NaN.

In [None]:
# Path of the daily weather export used throughout the notebook.
dataFilePath = "data/Weather_Data__Daily__-_Environment_Canada.csv"

# A one-row read is enough to list the column headers.
cols = pandas.read_csv(dataFilePath, nrows=1).columns.tolist()
print(cols)

In [None]:
# Weather station(s) whose daily observations are kept.
edmontonWeatherStationName = ["EDMONTON STONY PLAIN"]

# Column headers of the weather file, taken from a one-row read.
cols = pandas.read_csv(dataFilePath, nrows=1).columns.tolist()

def speed_of_max_wind_gust(col):
    """Convert one raw 'Speed of Maximum Wind Gust (km/h)' cell to a float.

    ``pandas.read_csv`` passes converter functions the raw cell text (a
    ``str``), so a plain ``isinstance(col, float)`` check rejects every value
    read from the file and turns the whole column into NaN. Parsing with
    ``float()`` keeps the numeric readings and still maps anything that is not
    a number (empty cells, flagged values such as "<31", ``None``) to NaN.

    Args:
        col: The raw cell value; usually a string, but floats are accepted
            and returned unchanged.

    Returns:
        The value as a ``float``, or NaN when it cannot be converted.
    """
    try:
        return float(col)
    except (TypeError, ValueError):
        return float('NaN')
    
# Load the daily weather records, cleaning the wind-gust column while parsing.
gustConverters = {'Speed of Maximum Wind Gust (km/h)': speed_of_max_wind_gust}
edmontonDailyWeather = pandas.read_csv(dataFilePath, converters=gustConverters)

# Keep only the rows reported by the selected station(s).
isSelectedStation = edmontonDailyWeather["Station Name"].isin(edmontonWeatherStationName)
edmontonDailyWeather = edmontonDailyWeather[isSelectedStation]
edmontonDailyWeather.head(10)

### Collect daily cyclist count

In [None]:
# Counter location(s) whose daily counts are modelled.
edmontonLocation = ["100 Avenue E of 107 Street"]

allBikeCounts = pandas.read_csv("data/Daily_Pedestrian_and_Bike_Counts.csv")

# Take an explicit copy of the filtered rows: adding columns to a boolean-mask
# slice triggers SettingWithCopyWarning and leaves it ambiguous whether the
# assignment lands on a view or a copy.
isSelectedLocation = allBikeCounts["Counter Location Description"].isin(edmontonLocation)
edmontonBikeCounts = allBikeCounts[isSelectedLocation].copy()

# Parse the timestamp column once and derive the date parts used as join keys.
# ('Log Timstamp' is the column name as spelled in the data file.)
logTimestamps = pandas.DatetimeIndex(edmontonBikeCounts['Log Timstamp'])
edmontonBikeCounts['Year'] = logTimestamps.year
edmontonBikeCounts['Month'] = logTimestamps.month
edmontonBikeCounts['Day'] = logTimestamps.day
edmontonBikeCounts.head(10)


In [None]:
fig, ax = plt.subplots(2, 1, figsize = (9,12))
histAx, boxAx = ax

label = edmontonBikeCounts['Total Cyclist Count']

# Top panel: histogram of the daily counts.
histAx.hist(label, bins=100)
histAx.set_ylabel('Frequency')

# Mark the mean (magenta) and the median (cyan) on the histogram.
for statistic, colour in ((label.mean(), 'magenta'), (label.median(), 'cyan')):
    histAx.axvline(statistic, color=colour, linestyle='dashed', linewidth=2)

# Bottom panel: horizontal boxplot of the same values.
boxAx.boxplot(label, vert=False)
boxAx.set_xlabel('Total Cyclist Count')

plt.autoscale()

# Figure-level title naming the counter location.
fig.suptitle('Total Cyclist Count Distribution for ' + edmontonLocation[0])

### Join the weather & daily cyclist count

Combine the weather and cyclist count datasets into one dataset.

In [None]:
# Join weather and bike counts on the calendar date parts (merge's default join type).
combined = edmontonDailyWeather.merge(edmontonBikeCounts, on=["Year", "Month", "Day"])

# Columns explored in the cells that follow.
numeric_features = ['Mean Temperature (C)', 'Total Rain (mm)', 'Total Snow (cm)']
categorical_features = ['Month', 'Day']

print(combined.columns)

## Explore the combined data

Explore some data that we think will have an impact on the number of cyclists for any given day.

In [None]:
numeric_features = ['Mean Temperature (C)', 'Total Rain (mm)', 'Total Snow (cm)']

# One histogram per numeric feature, with mean (magenta) and median (cyan) marked.
for col in numeric_features:
    feature = combined[col]
    fig, ax = plt.subplots(figsize=(9, 6))
    feature.hist(bins=100, ax=ax)
    for statistic, colour in ((feature.mean(), 'magenta'), (feature.median(), 'cyan')):
        ax.axvline(statistic, color=colour, linestyle='dashed', linewidth=2)
    ax.set_title(col)
plt.show()

In [None]:
# Bar chart of the number of rows per category value, ordered by value.
for col in categorical_features:
    counts = combined[col].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(9, 6))
    counts.plot.bar(ax=ax, color='steelblue')
    ax.set_title(col + ' counts')
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Scatter each numeric feature against the label and report their correlation in the title.
for col in numeric_features:
    feature = combined[col]
    label = combined['Total Cyclist Count']
    correlation = feature.corr(label)
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.scatter(x=feature, y=label)
    ax.set_xlabel(col)
    ax.set_ylabel('Total Cyclist Count')
    ax.set_title(f'Total Cyclist Count vs {col}- correlation: {correlation!s}')
plt.show()

In [None]:
# Distribution of the label within each value of every categorical feature.
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    combined.boxplot(column='Total Cyclist Count', by=col, ax=ax)
    ax.set_title('Label by ' + col)
    ax.set_ylabel("Total Cyclist Count")
plt.show()

## Train the Model using TensorFlow

In [None]:
!pip install --upgrade tensorflow==2.13.0

In [None]:
import pkg_resources

pkg_resources.require("tensorflow==2.13.0")

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

### Combine cyclist and weather data

Combine the cyclist and weather data together using the date. Once the data is combined, select only the feature and label columns. Feature columns are separated into numeric and categorical features. Numeric features will be normalized and categorical features will be one-hot encoded.

In [None]:
numeric_features = ['MeanTemperature', 'TotalRain', 'TotalSnow']
categorical_features = ['Month','Day']
featureCols = ['MeanTemperature', 'TotalRain', 'TotalSnow', 'Month', 'Day']
labelCol = 'TotalCyclistCount'
# Build a new list rather than aliasing featureCols: with `allCols = featureCols`
# the append below also added the label column to featureCols.
allCols = featureCols + [labelCol]

allOriginalCols = ['Mean Temperature (C)', 'Total Rain (mm)', 'Total Snow (cm)', 'Total Cyclist Count', 'Month', 'Day']

# Re-merge so this cell does not depend on how `combined` was left by earlier cells,
# then keep only the feature and label columns.
combined = pandas.merge(edmontonDailyWeather, edmontonBikeCounts, on=["Year", "Month", "Day"])
combined = combined[allOriginalCols]

# Rename to identifier-style names; these become the Keras input names.
combinedWithRenamedCols = combined.rename(columns={'Mean Temperature (C)': 'MeanTemperature', 'Total Rain (mm)': 'TotalRain', 'Total Snow (cm)':'TotalSnow', 'Total Cyclist Count': 'TotalCyclistCount'})

# Shuffle once with a fixed seed so the split is reproducible on Restart & Run All,
# then cut 80% / 10% / 10% with positional slices (avoids np.split on a DataFrame).
shuffledRows = combinedWithRenamedCols.sample(frac=1, random_state=42)
trainEnd = int(0.8 * len(shuffledRows))
valEnd = int(0.9 * len(shuffledRows))
train_dataset = shuffledRows.iloc[:trainEnd]
val_dataset = shuffledRows.iloc[trainEnd:valEnd]
test_dataset = shuffledRows.iloc[valEnd:]

print (combinedWithRenamedCols.head(10))
print(len(train_dataset), 'training examples')
print(len(val_dataset), 'validation examples')
print(len(test_dataset), 'test examples')

### Define a utility to convert a Pandas data frame to Tensorflow dataset

In [None]:
def df_to_dataset(dataframe, labelCol, shuffle=True, batch_size=32):
  """Convert a data frame into a batched tf.data.Dataset of (features, label).

  Args:
    dataframe: Source pandas DataFrame; it is copied, not modified.
    labelCol: Name of the column to split off as the label.
    shuffle: When True, shuffle with a buffer as large as the frame.
    batch_size: Number of rows per batch (also used as the prefetch size).

  Returns:
    A tf.data.Dataset yielding (dict of column name -> column tensor, labels).
  """
  features = dataframe.copy()
  labels = features.pop(labelCol)
  # Build the feature dict from `features` (label already removed); iterating
  # the original `dataframe` here shipped the label column as an input feature.
  # Go through to_numpy() before adding the trailing axis, because
  # multi-dimensional indexing on a Series is deprecated/removed in pandas.
  feature_columns = {key: value.to_numpy()[:, tf.newaxis] for key, value in features.items()}
  ds = tf.data.Dataset.from_tensor_slices((feature_columns, labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

### Convert Pandas data frames (training, test, and validation) to Tensorflow datasets.

In [None]:
batch_size = 256

# Only the training split is shuffled; test and validation keep their row order.
train_ds = df_to_dataset(train_dataset, labelCol, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test_dataset, labelCol, shuffle=False, batch_size=batch_size)
val_ds = df_to_dataset(val_dataset, labelCol, shuffle=False, batch_size=batch_size)

# Pull a single batch for inspection.
train_features, label_batch = next(iter(train_ds.take(1)))

### Define feature wise layer functions

The normalization layer function creates a normalization layer for a specific numeric feature. The category encoding function creates a category encoding layer for a specific categorical feature.

In [None]:
def get_normalization_layer(name, dataset):
  """Build a Normalization layer adapted to one numeric feature.

  Args:
    name: Key of the feature inside each element's feature dict.
    dataset: Dataset of (features, label) pairs to learn the statistics from.

  Returns:
    The adapted `layers.Normalization` instance.
  """
  # Project every (features, label) element down to the single named feature.
  single_feature_ds = dataset.map(lambda features, _label: features[name])

  # axis=None, as in the original call; adapt() then learns the statistics.
  scaler = layers.Normalization(axis=None)
  scaler.adapt(single_feature_ds)
  return scaler

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a lookup + category-encoding callable for one categorical feature.

  Args:
    name: Key of the feature inside each element's feature dict.
    dataset: Dataset of (features, label) pairs used to learn the vocabulary.
    dtype: 'string' selects a StringLookup; anything else an IntegerLookup.
    max_tokens: Passed through to the lookup layer as its `max_tokens`.

  Returns:
    A callable that maps a feature tensor through the lookup layer and then
    through the CategoryEncoding layer.
  """
  # Strings and integers need different lookup layers to produce indices.
  lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
  index = lookup_cls(max_tokens=max_tokens)

  # Learn the vocabulary from the named feature only.
  index.adapt(dataset.map(lambda features, _label: features[name]))

  # The encoder width follows the size of the learned vocabulary.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    # Closes over both layers so they can be used in the Keras Functional model.
    return encoder(index(feature))

  return encode

### Create model input tensors

Create the inputs that will be used to build the model.

In [None]:
all_inputs = []
encoded_features = []

# Numerical features: one scalar input each, normalized with statistics
# learned from the training dataset.
for header in numeric_features:
  numeric_col_input = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col_input)
  all_inputs.append(numeric_col_input)
  encoded_features.append(encoded_numeric_col)

# Categorical features: one integer input each, passed through lookup + category encoding.
for header in categorical_features:
  categorical_col_input = tf.keras.Input(shape=(1,), name=header, dtype='int32')
  # max_tokens=None keeps the full vocabulary. The previous cap of 5 (which
  # includes the out-of-vocabulary slot) left only a handful of Month/Day
  # values distinguishable and collapsed the rest into one bucket.
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=train_ds,
                                               dtype='int32',
                                               max_tokens=None)
  encoded_categorical_col = encoding_layer(categorical_col_input)
  all_inputs.append(categorical_col_input)
  encoded_features.append(encoded_categorical_col)

print("Encoded Features:")
print(encoded_features)
print("Inputs:")
print(all_inputs)

### Create the model

In [None]:
# Concatenate every encoded feature into a single input vector.
all_features = tf.keras.layers.concatenate(encoded_features)

# One hidden ReLU layer with dropout, then a single-unit output layer.
hidden = tf.keras.layers.Dense(32, activation="relu")(all_features)
hidden_after_dropout = tf.keras.layers.Dropout(0.5)(hidden)
output = tf.keras.layers.Dense(1)(hidden_after_dropout)

model = tf.keras.Model(inputs=all_inputs, outputs=output)

# Adam optimizer with mean absolute error as the training loss.
adam = tf.keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=adam, loss='mean_absolute_error')
model.summary()

### Train the model

In [None]:
# Train for 20 epochs on the training batches, with the validation set supplied for per-epoch metrics.
history = model.fit(x=train_ds, validation_data=val_ds, epochs=20)

In [None]:
# Recorded training metrics as a table, with the epoch number as an extra column.
hist = pandas.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_loss(history):
  """Plot the 'loss' and 'val_loss' series recorded in a Keras History object.

  Args:
    history: Object whose `.history` dict holds 'loss' and 'val_loss' lists.
  """
  # Each curve is labelled with its own history key.
  for metric in ('loss', 'val_loss'):
    plt.plot(history.history[metric], label=metric)
  plt.xlabel('Epoch')
  plt.ylabel('Error [Total Cyclists]')
  plt.legend()
  plt.grid(True)

plot_loss(history)

In [None]:
predictions = model.predict(test_ds)
actual = test_dataset[labelCol]

# Predicted vs. actual counts for the test rows.
plt.scatter(actual, predictions)
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Daily Cyclist Count Predictions')

# Overlay a degree-1 least-squares fit. The fit result is indexed [i][0], so
# each coefficient is unwrapped from its length-1 row before building the poly.
z = np.polyfit(actual, predictions, 1)
slope, intercept = z[0][0], z[1][0]
p = np.poly1d([slope, intercept])
plt.plot(actual, p(actual), color='magenta')
plt.show()

### Try it out

Pass in some sample inputs to try the model out.

In [None]:
# One example day; the keys are the names of the model's inputs.
sampleInput = dict(
    MeanTemperature=-22,
    TotalRain=0,
    TotalSnow=0,
    Month=12,
    Day=23,
)

# Wrap every scalar in a one-element tensor before handing it to the model.
input_dict = {
    featureName: tf.convert_to_tensor([featureValue])
    for featureName, featureValue in sampleInput.items()
}
predictions = model.predict(input_dict)
print(predictions)

### Save/Export the model

In [None]:
# Persist the trained model under a path named after the counter location.
model.save(filepath='models/100-Avenue-E-of-107-Street-Model')