In [1]:
import os
import pandas as pd

In [2]:
data_dir = os.path.join(os.getcwd(), "..", "data")
daily_file = os.path.join(data_dir, "transit_daily.csv")

In [3]:
daily = pd.read_csv(daily_file, low_memory=False)

In [4]:
# gets a smaller version for easy troubleshooting
daily = daily.head(5000)

In [5]:
daily.deviance = daily.deviance.apply(lambda x: pd.to_timedelta(x).total_seconds())
daily.service_date = daily.service_date.apply(lambda x: pd.to_datetime(x))
# moves the labels one stop back, so that the previous stops information in the input for the current stops label
daily.deviance = daily.deviance.shift(-1)
daily = daily.dropna(axis=0, subset=["deviance"])

In [6]:
categories = [
    "trip_number",
    "gtfs_stop_time_id",
    "gtfs_stop_id",
    "gtfs_trip_id",
    "train",
    "trip_id",
    "data_agency",
    "data_source",
    "direction",
    "schedule_status",
    "service_key",
    "stop_id",
    "vehicle_number",
    "route_number",
    "trip_number",
]

# should work from what I understand, but doesn't...
# daily[categories] = daily[categories].astype("category")


def make_categories(make_category, df):
    for category in make_category:
        df[category] = pd.Categorical(df[category])


make_categories(categories, daily)

print(daily.dtypes)

id                           object
service_date         datetime64[ns]
vehicle_number             category
route_number               category
trip_number                category
arrival_time                float64
departure_time              float64
stop_id                    category
door                        float64
lift                        float64
ons                         float64
offs                        float64
estimated_load              float64
capacity                    float64
early                       float64
on_time                     float64
late                        float64
gtfs_stop_id               category
data_source                category
direction                  category
dwell                       float64
location_distance           float64
maximum_speed               float64
pattern_distance            float64
schedule_status            category
service_key                category
stop_time                   float64
train                      c

In [7]:
used_cols = [
    "ons",
    "offs",
    "maximum_speed",
    "deviance",
    "service_key",
]
x = daily[used_cols]
x = pd.get_dummies(x)  # turns all categoricals into one hot encoded columns!

target = x.pop("deviance")

In [8]:
print(x)
print(target)

      ons  offs  maximum_speed  service_key_U
0     0.0   0.0           26.0              1
1     0.0   0.0           35.0              1
2     1.0   0.0           26.0              1
3     0.0   0.0           33.0              1
4     0.0   0.0           33.0              1
...   ...   ...            ...            ...
4994  1.0   1.0           25.0              1
4995  0.0   0.0           25.0              1
4996  1.0   0.0           24.0              1
4997  0.0   0.0           17.0              1
4998  0.0   0.0           26.0              1

[4938 rows x 4 columns]
0         3.0
1       292.0
2       -72.0
3       -18.0
4       136.0
        ...  
4994    -11.0
4995     27.0
4996      8.0
4997    -95.0
4998    -21.0
Name: deviance, Length: 4938, dtype: float64


In [14]:
import tensorflow as tf

# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe

batch_size = 30
data_len = len(x)
train_test_split = int(0.7 * data_len)

dataset = tf.data.Dataset.from_tensor_slices((x.values, target.values))
dataset = dataset.shuffle(data_len)
train_dataset = dataset.take(train_test_split).batch(batch_size)
test_dataset = dataset.skip(train_test_split).batch(batch_size)

In [15]:
# for features, target in train_dataset.take(5):
#     print("Features: {}, Target: {}".format(features, target))

In [16]:
def get_compiled_model():
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(10, activation="relu"),
            tf.keras.layers.Dense(10, activation="relu"),
            tf.keras.layers.Dense(1),
        ]
    )

    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.MeanSquaredError(
            reduction="auto", name="mean_squared_error"
        ),
        metrics=["accuracy", "mean_absolute_error"],
    )

    return model

In [None]:
model = get_compiled_model()
model.fit(train_dataset, epochs=2)

Epoch 1/2

In [13]:
model.evaluate(test_dataset)



[37940.61328125, 0.005398110952228308, 117.55239868164062]