In [1]:
# TODO: Find the source of the top 20 issues to make sure it isn't a widespread issue
# Load the packages
# Load both datasets
# Transform both datasets into data loaders
# Create multiple models that predict ride count
    # Just ride count 
    # Ride count with other price, distance, etc
    # Ride count, price, distance, and time variables
    # Baseline model

# Time variable model
# Options:
    # Embed the time variables

# Model validation
    # Take a model, the test dataset, then run the test dataset through the model to compute MSE or sMAPE

## Load Packages

In [2]:
import sys, os
sys.path.append('..') # add parent directory to path
from typing import List, Tuple
from datetime import datetime
from enum import Enum

import numpy as np
import pandas as pd
# from utils import processing as pr
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
torch.set_printoptions(edgeitems=2, linewidth=75)

from sklearn.preprocessing import MinMaxScaler

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
def embed_size(distinct_values: int):
    return min(50, (distinct_values + 1) // 2)

## Load Datasets

In [47]:
mean_taxi_df = pd.read_pickle('data/test/adjusted_yellow_2022-01_2024-03_bypulocation.pkl')
zero_taxi_df = pd.read_pickle('data/test/yellow_2022-01_2024-03_bypulocation.pkl')

In [48]:
mean_taxi_df.to_csv("./taxi_actual.csv")

## Preprocessing

In [6]:
# Dataset implementation
import torch.utils

# Datset paramaters
# TODO: put these somewhere better
batch_size = None  # Equal to the number of zones for convenience
sequence_length = 96

HOURS_PER_SERIES = 19627


class TaxiDataset(torch.utils.data.Dataset):
    # The dataset must have batch_size == num_zones
    def __init__(
            self, 
            taxi_data: pd.DataFrame, 
            sequence_length, 
            continuous_features=False, 
            time_features=False, 
            use_alternate_scaler=False, 
            alternate_scaler_source=None
        ):
        self.sequence_length = sequence_length
        self.num_zones = taxi_data["PULocationID"].nunique()
        self.zones = torch.tensor(list(range(self.num_zones))).to(device)
        self.num_hours = taxi_data["pickup_datetime"].nunique()
        
        # Simplest way to organize the data is to have all zones in a batch
        self.batch_size = self.num_zones
        self.num_continuous_features = 1
        self.num_integer_features = 0

        self.use_alternate_scaler = use_alternate_scaler 
        # self.alternate_scaler_source=alternate_scaler_source

        taxi_data["counts"] = taxi_data["counts"].astype(np.float32)

        # All of the preprocessing will happpen here
        features_to_keep = ["PULocationID", "pickup_datetime", "counts"]
        if continuous_features:
            # Add continuous feature to keep list
            continuous_features = ['tip_amount', 'fare_amount', 'trip_distance', 'trip_duration']
            self.num_continuous_features += len(continuous_features)
            features_to_keep.extend(continuous_features)
        
        if time_features:
            # Create time features and add them to the keep list
            # TODO add time features to dataset
            taxi_data = self.add_time_features(taxi_data)
            time_features = ['pu_hour', 'pu_dayofweek', 'pu_month']
            self.num_integer_features = len(time_features)
            features_to_keep.extend(time_features)

        # SUBSET COLUMNS
        # Remove any columns that aren't sued for sorting or in the model
        taxi_data = taxi_data[features_to_keep]
        
        # DATA SCALING
        if self.use_alternate_scaler:
            self.count_scaler = alternate_scaler_source.count_scaler
            self.other_scaler = alternate_scaler_source.other_scaler
        else:
            self.count_scaler = MinMaxScaler()
            self.other_scaler = MinMaxScaler()
        
        taxi_data = self.fit_count_scaler(taxi_data)
        if continuous_features:
            taxi_data = self.fit_other_scaler(taxi_data)
    
        # Sort to prepare for splitting
        taxi_data = taxi_data.sort_values(["PULocationID", "pickup_datetime"], ascending=True)
        
        # Separate out the integer
        if self.num_integer_features:
            int_data = taxi_data.loc[:, time_features]
            self.integer_features = self.features_to_tensor(int_data)
            taxi_data = taxi_data.drop(time_features, axis=1)
            self.int_features_unique = {}
            for idx in range(self.num_integer_features):
                self.int_features_unique[idx] = len(self.integer_features[:, idx].unique())
        
        # Drop batch_number, PULocationID, pickup_datetime
        taxi_data = taxi_data.drop(["PULocationID", "pickup_datetime"], axis=1)
        self.continuous_features = self.features_to_tensor(taxi_data)

    def __len__(self):
        num_full_seqs = self.continuous_features.shape[0] // self.sequence_length
        return self.num_zones * num_full_seqs

    def __getitem__(self, idx):
        # 0 gets zone 0 item 0
        # 1 gets zone 1 item 0
        # Num_zones = batch_size
        # idx % (batch_size) = zone number
        batch_idx, zone_idx = divmod(idx, self.batch_size)
        cont_tensor = self.get_continuous_tensor(batch_idx, zone_idx)
        target = self.get_target_tensor(batch_idx, zone_idx)
        
        zone_id_tensor = torch.tensor([zone_idx]).to(device)

        if self.num_integer_features > 0:
            int_tensor = self.get_integer_tensor(batch_idx, zone_idx)
            return zone_id_tensor, cont_tensor, int_tensor, target
        else:
            return zone_id_tensor, cont_tensor, target

    def set_max_sequence_length(self):
        self.sequence_length = self.num_hours - 1

    def add_time_features(self, taxi_data):
        taxi_data['pu_hour'] = taxi_data['pickup_datetime'].dt.hour
        taxi_data['pu_hour'] = taxi_data['pu_hour'] - np.min(taxi_data['pu_hour'])
        taxi_data['pu_dayofweek'] = taxi_data['pickup_datetime'].dt.dayofweek
        taxi_data['pu_dayofweek'] = taxi_data['pu_dayofweek'] - np.min(taxi_data['pu_dayofweek'])
        taxi_data['pu_month'] = taxi_data['pickup_datetime'].dt.month
        taxi_data['pu_month'] = taxi_data['pu_month'] - np.min(taxi_data['pu_month'])
        return taxi_data

    def get_continuous_tensor(self, batch_idx, zone_idx):
        col_start = zone_idx * self.num_continuous_features
        col_end = (zone_idx + 1) * self.num_continuous_features
        row_start = batch_idx * self.sequence_length
        row_end = (batch_idx + 1) * self.sequence_length
        return self.continuous_features[row_start:row_end, col_start:col_end]
    
    def get_integer_tensor(self, batch_idx, zone_idx):
        col_start = zone_idx * self.num_integer_features
        col_end = (zone_idx + 1) * self.num_integer_features
        row_start = batch_idx * self.sequence_length
        row_end = (batch_idx + 1) * self.sequence_length
        # print("Col Start: ", col_start)
        # print("Col End  : ", col_end)
        # print("Row Start: ", row_start)
        # print("Row End  : ", row_end)
        return self.integer_features[row_start:row_end, col_start:col_end]
    
    def get_target_tensor(self, batch_idx, zone_idx):
        # TODO: Probably a lurking index out of bounds issue 
        # if data length % sequence_length == 0
        col_idx = zone_idx * self.num_continuous_features
        row_start = (batch_idx * self.sequence_length) + 1
        row_end = ((batch_idx + 1) * self.sequence_length) + 1
        return self.continuous_features[row_start:row_end, col_idx]

    def features_to_tensor(self, features: pd.DataFrame):
        result = features.to_numpy()
        # TODO: May need a type conversion
        # Split vertically into batches, then concat horizontally so 
        # time is along vertical axis and features are columns
        v_split_out = np.vsplit(result, self.batch_size)
        result = torch.tensor(np.hstack(v_split_out)).to(device)
        return result
    
    def fit_count_scaler(self, taxi_data: pd.DataFrame):
        value_cols = ["counts"]
        counts = taxi_data.pivot(columns="PULocationID", index="pickup_datetime", values=value_cols)
        counts_scaled = self.fit_scaler(self.count_scaler, counts)
        return taxi_data.drop(value_cols, axis=1).merge(counts_scaled, on=["PULocationID", "pickup_datetime"])

    def fit_other_scaler(self, taxi_data: pd.DataFrame):
        value_cols = ['tip_amount', 'fare_amount', 'trip_distance', 'trip_duration']
        other_vars = taxi_data.pivot(columns="PULocationID", index="pickup_datetime", values=value_cols)
        other_vars_scaled = self.fit_scaler(self.other_scaler, other_vars)
        return taxi_data.drop(value_cols, axis=1).merge(other_vars_scaled, on=["PULocationID", "pickup_datetime"])

    def fit_scaler(self, scaler: MinMaxScaler, pivoted_data: pd.DataFrame):
        # Time is along vertical axis
        # The pivot columns should be in order but not sure what guarantees that
        if self.use_alternate_scaler:
            mat = scaler.transform(pivoted_data)
        else:
            mat = scaler.fit_transform(pivoted_data)
        scaled = pd.DataFrame(mat)
        scaled.columns = pivoted_data.columns
        scaled.index = pivoted_data.index
        scaled = scaled.stack(future_stack=True).reset_index()
        return scaled


In [7]:
split_timestamp = '2023-10-19 14:00:00-0400'
def split_taxi_data_on_timestamp(taxi_dataset: pd.DataFrame, split_timestamp: str):
    train_set = taxi_dataset[taxi_dataset["pickup_datetime"] < split_timestamp].copy()
    validation_set = taxi_dataset[taxi_dataset["pickup_datetime"] >= split_timestamp].copy()
    return train_set, validation_set

def create_datasets(train_df, validation_df, sequence_length=24, continuous_features=False, time_features=False):
    train_set = TaxiDataset(train_df, sequence_length, continuous_features, time_features)
    validation_set = TaxiDataset(validation_df, sequence_length, continuous_features, time_features, use_alternate_scaler=True, alternate_scaler_source=train_set)
    return train_set, validation_set

def create_dataloaders(train_set, validation_set):
    train_loader = DataLoader(train_set, batch_size=train_set.batch_size, shuffle=False, drop_last=True)
    validation_loader = DataLoader(validation_set, batch_size=train_set.batch_size, shuffle=False, drop_last=True)
    return train_loader, validation_loader

def create_unbatched_loader(dataset: TaxiDataset):
    dataset.set_max_sequence_length

In [8]:
train_df, validation_df = split_taxi_data_on_timestamp(zero_taxi_df, split_timestamp)
train_set, validation_set = create_datasets(train_df, validation_df, sequence_length, continuous_features=True, time_features=True)
train_loader, validation_loader = create_dataloaders(train_set, validation_set)

In [10]:
test = zero_taxi_df.copy()
test['pu_hour'] = test['pickup_datetime'].dt.hour
test['pu_dayofweek'] = test['pickup_datetime'].dt.dayofweek
test['pu_month'] = test['pickup_datetime'].dt.month

In [11]:
test.sort_values(["PULocationID", "pickup_datetime"])

Unnamed: 0,PULocationID,pickup_datetime,counts,total_amount,tip_amount,fare_amount,trip_distance,passenger_count,trip_duration,pu_hour,pu_dayofweek,pu_month
0,4,2022-01-01 00:00:00-05:00,8,28.588137,2.320410,20.518707,3.363750,1.375000,14.481250,0,5,1
63,4,2022-01-01 01:00:00-05:00,9,28.869495,3.620874,19.499601,3.298889,1.888889,13.685184,1,5,1
126,4,2022-01-01 02:00:00-05:00,22,23.340609,3.562880,14.028709,2.087273,1.772727,9.458334,2,5,1
189,4,2022-01-01 03:00:00-05:00,5,24.288097,2.502337,16.036741,2.504000,1.600000,11.223333,3,5,1
252,4,2022-01-01 04:00:00-05:00,5,25.474213,0.756450,19.120031,2.632000,1.200000,11.460000,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1236248,263,2024-03-31 19:00:00-04:00,78,16.183334,2.029487,10.134615,1.624231,1.333333,7.688675,19,6,3
1236311,263,2024-03-31 20:00:00-04:00,70,18.672571,2.578286,11.130000,1.879571,1.428571,8.332857,20,6,3
1236374,263,2024-03-31 21:00:00-04:00,61,19.478851,2.141803,12.394426,1.955574,1.098361,8.524590,21,6,3
1236437,263,2024-03-31 22:00:00-04:00,26,19.896152,2.365385,12.530768,2.263846,1.153846,8.896795,22,6,3


## Models

In [15]:
class ModelType(Enum):
    RNN = 1
    LSTM = 2

In [16]:
# Baseline
class BaselineModel(nn.Module):
    savable = False
    def __init__(self):
        super(BaselineModel, self).__init__()
        self.fc = nn.Linear(3, 10)
    
    def forward(self, _, continuous: torch.tensor):
        constant_pred = continuous[:, :, 0].detach().clone()
        constant_pred.requires_grad_()
        return constant_pred
        

    def reset(self):
        pass

In [17]:
class MultiSeriesModel(nn.Module):
    savable = True
    def __init__(
            self, 
            zone_count, 
            model_type: ModelType, 
            batch_size: int, 
            input_size=1, 
            hidden_size=50, 
            output_size=1, 
            num_layers=1
        ):
        super(MultiSeriesModel, self).__init__()
        self.model_type = model_type
        self.embed_size = embed_size(zone_count)
        self.input_size = input_size
        
        self.zone_embed = nn.Embedding(zone_count, self.embed_size)
        
        if self.model_type == ModelType.RNN:
            self.cell = nn.RNN(self.embed_size + input_size, hidden_size, num_layers, batch_first=True)
            self.h = torch.zeros(num_layers, batch_size, hidden_size, device=device)
        elif self.model_type == ModelType.LSTM:
            self.cell = nn.LSTM(self.embed_size + input_size, hidden_size, num_layers, batch_first=True)
            self.h = [torch.zeros(num_layers, batch_size, hidden_size, device=device) for _ in range(2)]
        else:
            raise NotImplementedError("Model only supports RNN and LSTM")
        
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, zones, continuous):
        # Zones is (zone_count, 1), emb (zone_count, embed_size(zone_count))
        sequence_length = continuous.shape[-2]
        embed_result = self.zone_embed(zones).expand((-1, sequence_length, -1))
        x = torch.cat([embed_result, continuous], dim=-1)
        if self.model_type == ModelType.RNN:
            out, h = self.cell(x, self.h)
            self.h = h.detach()
        elif self.model_type == ModelType.LSTM:
            out, h = self.cell(x, self.h)
            self.h = [h_.detach() for h_ in h]
        else:
            raise NotImplementedError("Model only supports RNN and LSTM")
        
        out = self.fc(out)
        return out.squeeze()

    def reset(self):
        if type(self.h) is list:
            for h in self.h: 
                h.zero_()
        else:
            self.h.zero_()

    def get_model_name(self) -> str:
        if self.input_size > 1:
            return f"{self.model_type.name}_MultiSeriesModel"
        else:
            return f"{self.model_type.name}_SingleSeriesModel"


In [18]:
from typing import Dict


class MultiSeriesTimeModel(nn.Module):
    savable = True
    def __init__(
            self, 
            zone_count,
            int_features: Dict[int, int],  # index and number of distinct values
            model_type: ModelType, 
            batch_size: int, 
            input_size=1, 
            hidden_size=50, 
            output_size=1, 
            num_layers=1
        ):
        super(MultiSeriesTimeModel, self).__init__()
        self.model_type = model_type
        self.embed_size = embed_size(zone_count)
        self.input_size = input_size
        
        self.zone_embed = nn.Embedding(zone_count, self.embed_size)

        self.int_embeds = {}

        int_embed_size = 0
        for idx, num_unique in int_features.items():
            self.int_embeds[idx] = nn.Embedding(num_unique, embed_size(num_unique)).to(device)
            int_embed_size += embed_size(num_unique)


        recurrent_size = self.embed_size + input_size + int_embed_size
        if self.model_type == ModelType.RNN:
            self.cell = nn.RNN(recurrent_size, hidden_size, num_layers, batch_first=True)
            self.h = torch.zeros(num_layers, batch_size, hidden_size, device=device)
        elif self.model_type == ModelType.LSTM:
            self.cell = nn.LSTM(recurrent_size, hidden_size, num_layers, batch_first=True)
            self.h = [torch.zeros(num_layers, batch_size, hidden_size, device=device) for _ in range(2)]
        else:
            raise NotImplementedError("Model only supports RNN and LSTM")
        
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, zones, continuous, int_features):
        # Zones is (zone_count, 1), emb (zone_count, embed_size(zone_count))
        sequence_length = continuous.shape[-2]
        embed_result = self.zone_embed(zones).expand((-1, sequence_length, -1))

        int_embed_results = {}
        for idx, embed in self.int_embeds.items():
            int_embed_results[idx] = embed(int_features[:, :, idx])

        embed_result_arr = []
        for idx in sorted(list(int_embed_results.keys())):
            embed_result_arr.append(int_embed_results[idx])

        x = torch.cat(embed_result_arr + [embed_result, continuous], dim=-1)
        if self.model_type == ModelType.RNN:
            out, h = self.cell(x, self.h)
            self.h = h.detach()
        elif self.model_type == ModelType.LSTM:
            out, h = self.cell(x, self.h)
            self.h = [h_.detach() for h_ in h]
        else:
            raise NotImplementedError("Model only supports RNN and LSTM")
        
        out = self.fc(out)
        return out.squeeze()

    def reset(self):
        if type(self.h) is list:
            for h in self.h: 
                h.zero_()
        else:
            self.h.zero_()

    def get_model_name(self) -> str:
        if self.input_size > 1:
            return f"{self.model_type.name}_MultiSeriesTimeModel"
        else:
            return f"{self.model_type.name}_SingleSeriesTimeModel"

In [None]:
# base_model = BaselineModel().to(device)

In [None]:
# Multiple Variable

In [None]:
# Multiple and time series variables

## Fit Models

In [20]:
class TrainingLoop:
    NUM_EPOCHS = 1000
    MODEL_FOLDER = "../models/model_validation/"
    PRINT_EVERY = 10

    def __init__(self, model, train_loader, validation_loader):
        self.model: nn.Module = model
        self.optimizer: optim.Optimizer = optim.Adam(model.parameters(), lr=3e-3)
        self.criterion: nn.MSELoss = nn.MSELoss()

        self.train_loader = train_loader
        self.validation_loader = validation_loader

        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        self.min_valid_loss = float("inf")
        self.best_model_path = ""

        self.valid_loss_list = []
        self.consecutive_loss_increases = 0


    def train_epoch(self) -> float:
        self.model.train()
        epoch_train_loss = 0
        for *vars, target in self.train_loader:
            preds = self.model(*vars)
            train_loss = self.criterion(preds, target)
            
            self.optimizer.zero_grad()
            train_loss.backward()
            self.optimizer.step()

            epoch_train_loss += train_loss.item()
        self.model.reset()

        avg_train_loss = epoch_train_loss / len(self.train_loader)
        return avg_train_loss

    def validate_epoch(self) -> float:
        self.model.eval()
        epoch_valid_loss = 0
        with torch.no_grad():
            for *vars, target in self.validation_loader:
                preds = self.model(*vars)            
                valid_loss = self.criterion(preds, target)

                epoch_valid_loss += valid_loss.item()
        self.model.reset()
        
        avg_valid_loss = epoch_valid_loss / len(self.validation_loader)
        return avg_valid_loss
    
    def print_loss(self, epoch, train_loss, valid_loss):
        print(f'Epoch [{epoch + 1:04}/{self.NUM_EPOCHS}], Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')
    
    def fit_one_epoch(self):
        avg_train_loss = self.train_epoch()
        avg_valid_loss = self.validate_epoch()
        return avg_train_loss, avg_valid_loss
    
    def save_model(self, epoch) -> str:
        if self.model.savable:
            model_name = f"{self.model.get_model_name()}_{self.timestamp}_{epoch}"
            model_path = self.MODEL_FOLDER + model_name
            torch.save(self.model.state_dict(), model_path)
            return model_path

    def handle_early_stopping(self, epoch) -> bool:
        stop = False
        if self.valid_loss_list[-1] < self.min_valid_loss:
            # Save model
            self.min_valid_loss = self.valid_loss_list[-1]
            self.best_model_path = self.save_model(epoch)
            # Reset count
            self.consecutive_loss_increases = 0
        else:
            self.consecutive_loss_increases += 1
            if self.consecutive_loss_increases >= 20:
                stop = True

        return stop

    def train(self):
        for epoch in range(self.NUM_EPOCHS):
            avg_train_loss, avg_valid_loss = self.fit_one_epoch()

            if (epoch + 1) % self.PRINT_EVERY == 0:
                self.print_loss(epoch, avg_train_loss, avg_valid_loss)

            ## Early Stopping: check if the error went up or down
            self.valid_loss_list.append(avg_valid_loss)
            stop = self.handle_early_stopping(epoch)
            if stop or not self.model.savable:
                break
        if self.model.savable:
            self.model.load_state_dict(torch.load(self.best_model_path, weights_only=True))

In [26]:
# Better than baseline !!!
# 0.08
# np.mean(np.power(np.array(train_set.continuous_features.cpu()[:-1, :] - train_set.continuous_features.cpu()[1:, :]), 2))

## Validate Models

In [27]:
class Validator:
    def __init__(
            self,
            model: nn.Module,
            train_set: TaxiDataset,
            validation_set: TaxiDataset
        ) -> None:
        validation_set.set_max_sequence_length()
        self.train_set = train_set
        self.validation_set = validation_set
        self.validation_loader = DataLoader(validation_set, validation_set.batch_size, shuffle=False)
        
        self.model = model

        # TODO: Validation set needs to be scaled by train set scalers
        
    def get_preds(self):
        self.model.eval()
        with torch.no_grad():
            for *vars, target in self.validation_loader:
                preds = self.model(*vars)
        self.model.reset()
        # Transpose so that time is along vertical
        preds = preds.cpu().detach().numpy().T
        target = target.cpu().detach().numpy().T
        
        return preds, target
    
    def unscale_preds_and_target(self, scaled_preds, scaled_target):
        preds = self.train_set.count_scaler.inverse_transform(scaled_preds)
        target = self.validation_set.count_scaler.inverse_transform(scaled_target)
        return preds, target

    def mse_loss(self, preds, target):
        # mse_loss = nn.MSELoss()
        # scaled_loss = mse_loss(preds, target)
        # print(type(scaled_loss))
        # return scaled_loss.item()
        return ((preds - target) ** 2).mean()

    def smape_loss(self, preds, target):
        # TODO: Eliminiate runtime warning when denominator == 0
        smape_mat = (2 * np.abs(preds - target)) / (np.abs(preds) + np.abs(target))
        smape_mat = np.mean(np.nan_to_num(smape_mat))
        return smape_mat

    def validate(self):
        scaled_preds, scaled_target = self.get_preds()
        preds, target = self.unscale_preds_and_target(scaled_preds, scaled_target)
        
        self.preds = preds
        self.target = target
        self.scaled_mse_loss = self.mse_loss(scaled_preds, scaled_target)
        self.unscaled_mse_loss = self.mse_loss(preds, target)
        self.smape_loss = self.smape_loss(preds, target)

    def print_results(self):
        print(f"Scaled MSE: {self.scaled_mse_loss:.4f}, Unscaled MSE: {self.unscaled_mse_loss}, sMAPE: {self.smape_loss}")
    
    # TODO: Some kind of plot support

In [28]:
class ModelContainer:
    # Act as a container for all of the other models
    def __init__(self, taxi_df, model_type, sequence_length, continuous_features, time_features) -> None:
        # Configuration options
        self.continuous_features = continuous_features
        self.time_features = time_features

        self.data = taxi_df
        self.model_type = model_type
        self.sequence_length = sequence_length

        # Setup
        self.create_datasets()
        self.setup_model()

    def run(self):
        self.train_model()
        self.validate_model()

    def create_datasets(self):
        train_df, validation_df = split_taxi_data_on_timestamp(self.data, split_timestamp)
        train_set, validation_set = create_datasets(
            train_df, validation_df, self.sequence_length, 
            self.continuous_features, self.time_features)

        self.train_set = train_set

        train_loader, validation_loader = create_dataloaders(train_set, validation_set)

        self.train_loader = train_loader
        self.validation_loader = validation_loader

        self.validation_set_for_validation = TaxiDataset(validation_df, self.sequence_length, self.continuous_features, self.time_features)

    def setup_model(self):
        if self.model_type == 'rnn':
            model_type = ModelType.RNN
        elif self.model_type == 'lstm':
            model_type = ModelType.LSTM
        else:
            raise ValueError(f"mode_type {self.model_type} not supported")
        

        if self.model_type == 'baseline':
            self.model = BaselineModel()
        elif self.time_features:
            self.model = MultiSeriesTimeModel(train_set.num_zones, train_set.int_features_unique,  model_type, train_set.batch_size, self.train_set.num_continuous_features).to(device)
        else:
            self.model = MultiSeriesModel(train_set.num_zones, model_type, train_set.batch_size, self.train_set.num_continuous_features).to(device)

    def train_model(self):
        loop = TrainingLoop(self.model, self.train_loader, self.validation_loader)
        loop.train()
        
    def validate_model(self):
        valid = Validator(self.model, self.train_set, self.validation_set_for_validation)
        valid.validate()
        valid.print_results()
    
    # Dataset generation
    # Create the Model
    # Fit the model
    # Validate the model

In [43]:
# Varies by 10
test = ModelContainer(mean_taxi_df, 'lstm', 96, True, True)
test.run()

Epoch [0010/1000], Train Loss: 0.0057, Validation Loss: 0.0103
Epoch [0020/1000], Train Loss: 0.0051, Validation Loss: 0.0057
Epoch [0030/1000], Train Loss: 0.0042, Validation Loss: 0.0056
Epoch [0040/1000], Train Loss: 0.0042, Validation Loss: 0.0059
Epoch [0050/1000], Train Loss: 0.0038, Validation Loss: 0.0042
Epoch [0060/1000], Train Loss: 0.0038, Validation Loss: 0.0044
Epoch [0070/1000], Train Loss: 0.0037, Validation Loss: 0.0042
Epoch [0080/1000], Train Loss: 0.0036, Validation Loss: 0.0041
Epoch [0090/1000], Train Loss: 0.0036, Validation Loss: 0.0040
Epoch [0100/1000], Train Loss: 0.0035, Validation Loss: 0.0040
Epoch [0110/1000], Train Loss: 0.0035, Validation Loss: 0.0040
Epoch [0120/1000], Train Loss: 0.0035, Validation Loss: 0.0039
Epoch [0130/1000], Train Loss: 0.0035, Validation Loss: 0.0039
Epoch [0140/1000], Train Loss: 0.0034, Validation Loss: 0.0039
Epoch [0150/1000], Train Loss: 0.0034, Validation Loss: 0.0040
Scaled MSE: 0.0051, Unscaled MSE: 271.08770751953125, s

In [44]:
test = ModelContainer(mean_taxi_df, 'lstm', 96, True, False)
test.run()

Epoch [0010/1000], Train Loss: 0.0050, Validation Loss: 0.0050
Epoch [0020/1000], Train Loss: 0.0046, Validation Loss: 0.0046
Epoch [0030/1000], Train Loss: 0.0045, Validation Loss: 0.0045
Epoch [0040/1000], Train Loss: 0.0044, Validation Loss: 0.0044
Epoch [0050/1000], Train Loss: 0.0042, Validation Loss: 0.0043
Epoch [0060/1000], Train Loss: 0.0041, Validation Loss: 0.0042
Epoch [0070/1000], Train Loss: 0.0041, Validation Loss: 0.0042
Epoch [0080/1000], Train Loss: 0.0040, Validation Loss: 0.0041
Epoch [0090/1000], Train Loss: 0.0040, Validation Loss: 0.0041
Epoch [0100/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0110/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0120/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0130/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0140/1000], Train Loss: 0.0038, Validation Loss: 0.0040
Epoch [0150/1000], Train Loss: 0.0038, Validation Loss: 0.0040
Epoch [0160/1000], Train Loss: 0.0038, Validation Loss:

In [45]:
test = ModelContainer(mean_taxi_df, 'lstm', 96, False, True)
test.run()

Epoch [0010/1000], Train Loss: 0.0053, Validation Loss: 0.0075
Epoch [0020/1000], Train Loss: 0.0044, Validation Loss: 0.0051
Epoch [0030/1000], Train Loss: 0.0040, Validation Loss: 0.0046
Epoch [0040/1000], Train Loss: 0.0039, Validation Loss: 0.0046
Epoch [0050/1000], Train Loss: 0.0038, Validation Loss: 0.0044
Epoch [0060/1000], Train Loss: 0.0037, Validation Loss: 0.0043
Epoch [0070/1000], Train Loss: 0.0036, Validation Loss: 0.0043
Epoch [0080/1000], Train Loss: 0.0036, Validation Loss: 0.0042
Epoch [0090/1000], Train Loss: 0.0036, Validation Loss: 0.0042
Epoch [0100/1000], Train Loss: 0.0035, Validation Loss: 0.0041
Epoch [0110/1000], Train Loss: 0.0035, Validation Loss: 0.0041
Epoch [0120/1000], Train Loss: 0.0034, Validation Loss: 0.0041
Epoch [0130/1000], Train Loss: 0.0034, Validation Loss: 0.0041
Epoch [0140/1000], Train Loss: 0.0034, Validation Loss: 0.0040
Epoch [0150/1000], Train Loss: 0.0034, Validation Loss: 0.0040
Epoch [0160/1000], Train Loss: 0.0034, Validation Loss:

In [46]:
test = ModelContainer(mean_taxi_df, 'lstm', 96, False, False)
test.run()

Epoch [0010/1000], Train Loss: 0.0050, Validation Loss: 0.0049
Epoch [0020/1000], Train Loss: 0.0046, Validation Loss: 0.0046
Epoch [0030/1000], Train Loss: 0.0044, Validation Loss: 0.0045
Epoch [0040/1000], Train Loss: 0.0043, Validation Loss: 0.0044
Epoch [0050/1000], Train Loss: 0.0042, Validation Loss: 0.0043
Epoch [0060/1000], Train Loss: 0.0041, Validation Loss: 0.0042
Epoch [0070/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0080/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0090/1000], Train Loss: 0.0040, Validation Loss: 0.0041
Epoch [0100/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0110/1000], Train Loss: 0.0039, Validation Loss: 0.0042
Scaled MSE: 0.0052, Unscaled MSE: 274.9544982910156, sMAPE: 0.5505231618881226


In [None]:
test1 = ModelContainer(mean_taxi_df, 'lstm', 1000)
test1.run()

Epoch [0010/1000], Train Loss: 0.0076, Validation Loss: 0.0075
Epoch [0020/1000], Train Loss: 0.0067, Validation Loss: 0.0066
Epoch [0030/1000], Train Loss: 0.0057, Validation Loss: 0.0056
Epoch [0040/1000], Train Loss: 0.0054, Validation Loss: 0.0054
Epoch [0050/1000], Train Loss: 0.0052, Validation Loss: 0.0052
Epoch [0060/1000], Train Loss: 0.0052, Validation Loss: 0.0051
Epoch [0070/1000], Train Loss: 0.0050, Validation Loss: 0.0050
Epoch [0080/1000], Train Loss: 0.0049, Validation Loss: 0.0050
Epoch [0090/1000], Train Loss: 0.0049, Validation Loss: 0.0050
Epoch [0100/1000], Train Loss: 0.0050, Validation Loss: 0.0049
Epoch [0110/1000], Train Loss: 0.0048, Validation Loss: 0.0049
Epoch [0120/1000], Train Loss: 0.0048, Validation Loss: 0.0048
Epoch [0130/1000], Train Loss: 0.0047, Validation Loss: 0.0048
Epoch [0140/1000], Train Loss: 0.0047, Validation Loss: 0.0047
Epoch [0150/1000], Train Loss: 0.0046, Validation Loss: 0.0047
Epoch [0160/1000], Train Loss: 0.0046, Validation Loss:

In [None]:
test2 = ModelContainer(zero_taxi_df, 'lstm', 1000)
test2.run()

Epoch [0010/1000], Train Loss: 0.0075, Validation Loss: 0.0074
Epoch [0020/1000], Train Loss: 0.0062, Validation Loss: 0.0060
Epoch [0030/1000], Train Loss: 0.0056, Validation Loss: 0.0055
Epoch [0040/1000], Train Loss: 0.0053, Validation Loss: 0.0053
Epoch [0050/1000], Train Loss: 0.0052, Validation Loss: 0.0052
Epoch [0060/1000], Train Loss: 0.0051, Validation Loss: 0.0051
Epoch [0070/1000], Train Loss: 0.0050, Validation Loss: 0.0050
Epoch [0080/1000], Train Loss: 0.0049, Validation Loss: 0.0050
Epoch [0090/1000], Train Loss: 0.0049, Validation Loss: 0.0049
Epoch [0100/1000], Train Loss: 0.0048, Validation Loss: 0.0049
Epoch [0110/1000], Train Loss: 0.0047, Validation Loss: 0.0048
Epoch [0120/1000], Train Loss: 0.0047, Validation Loss: 0.0048
Epoch [0130/1000], Train Loss: 0.0047, Validation Loss: 0.0047
Epoch [0140/1000], Train Loss: 0.0046, Validation Loss: 0.0047
Epoch [0150/1000], Train Loss: 0.0046, Validation Loss: 0.0046
Epoch [0160/1000], Train Loss: 0.0046, Validation Loss:

In [None]:
def create_train_and_validation_sets(taxi_df, split_timestamp):
    train_df, validation_df = split_taxi_data_on_timestamp(taxi_df, split_timestamp)
    train_set, validation_set = create_datasets(train_df, validation_df, sequence_length, continuous_features=True)
    return train_set, validation_set

def setup_models(train_set: TaxiDataset):
    baseline = BaselineModel()
    rnn = MultiSeriesModel(train_set.num_zones, ModelType.RNN, train_set.batch_size, train_set.num_continuous_features).to(device)
    lstm = MultiSeriesModel(train_set.num_zones, ModelType.LSTM, train_set.batch_size, train_set.num_continuous_features).to(device)
    models = [baseline, rnn, lstm]
    return models

In [None]:
def create_models(train_set):
    models = setup_models(train_set)
    return models


def train_model(model, train_loader, validation_loader):
    # The models should have the best parameters by the end
    loop = TrainingLoop(model, train_loader, validation_loader)
    loop.train()
    return loop


def validate_models(model, train_set, taxi_df, split_timestamp, continuous_features):
    _, validation_df = split_taxi_data_on_timestamp(taxi_df, split_timestamp)
    validation_set = TaxiDataset(validation_df, sequence_length, continuous_features)
    valid = Validator(model, train_set, validation_set)
    valid.validate()
    valid.print_results()

In [None]:
train_set, validation_set = create_train_and_validation_sets(mean_taxi_df, split_timestamp)
train_loader, validation_loader = create_dataloaders(train_set, validation_set)
models = create_models(train_set)

In [None]:
baseline = BaselineModel().to(device)
rnn = MultiSeriesModel(train_set.num_zones, ModelType.RNN, train_set.batch_size, train_set.num_continuous_features).to(device)
lstm = MultiSeriesModel(train_set.num_zones, ModelType.LSTM, train_set.batch_size, train_set.num_continuous_features).to(device)

In [None]:
train_set_single.num_continuous_features

1

In [None]:
train_df, validation_df = split_taxi_data_on_timestamp(mean_taxi_df, split_timestamp)
train_set_single, validation_set_single = create_datasets(train_df, validation_df, sequence_length)
train_loader_single, validation_loader_single = create_dataloaders(train_set_single, validation_set_single)
lstm_single = MultiSeriesModel(train_set.num_zones, ModelType.LSTM, train_set_single.batch_size, train_set_single.num_continuous_features).to(device)

In [None]:
lstm_single_loop = train_model(lstm_single, train_loader_single, validation_loader_single)

Epoch [0010/1000], Train Loss: 0.0049, Validation Loss: 0.0049
Epoch [0020/1000], Train Loss: 0.0046, Validation Loss: 0.0046
Epoch [0030/1000], Train Loss: 0.0044, Validation Loss: 0.0045
Epoch [0040/1000], Train Loss: 0.0043, Validation Loss: 0.0044
Epoch [0050/1000], Train Loss: 0.0041, Validation Loss: 0.0043
Epoch [0060/1000], Train Loss: 0.0041, Validation Loss: 0.0042
Epoch [0070/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0080/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0090/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0100/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0110/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0120/1000], Train Loss: 0.0039, Validation Loss: 0.0042
Epoch [0130/1000], Train Loss: 0.0038, Validation Loss: 0.0041


In [None]:
baseline_loop = train_model(baseline, train_loader, validation_loader)

In [None]:
validate_models(baseline, train_set_single, mean_taxi_df, split_timestamp, continuous_features=False)

In [None]:
rnn_loop = train_model(rnn, train_loader, validation_loader)

Epoch [0010/1000], Train Loss: 0.0057, Validation Loss: 0.0058
Epoch [0020/1000], Train Loss: 0.0052, Validation Loss: 0.0054
Epoch [0030/1000], Train Loss: 0.0049, Validation Loss: 0.0050
Epoch [0040/1000], Train Loss: 0.0047, Validation Loss: 0.0049
Epoch [0050/1000], Train Loss: 0.0046, Validation Loss: 0.0049
Epoch [0060/1000], Train Loss: 0.0046, Validation Loss: 0.0048
Epoch [0070/1000], Train Loss: 0.0045, Validation Loss: 0.0048
Epoch [0080/1000], Train Loss: 0.0045, Validation Loss: 0.0047
Epoch [0090/1000], Train Loss: 0.0044, Validation Loss: 0.0047
Epoch [0100/1000], Train Loss: 0.0044, Validation Loss: 0.0046
Epoch [0110/1000], Train Loss: 0.0044, Validation Loss: 0.0048
Epoch [0120/1000], Train Loss: 0.0044, Validation Loss: 0.0045
Epoch [0130/1000], Train Loss: 0.0044, Validation Loss: 0.0046
Epoch [0140/1000], Train Loss: 0.0044, Validation Loss: 0.0045
Epoch [0150/1000], Train Loss: 0.0044, Validation Loss: 0.0045
Epoch [0160/1000], Train Loss: 0.0045, Validation Loss:

In [None]:
lstm_loop = train_model(lstm, train_loader, validation_loader)

Epoch [0010/1000], Train Loss: 0.0049, Validation Loss: 0.0049
Epoch [0020/1000], Train Loss: 0.0046, Validation Loss: 0.0046
Epoch [0030/1000], Train Loss: 0.0044, Validation Loss: 0.0044
Epoch [0040/1000], Train Loss: 0.0043, Validation Loss: 0.0043
Epoch [0050/1000], Train Loss: 0.0042, Validation Loss: 0.0043
Epoch [0060/1000], Train Loss: 0.0041, Validation Loss: 0.0042
Epoch [0070/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0080/1000], Train Loss: 0.0040, Validation Loss: 0.0042
Epoch [0090/1000], Train Loss: 0.0039, Validation Loss: 0.0042
Epoch [0100/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0110/1000], Train Loss: 0.0039, Validation Loss: 0.0041
Epoch [0120/1000], Train Loss: 0.0038, Validation Loss: 0.0041
Epoch [0130/1000], Train Loss: 0.0038, Validation Loss: 0.0041
Epoch [0140/1000], Train Loss: 0.0038, Validation Loss: 0.0041
Epoch [0150/1000], Train Loss: 0.0038, Validation Loss: 0.0040
Epoch [0160/1000], Train Loss: 0.0038, Validation Loss:

In [None]:
lstm_loop = train_model(lstm, train_loader, validation_loader)

In [None]:
validate_models(lstm_single, train_set_single, mean_taxi_df, split_timestamp, continuous_features=False)

Scaled MSE: 0.0051, Unscaled MSE: 274.3553771972656, sMAPE: 0.5782305598258972


In [None]:
validate_models(rnn, train_set, mean_taxi_df, split_timestamp, continuous_features=True)

Scaled MSE: 0.0063, Unscaled MSE: 390.16009521484375, sMAPE: 0.6303727626800537


In [None]:
validate_models(lstm, train_set, mean_taxi_df, split_timestamp, continuous_features=True)

Scaled MSE: 0.0058, Unscaled MSE: 342.6379699707031, sMAPE: 0.5724579691886902


In [None]:

validation_set = TaxiDataset(validation_df, sequence_length, continuous_features)
valid = Validator(model, train_set, validation_set)
valid.validate()
valid.print_results()