In [1]:
import math
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

# IMPORT DATA

In [2]:
# Dataset selection: flip the flag to switch between the two CSV sources.
IS_IOWA_DATASET = True  # iowa dataset : true, simulation : false

IOWA_PATH = '../../datasets/train_data_iowa.csv'
SIMULATION_PATH = '../../datasets/datensatz_emre.csv'

if IS_IOWA_DATASET:
    CSV_PATH = IOWA_PATH
else:
    CSV_PATH = SIMULATION_PATH

# Columns that are excluded while the CSV is read.
cols_to_skip = ["simulated_etd", "restaurant_name", "vehicle_name"]

In [3]:
import re

# Column-name patterns whose matches are excluded from the load. re.match anchors
# each pattern at the start of the column name.
# NOTE: this cell previously also dropped on the empty pattern r"", which matches
# every name and therefore removed all remaining columns. That pattern is not used.
drop_patterns = [
    r"[a-zA-Z_]+_[xy]_?\d?\d?",                                 # xy_pos columns
    r"vehicle_[0-9]+_[a-zA-Z_]+_[0-9]+",                        # every vehicle_i_*_j column
    # r"vehicle_total_route[a-zA-Z0-9_]+",                      # vehicle_total_route_* (currently kept)
    r"vehicle_route_to_customer_(pos|action|time_action)_23",   # noted as NaN-only in this notebook
]

# Read a single row only to learn the column names and the dtypes pandas infers.
meta = pd.read_csv(CSV_PATH, header=0, sep=';', usecols=lambda x: x not in cols_to_skip, nrows=1)

# Keep only the columns that match none of the drop patterns.
s = meta.dtypes
s = s[[not any(re.match(pattern, name) for pattern in drop_patterns) for name in s.index]]

# Of the remaining columns, load only those inferred as int64, and read them as int16.
columns = []
dtypes = {}
for name, dtype in s.items():
    if dtype == "int64":
        columns.append(name)
        dtypes[name] = "int16"

data = pd.read_csv(CSV_PATH, header=0, sep=";", usecols=columns, dtype=dtypes)
# Downcast every column to the smallest signed integer dtype that can hold its values.
data = data.apply(lambda x: pd.to_numeric(x, errors='raise', downcast='signed'))

pd.set_option("display.max_columns", len(data.columns))
data

Unnamed: 0,location,order_time,atd,etd,restaurant_location,restaurant_queue,max_pre_shift,max_post_shift,restaurants_before_customer,customers_before_customer,len_vehicle_route_to_customer,len_vehicle_route_total,vehicle_route_to_customer_pos_0,vehicle_route_to_customer_action_0,vehicle_route_to_customer_time_action_0,vehicle_route_to_customer_pos_1,vehicle_route_to_customer_action_1,vehicle_route_to_customer_time_action_1,vehicle_route_to_customer_pos_2,vehicle_route_to_customer_action_2,vehicle_route_to_customer_time_action_2,vehicle_route_to_customer_pos_3,vehicle_route_to_customer_action_3,vehicle_route_to_customer_time_action_3,vehicle_route_to_customer_pos_4,vehicle_route_to_customer_action_4,vehicle_route_to_customer_time_action_4,vehicle_route_to_customer_pos_5,vehicle_route_to_customer_action_5,vehicle_route_to_customer_time_action_5,vehicle_route_to_customer_pos_6,vehicle_route_to_customer_action_6,vehicle_route_to_customer_time_action_6,vehicle_route_to_customer_pos_7,vehicle_route_to_customer_action_7,vehicle_route_to_customer_time_action_7,vehicle_route_to_customer_pos_8,vehicle_route_to_customer_action_8,vehicle_route_to_customer_time_action_8,vehicle_route_to_customer_pos_9,vehicle_route_to_customer_action_9,vehicle_route_to_customer_time_action_9,vehicle_route_to_customer_pos_10,vehicle_route_to_customer_action_10,vehicle_route_to_customer_time_action_10,vehicle_route_to_customer_pos_11,vehicle_route_to_customer_action_11,vehicle_route_to_customer_time_action_11,vehicle_route_to_customer_pos_12,vehicle_route_to_customer_action_12,vehicle_route_to_customer_time_action_12,vehicle_route_to_customer_pos_13,vehicle_route_to_customer_action_13,vehicle_route_to_customer_time_action_13,vehicle_route_to_customer_pos_14,vehicle_route_to_customer_action_14,vehicle_route_to_customer_time_action_14,vehicle_route_to_customer_pos_15,vehicle_route_to_customer_action_15,vehicle_route_to_customer_time_action_15,vehicle_route_to_customer_pos_16,vehicle_route_to_custome
r_action_16,vehicle_route_to_customer_time_action_16,vehicle_route_to_customer_pos_17,vehicle_route_to_customer_action_17,vehicle_route_to_customer_time_action_17,vehicle_route_to_customer_pos_18,vehicle_route_to_customer_action_18,vehicle_route_to_customer_time_action_18,vehicle_route_to_customer_pos_19,vehicle_route_to_customer_action_19,vehicle_route_to_customer_time_action_19,vehicle_route_to_customer_pos_20,vehicle_route_to_customer_action_20,vehicle_route_to_customer_time_action_20,vehicle_route_to_customer_pos_21,vehicle_route_to_customer_action_21,vehicle_route_to_customer_time_action_21,vehicle_route_to_customer_pos_22,vehicle_route_to_customer_action_22,vehicle_route_to_customer_time_action_22,len_vehicle_route_0,len_vehicle_route_1,len_vehicle_route_2,len_vehicle_route_3,len_vehicle_route_4,len_vehicle_route_5,len_vehicle_route_6,len_vehicle_route_7,len_vehicle_route_8,len_vehicle_route_9,len_vehicle_route_10,len_vehicle_route_11,len_vehicle_route_12,len_vehicle_route_13,restaurant_0_queue,restaurant_1_queue,restaurant_2_queue,restaurant_3_queue,restaurant_4_queue,restaurant_5_queue,restaurant_6_queue,restaurant_7_queue,restaurant_8_queue,restaurant_9_queue,restaurant_10_queue,restaurant_11_queue,restaurant_12_queue,restaurant_13_queue,restaurant_14_queue
0,2097,567,583,584,2602,8,0,0,1,1,4,4,2602,1,4,2602,3,5,2097,2,5,2097,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0
1,997,587,613,610,976,8,0,0,1,1,4,4,976,1,6,976,3,3,997,2,11,997,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0
2,1857,602,623,623,2602,8,0,0,1,1,4,4,2602,1,5,2602,3,4,1857,2,9,1857,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0
3,1857,617,641,639,2345,8,0,0,1,1,4,4,2345,1,4,2345,3,5,1857,2,10,1857,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8
4,1563,618,646,645,1665,8,0,0,1,1,4,4,1665,1,8,1665,3,3,1563,2,13,1563,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,1,1,1,1,1,1,1,1,4,1,1,8,0,0,0,0,0,0,0,0,0,0,0,0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850464,2788,1191,1224,1225,2345,8,11,0,1,3,8,9,1788,4,3,1724,2,1,1724,4,3,2920,2,10,2920,4,3,2345,1,6,2345,3,3,2788,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,10,10,6,8,9,8,11,9,3,11,8,9,8,0,0,0,2,0,1,-3,-6,10,-5,5,0,0,8
850465,1249,1200,1246,1246,2548,8,9,0,1,2,7,8,1094,2,8,1094,4,3,2677,2,14,2677,4,3,2548,1,2,2548,3,3,1249,2,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,6,6,8,7,6,8,6,9,6,2,9,6,6,-1,0,0,0,0,0,0,8,0,0,0,0,0,0,0
850466,617,1202,1244,1242,1361,8,12,0,1,2,7,8,1565,2,5,1565,4,3,1361,1,8,1361,3,3,1502,2,8,1502,4,3,617,2,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6,5,7,8,5,7,6,8,5,2,8,5,6,0,0,0,0,0,0,0,6,0,8,0,0,0,0,0
850467,2586,1207,1246,1247,1361,11,14,0,1,1,4,5,964,4,3,1361,1,11,1361,3,3,2586,2,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,5,4,6,7,6,4,5,6,7,4,7,4,4,0,0,0,0,0,0,0,1,0,11,0,0,0,0,0


# DATA PREPROCESSING

In [None]:
# Missing values? Count the NaN entries over the whole frame.
np.isnan(data.to_numpy()).sum()

In [None]:
# Encoding categorical variables ("insertion index")
# --> Later, once Florentin's features are crafted

In [None]:
# Outlier Detection (by means of KDE probably)

In [None]:
# Data description
def plot_histogram(x):
    """Draw and show a semi-transparent gray histogram of ``x``.

    Args:
        x: values passed to ``plt.hist``; must expose a ``name`` attribute,
            which is used in the plot title.
    """
    plt.hist(x, alpha=0.5, color='gray')

    # Label the axes that plt.hist just drew on.
    axes = plt.gca()
    axes.set_title(f"Histogram of {x.name}")
    axes.set_xlabel("Value")
    axes.set_ylabel("Frequency")

    plt.show()

# Feature engineering

Features used in Hildebrandt et al. (2020):
<ul>
    <li>n_stops: sum(vehicle_route_to_customer_action_i = 1 or 2)</li>
    <li>n_pickup_stops: sum(vehicle_route_to_customer_action_i = 1)</li>
    <li>n_delivery_stops: sum(vehicle_route_to_customer_action_i = 2)</li>
    <li>max_pre_shift: already given</li>    
    <li>max_post_shift: already given</li>
    <li>prep_time: sum(v_r_t_c_time_action_*) where v_r_t_c_action_i = 3 and v_r_t_c_pos_j == restaurant_location</li>
    <li>order_time: already given</li>
    <li>eta_pom: already given</li>
    <li>customer_location: already given</li>
    <li>restaurant_location: already given</li>
</ul>
A couple more ideas (brainstorming):
<ul>
    <li>Split up eta_pom (i.e. into the sum of estimated waiting times and the delivery times; not anticipative)</li>
    <li>

In [9]:
#Define strings to identify needed columns for each feature we want to craft
query_strings = {
    "n_stops" : ["vehicle_route_to_customer_action"],

    "n_pickup_stops" : ["vehicle_route_to_customer_action"],

    "n_delivery_stops" : ["vehicle_route_to_customer_action"],

    "prep_time" : ["vehicle_route_to_customer_time_action",
                   "vehicle_route_to_customer_action",
                   "order_time", "restaurant_location", "vehicle_route_to_customer_pos"]
}

raw_feats = ["location", "restaurant_location", "etd", "atd", "order_time", "max_pre_shift", "max_post_shift", "restaurant_queue"]

# Element-wise condition that marks a route entry as counting towards each feature.
stop_conditions = {
    "n_stops": lambda actions: (actions > 0) & (actions < 3),
    "n_pickup_stops": lambda actions: actions == 1,
    "n_delivery_stops": lambda actions: actions == 2,
}

# First, add used raw features to feats
feats = data[raw_feats].copy()

# Holds the boolean frame of the most recently crafted count feature.
mask = pd.DataFrame()

# Craft features and add to feats
for key, value in query_strings.items():
    if key not in stop_conditions:
        # TODO: "prep_time" is declared in query_strings but has no implementation yet.
        continue

    # A column is needed when its name contains any of the feature's query strings.
    needed_columns = [col for col in data.columns if any(x in col for x in value)]

    # Evaluate the condition on all needed columns at once and count the hits per
    # row a single time, instead of re-summing after every individual column.
    mask = stop_conditions[key](data[needed_columns])
    feats[key] = mask.sum(axis=1)

In [10]:
# Persist the crafted feature table as a semicolon-separated CSV (row index included).
feats.to_csv(path_or_buf="crafted_features.csv", sep=";")

In [11]:
# Reload the crafted features; the first CSV column is used as the row index.
crafted_features = pd.read_csv(
    "crafted_features.csv",
    sep=";",
    index_col=[0],
)
crafted_features

Unnamed: 0,location,restaurant_location,etd,atd,order_time,max_pre_shift,max_post_shift,restaurant_queue,n_stops,n_pickup_stops,n_delivery_stops
0,2097,2602,584,583,567,0,0,8,2,1,1
1,997,976,610,613,587,0,0,8,2,1,1
2,1857,2602,623,623,602,0,0,8,2,1,1
3,1857,2345,639,641,617,0,0,8,2,1,1
4,1563,1665,645,646,618,0,0,8,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...
850464,2788,2345,1225,1224,1191,11,0,8,4,1,3
850465,1249,2548,1246,1246,1200,9,0,8,4,1,3
850466,617,1361,1242,1244,1202,12,0,8,4,1,3
850467,2586,1361,1247,1246,1207,14,0,11,2,1,1


# ENSEMBLE LEARNING

# Train models on both datasets (raw vs. crafted)

## Split data

In [12]:
# Fixed seed so that both train/test partitions are reproducible across kernel restarts.
SPLIT_SEED = 42

# Raw set: every loaded column except 'atd' is a feature; the target is atd - etd.
X = data.loc[:, data.columns != 'atd']
y = data['atd'] - data['etd']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SPLIT_SEED
)

# Crafted set: same target definition, computed from the crafted feature table.
X_crafted = crafted_features.loc[:, crafted_features.columns != 'atd']
y_crafted = crafted_features['atd'] - crafted_features['etd']

# Same seed as above; assuming both frames have the same number of rows, this
# gives the same row partition for the raw and crafted sets — TODO confirm.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_crafted, y_crafted, train_size=0.8, random_state=SPLIT_SEED
)

In [13]:
# Settings shared by every active LightGBM configuration below.
_common_lgbm_params = {
    "objective": "regression",
    "random_state": 42,
    "metrics": "l2",
}

# One LightGBM parameter set per model, keyed by a model name.
params = {
    "lgbm_rf": dict(
        _common_lgbm_params,
        boosting_type="rf",
        learning_rate=0.0005,
        bagging_freq=10,
        bagging_fraction=0.8,
    ),

    "lgbm_gbdt": dict(
        _common_lgbm_params,
        boosting_type="gbdt",
        learning_rate=0.05,
        num_leaves=20,
    ),

    # Disabled configurations, kept for reference:
    #"lgbm_goss" : {
    #    "boosting_type" : "goss",
    #    "objective" : "regression",
    #     "n_estimators" : 500,
    #    "learning_rate" : 0.05,
    #    "random_state" : 42,
    #    'metric' : 'l2'
    #},
    #"lgbm_dart" : {
    #    "boosting_type" : "dart",
    #    "objective" : "regression",
    #     "n_estimators" : 500,
    #    "learning_rate" : 0.05,
    #    "random_state" : 42,
    #    'metric' : 'l2'
    #}
}

## Train models on raw set

In [16]:
# Cross-validate every configured LightGBM model on the raw feature set.
raw_set = lgb.Dataset(X, y)

# Keyword arguments that are identical for every lgb.cv call in this cell.
cv_settings = {
    "num_boost_round": 500,
    "early_stopping_rounds": 10,
    "verbose_eval": 5,
    "seed": 42,
    "return_cvbooster": True,
    "stratified": False,
}

trained_models = []
for model in params:
    bst = lgb.cv(params[model], raw_set, **cv_settings)
    trained_models.append(bst)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7530
[LightGBM] [Info] Number of data points in the train set: 680372, number of used features: 106
[LightGBM] [Info] Start training from score 4.174475
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7530
[LightGBM] [Info] Number of data points in the train set: 680372, number of used features: 106
[LightGBM] [Info] Start training from score 4.179596
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7530
[LightGBM] [Info] Number of data points in the train set: 680372, number of used features: 106
[LightGBM] [Info] Start training from score 4.179968
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[490]	cv_agg's l2: 25.8295 + 0.13425
[495]	cv_agg's l2: 25.8267 + 0.133066
[500]	cv_agg's l2: 25.8232 + 0.132789


In [None]:
# Always scale the input. The most convenient way is to use a pipeline.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(
        max_iter=1000,
        validation_fraction=0.2,  # NOTE(review): only takes effect with early_stopping=True, which is not set
        learning_rate="adaptive",
        random_state=42,          # fixed so the shuffled SGD run is reproducible
        verbose=1,
    ),
)

# Fit on the training split only: the rows in X_test_c are scored below, so they
# must not take part in fitting the scaler or the regressor.
reg.fit(X_train_c, y_train_c)
mean_squared_error(y_test_c, reg.predict(X_test_c))

In [None]:
# Cross-validate every configured LightGBM model on the crafted feature set.
crafted_set = lgb.Dataset(X_crafted, y_crafted)

# NOTE: this rebinds `trained_models`; results stored under that name by an
# earlier cell are no longer reachable through it afterwards.
trained_models = []
for model in params:
    bst = lgb.cv(
        params[model],
        crafted_set,          # the dataset built above (was an undefined name, `train_set`)
        num_boost_round = 500,
        early_stopping_rounds = 10,
        verbose_eval = 5,
        seed = 42,
        return_cvbooster = True,
        stratified = False,   # regression target; same fold strategy as the raw-set run
    )
    trained_models.append(bst)

# NEURAL NETWORK (Pytorch or Tensorflow)

### TODO:
- Experiment with different architectures and techniques (e.g. MLP, convolutional NNs (?), ...)

# 1. Model definition

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CancelOut(nn.Module):
    '''
    CancelOut layer: multiplies the input element-wise by sigmoid(weights).

    inp - size handed to torch.zeros to create the learnable weight tensor;
          every weight starts at 4
    x   - an input data (vector, matrix, tensor)

    Extra positional and keyword arguments are accepted and ignored.
    '''

    def __init__(self, inp, *kargs, **kwargs):
        super().__init__()
        initial_weights = torch.zeros(inp) + 4
        # nn.Parameter marks the tensor as trainable (requires_grad defaults to True).
        self.weights = nn.Parameter(initial_weights)

    def forward(self, x):
        gate = torch.sigmoid(self.weights.float())
        return x * gate

class Autoencoder(torch.nn.Module):
    '''
    Fully connected autoencoder: n_features -> n_hidden -> n_encode -> n_hidden -> n_features.

    The first three linear layers are followed by leaky ReLU; the output layer is
    followed by a sigmoid, so every reconstructed value lies in (0, 1).
    NOTE(review): that output range only fits inputs scaled into (0, 1) - confirm
    how the training data is scaled.
    '''

    def __init__(self, n_features, n_hidden, n_encode):
        super().__init__()
        # Encoder
        self.hidden_enc = nn.Linear(n_features, n_hidden)
        self.encode = nn.Linear(n_hidden, n_encode)
        # Decoder
        self.hidden_dec = nn.Linear(n_encode, n_hidden)
        self.decode = nn.Linear(n_hidden, n_features)

    def forward(self, x):
        for layer in (self.hidden_enc, self.encode, self.hidden_dec):
            x = F.leaky_relu(layer(x))
        return torch.sigmoid(self.decode(x))
    
class Model(torch.nn.Module):
    '''
    Single-hidden-layer regressor with dropout (p=0.2) on the input and on the hidden layer.

    n_features - width of the input
    n_hidden   - width of the hidden layer
    n_output   - width of the prediction
    '''

    def __init__(self, n_features, n_hidden, n_output):
        super().__init__()
        # One Dropout module serves both dropout sites in forward().
        self.dropout = nn.Dropout(p=0.2)
        self.hidden = nn.Linear(n_features, n_hidden)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        # NOTE(review): leaky ReLU is applied to the raw input and twice to the
        # hidden activations - confirm that this placement is intended.
        dropped_input = F.leaky_relu(self.dropout(x))
        hidden_out = F.leaky_relu(self.hidden(dropped_input))
        hidden_out = F.leaky_relu(self.dropout(hidden_out))
        return self.predict(hidden_out)

# 2. Training loop

In [7]:
from torch import nn, optim
from utils import *  # presumably provides ETDData, DataSplit and EarlyStopping - confirm

# Hyperparameter values DL
LR = 0.001
EARLY_STOPPING_PATIENCE = 10

# Reproducibility: seed torch, numpy and the stdlib RNG.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("Importing data.")
# Every loaded column except 'atd' is fed to the network.
feature_list = [name for name in data.columns.values if name != 'atd']

etd_dataset = ETDData(data=data, feature_list=feature_list)
split = DataSplit(etd_dataset, shuffle=True)
# The second loader returned by get_split is not used in this cell.
trainloader, _, testloader = split.get_split(batch_size=50, num_workers=8)

print("Data imported.")
print("Instanciating model.")
# Each layer width is half of the previous one, rounded up.
n_features = len(feature_list)
n_hidden = math.ceil(n_features / 2)
n_hidden_2 = math.ceil(n_hidden / 2)
n_encode = math.ceil(n_hidden_2 / 2)

ae = Autoencoder(n_features=n_features, n_hidden=n_hidden, n_encode=n_encode)
ae.to(device)

criterion = nn.MSELoss()  # reconstruction loss
optimizer = optim.AdamW(ae.parameters(), lr=LR)

print("Start training.")
train_losses = []
test_losses = []
early_stopping = EarlyStopping(patience=EARLY_STOPPING_PATIENCE, verbose=True)

epochs = 100  # upper bound; early stopping may end training sooner
for epoch in range(epochs):
    # --- training pass: the autoencoder reconstructs its own input, labels are unused ---
    train_loss_sum = 0.0
    for batch, _labels in trainloader:
        batch = batch.float().to(device)
        optimizer.zero_grad()
        reconstruction = ae(batch)
        loss = criterion(reconstruction, batch)
        loss.backward()
        optimizer.step()
        train_loss_sum += loss.item()

    # --- evaluation pass on the test loader, without gradient tracking ---
    test_loss_sum = 0
    ae.eval()
    with torch.no_grad():
        for batch, _labels in testloader:
            batch = batch.float().to(device)
            reconstruction = ae(batch)
            test_loss_sum += criterion(reconstruction, batch).item()

    # Mean loss per batch for this epoch.
    mean_train_loss = train_loss_sum / len(trainloader)
    mean_test_loss = test_loss_sum / len(testloader)
    train_losses.append(mean_train_loss)
    test_losses.append(mean_test_loss)
    print(f"Epoch {epoch}/{epochs}.. "
          f"Train loss: {mean_train_loss:.3f}.. "
          f"Test loss: {mean_test_loss:.3f}.. ")

    # NOTE(review): early stopping is driven by the test loader's loss - confirm
    # whether the unused second loader was meant for this.
    early_stopping(mean_test_loss, ae)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Back to training mode for the next epoch.
    ae.train()

print('Finished Training')

Importing data.
Data imported.
Instanciating model.
Start training.
Epoch 0/100.. Train loss: 0.655.. Test loss: 0.778.. 
Validation loss decreased (inf --> 0.777819).  Saving model ...


KeyboardInterrupt: 

In [None]:
# Restore the weights stored in 'checkpoint.pt' (presumably written by the
# early-stopping helper during training - confirm). map_location remaps the stored
# tensors onto the active device, so a checkpoint written on a GPU also loads on a
# CPU-only kernel.
ae.load_state_dict(torch.load('checkpoint.pt', map_location=device))

# Saves the whole module object, not just its state_dict; loading this file later
# requires the Autoencoder class to be importable.
torch.save(ae, 'perceptron.pth')