In [44]:
import argparse
import time
import os

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader

from mscn.util import *
from mscn.data import get_train_datasets, load_data, make_dataset
from mscn.model import SetConv


In [45]:
def unnormalize_torch(vals, min_val, max_val):
    vals = (vals * (max_val - min_val)) + min_val
    return torch.exp(vals)


def qerror_loss(preds, targets, min_val, max_val):
    qerror = []
    preds = unnormalize_torch(preds, min_val, max_val)
    targets = unnormalize_torch(targets, min_val, max_val)

    for i in range(len(targets)):
        if (preds[i] > targets[i]).cpu().data.numpy()[0]:
            qerror.append(preds[i] / targets[i])
        else:
            qerror.append(targets[i] / preds[i])
    return torch.mean(torch.cat(qerror))

def print_qerror(preds_unnorm, labels_unnorm):
    qerror = []
    for i in range(len(preds_unnorm)):
        if preds_unnorm[i] > float(labels_unnorm[i]):
            qerror.append(preds_unnorm[i] / float(labels_unnorm[i]))
        else:
            qerror.append(float(labels_unnorm[i]) / float(preds_unnorm[i]))

    print("Median: {}".format(np.median(qerror)))
    print("90th percentile: {}".format(np.percentile(qerror, 90)))
    print("95th percentile: {}".format(np.percentile(qerror, 95)))
    print("99th percentile: {}".format(np.percentile(qerror, 99)))
    print("Max: {}".format(np.max(qerror)))
    print("Mean: {}".format(np.mean(qerror)))


In [46]:
dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test, max_num_joins, max_num_predicates, train_data, test_data = get_train_datasets(
        1000, 1000)



Loaded queries
Loaded bitmaps
min log(label): 0.0
max log(label): 19.94772801931604
Number of training samples: 900
Number of validation samples: 100
Created TensorDataset for training data
Created TensorDataset for validation data


In [47]:
max_num_joins, max_num_predicates

(2, 5)

In [50]:
train_data[99][0].shape

torch.Size([3, 1006])

In [13]:
table2vec, column2vec, op2vec, join2vec = dicts

In [48]:
sample_feats = len(table2vec) + 1000
predicate_feats = len(column2vec) + len(op2vec) + 1
join_feats = len(join2vec)

In [49]:
sample_feats , predicate_feats, join_feats

(1006, 13, 6)

In [20]:
model = SetConv(sample_feats, predicate_feats, join_feats, 256)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [22]:
train_data_loader = DataLoader(train_data, batch_size=1024)
test_data_loader = DataLoader(test_data, batch_size=1024)

model.train()
    

SetConv(
  (sample_mlp1): Linear(in_features=1006, out_features=256, bias=True)
  (sample_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (predicate_mlp1): Linear(in_features=13, out_features=256, bias=True)
  (predicate_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (join_mlp1): Linear(in_features=6, out_features=256, bias=True)
  (join_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (out_mlp1): Linear(in_features=768, out_features=256, bias=True)
  (out_mlp2): Linear(in_features=256, out_features=1, bias=True)
)

In [30]:
cuda = False
for epoch in range(100):
    loss_total = 0.

    for batch_idx, data_batch in enumerate(train_data_loader):

        samples, predicates, joins, targets, sample_masks, predicate_masks, join_masks = data_batch

        samples, predicates, joins, targets = Variable(samples), Variable(predicates), Variable(joins), Variable(
            targets)
        sample_masks, predicate_masks, join_masks = Variable(sample_masks), Variable(predicate_masks), Variable(
            join_masks)

        optimizer.zero_grad()
        outputs = model(samples, predicates, joins, sample_masks, predicate_masks, join_masks)
        loss = qerror_loss(outputs, targets.float(), min_val, max_val)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print("Epoch {}, loss: {}".format(epoch, loss_total / len(train_data_loader)))



Epoch 0, loss: 3.276638984680176
Epoch 10, loss: 3.1433732509613037
Epoch 20, loss: 3.0158510208129883
Epoch 30, loss: 2.9168648719787598
Epoch 40, loss: 2.8361403942108154
Epoch 50, loss: 2.7466354370117188
Epoch 60, loss: 2.705319881439209
Epoch 70, loss: 2.6309967041015625
Epoch 80, loss: 2.6024208068847656
Epoch 90, loss: 2.600196599960327


In [34]:
workload_name = 'sample'
num_materialized_samples = 1000


In [36]:

# Get final training and validation set predictions
preds_train, t_total = predict(model, train_data_loader, cuda)
print("Prediction time per training sample: {}".format(t_total / len(labels_train) * 1000))

preds_test, t_total = predict(model, test_data_loader, cuda)
print("Prediction time per validation sample: {}".format(t_total / len(labels_test) * 1000))

# Unnormalize
preds_train_unnorm = unnormalize_labels(preds_train, min_val, max_val)
labels_train_unnorm = unnormalize_labels(labels_train, min_val, max_val)

preds_test_unnorm = unnormalize_labels(preds_test, min_val, max_val)
labels_test_unnorm = unnormalize_labels(labels_test, min_val, max_val)

# Print metrics
print("\nQ-Error training set:")
print_qerror(preds_train_unnorm, labels_train_unnorm)

print("\nQ-Error validation set:")
print_qerror(preds_test_unnorm, labels_test_unnorm)
print("")



Prediction time per training sample: 0.06440215640597872
Prediction time per validation sample: 0.07446050643920898

Q-Error training set:
Median: 1.4701089086899124
90th percentile: 4.5302793730633715
95th percentile: 6.662839525210195
99th percentile: 18.0
Max: 38.333333333333336
Mean: 2.4878464752518052

Q-Error validation set:
Median: 1.810483919854768
90th percentile: 7.734071349329144
95th percentile: 16.53554852320675
99th percentile: 20.587663636363853
Max: 62.266363636363636
Mean: 4.103475171588795



In [51]:
preds_test

[tensor([0.6852]),
 tensor([0.2350]),
 tensor([0.1759]),
 tensor([0.7172]),
 tensor([0.8332]),
 tensor([0.2502]),
 tensor([0.8048]),
 tensor([0.7262]),
 tensor([0.4365]),
 tensor([0.8305]),
 tensor([0.3695]),
 tensor([0.3982]),
 tensor([0.8009]),
 tensor([0.3955]),
 tensor([0.5152]),
 tensor([0.8401]),
 tensor([0.6140]),
 tensor([0.7636]),
 tensor([0.6170]),
 tensor([0.7109]),
 tensor([0.2741]),
 tensor([0.8982]),
 tensor([0.6471]),
 tensor([0.1589]),
 tensor([0.7095]),
 tensor([0.8086]),
 tensor([0.7880]),
 tensor([0.3298]),
 tensor([0.6090]),
 tensor([0.1609]),
 tensor([0.3511]),
 tensor([0.8275]),
 tensor([0.8391]),
 tensor([0.8752]),
 tensor([0.5225]),
 tensor([0.7655]),
 tensor([0.6385]),
 tensor([0.8481]),
 tensor([0.6294]),
 tensor([0.3185]),
 tensor([0.1568]),
 tensor([0.6091]),
 tensor([0.6061]),
 tensor([0.7913]),
 tensor([0.8101]),
 tensor([0.7488]),
 tensor([0.6900]),
 tensor([0.6559]),
 tensor([0.7834]),
 tensor([0.2878]),
 tensor([0.2534]),
 tensor([0.6450]),
 tensor([0.7

In [42]:
example_labels = labels_train[-1:]
example_data = train_data[-1:]
example_labels,'',example_data

(array([0.6829955]), '', (tensor([[[0., 0., 0.,  ..., 0., 1., 0.],
           [0., 0., 1.,  ..., 1., 1., 1.],
           [0., 0., 0.,  ..., 1., 0., 0.]]]),
  tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
            0.0000, 1.0000, 0.0000, 0.0000, 1.0000],
           [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            1.0000, 0.0000, 0.0000, 1.0000, 0.8705],
           [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 1.0000, 0.0000, 0.0000, 0.0550],
           [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
            0.0000, 1.0000, 0.0000, 0.0000, 0.0714],
           [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]),
  tensor([[[0., 0., 0., 1., 0., 0.],
           [0., 0., 0., 0., 1., 0.]]]),
  tensor([0.6830]),
  tensor([[[1.],
           [1.],
           [1.]]]),
  tensor([[[1.],
           [1.],
           

In [43]:

example_data_loader = DataLoader(example_data, batch_size=1)

preds_example, t_total = predict(model, example_data_loader, cuda)
print("Prediction time per test sample: {}".format(t_total / len(example_labels) * 1000))

# Unnormalize
preds_example_unnorm = unnormalize_labels(preds_example, min_val, max_val)
# Print metrics
print("\nQ-Error example:")
print_qerror(preds_example_unnorm, label)


ValueError: not enough values to unpack (expected 7, got 1)