In [2]:
import csv
import torch
from torch.utils.data import dataset

from mscn.util import *
 

In [35]:
joins = []
predicates = []
tables = []
samples = []
label = []

with open("./data/train.csv", 'rU') as f:
    data_raw = list(list(rec) for rec in csv.reader(f, delimiter='#'))
    for row in data_raw:
        tables.append(row[0].split(','))
        joins.append(row[1].split(','))
        predicates.append(row[2].split(','))
        if int(row[3]) < 1:
            print("Queries must have non-zero cardinalities")
            exit(1)
        label.append(row[3])
predicates = [list(chunks(d, 3)) for d in predicates]
print("Loaded queries")

  import sys


Loaded queries


In [36]:
tables[0], joins[0], predicates[0]

(['title t', 'movie_info_idx mi_idx'],
 ['t.id=mi_idx.movie_id'],
 [['t.kind_id', '=', '7'], ['mi_idx.info_type_id', '>', '99']])

In [37]:
# Load bitmaps
num_bytes_per_bitmap = int((1000 + 7) >> 3)
with open("./data/train.bitmaps", 'rb') as f:
    for i in range(len(tables)):
        four_bytes = f.read(4)
        if not four_bytes:
            print("Error while reading 'four_bytes'")
            exit(1)
        num_bitmaps_curr_query = int.from_bytes(four_bytes, byteorder='little')
        bitmaps = np.empty((num_bitmaps_curr_query, num_bytes_per_bitmap * 8), dtype=np.uint8)
        for j in range(num_bitmaps_curr_query):
            # Read bitmap
            bitmap_bytes = f.read(num_bytes_per_bitmap)
            if not bitmap_bytes:
                print("Error while reading 'bitmap_bytes'")
                exit(1)
            bitmaps[j] = np.unpackbits(np.frombuffer(bitmap_bytes, dtype=np.uint8))
        samples.append(bitmaps)
print("Loaded bitmaps")

Loaded bitmaps


In [38]:
# Get column name dict
column_names = get_all_column_names(predicates)
column2vec, idx2column = get_set_encoding(column_names)

In [143]:
# Get table name dict
table_names = get_all_table_names(tables)
table2vec, idx2table = get_set_encoding(table_names)
len(table2vec)

6

In [40]:
# Get operator name dict
operators = get_all_operators(predicates)
op2vec, idx2op = get_set_encoding(operators)
op2vec

{'<': array([1., 0., 0.], dtype=float32),
 '=': array([0., 1., 0.], dtype=float32),
 '>': array([0., 0., 1.], dtype=float32)}

In [41]:
# Get join name dict
join_set = get_all_joins(joins)
join2vec, idx2join = get_set_encoding(join_set)
join2vec

{'': array([1., 0., 0., 0., 0., 0.], dtype=float32),
 't.id=ci.movie_id': array([0., 1., 0., 0., 0., 0.], dtype=float32),
 't.id=mc.movie_id': array([0., 0., 1., 0., 0., 0.], dtype=float32),
 't.id=mi.movie_id': array([0., 0., 0., 1., 0., 0.], dtype=float32),
 't.id=mi_idx.movie_id': array([0., 0., 0., 0., 1., 0.], dtype=float32),
 't.id=mk.movie_id': array([0., 0., 0., 0., 0., 1.], dtype=float32)}

In [42]:
# Get min and max values for each column
with open('./data/column_min_max_vals.csv', 'rU') as f:
    data_raw = list(list(rec) for rec in csv.reader(f, delimiter=','))
    column_min_max_vals = {}
    for i, row in enumerate(data_raw):
        if i == 0:
            continue
        column_min_max_vals[row[0]] = [float(row[1]), float(row[2])]

  


In [43]:
# Get feature encoding and proper normalization
samples_enc = encode_samples(tables, samples, table2vec)
predicates_enc, joins_enc = encode_data(predicates, joins, column_min_max_vals, column2vec, op2vec, join2vec)
label_norm, min_val, max_val = normalize_labels(label)

min log(label): 0.0
max log(label): 19.94772801931604


In [74]:
entry_at=5
tables[entry_at],samples_enc[entry_at],'', joins[entry_at],joins_enc[entry_at],'' , predicates[entry_at], predicates_enc[entry_at]

(['title t', 'movie_companies mc', 'movie_info mi'],
 [array([0., 0., 0., ..., 0., 1., 1.], dtype=float32),
  array([0., 1., 0., ..., 0., 0., 1.], dtype=float32),
  array([0., 0., 1., ..., 0., 0., 0.], dtype=float32)],
 '',
 ['t.id=mc.movie_id', 't.id=mi.movie_id'],
 [array([0., 0., 1., 0., 0., 0.], dtype=float32),
  array([0., 0., 0., 1., 0., 0.], dtype=float32)],
 '',
 [['t.production_year', '>', '1977'],
  ['mc.company_id', '>', '71403'],
  ['mi.info_type_id', '<', '4']],
 [array([0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       , 1.       , 0.       , 0.       , 1.       ,
         0.6978417], dtype=float32),
  array([0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.30384347], dtype=float32),
  array([0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 1.     

In [49]:
# Split in training and validation samples
num_queries = 1000
num_train = int(num_queries * 0.9)
num_test = num_queries - num_train

samples_train = samples_enc[:num_train]
predicates_train = predicates_enc[:num_train]
joins_train = joins_enc[:num_train]
labels_train = label_norm[:num_train]

samples_test = samples_enc[num_train:num_train + num_test]
predicates_test = predicates_enc[num_train:num_train + num_test]
joins_test = joins_enc[num_train:num_train + num_test]
labels_test = label_norm[num_train:num_train + num_test]

print("Number of training samples: {}".format(len(labels_train)))
print("Number of validation samples: {}".format(len(labels_test)))

max_num_joins = max(max([len(j) for j in joins_train]), max([len(j) for j in joins_test]))
max_num_predicates = max(max([len(p) for p in predicates_train]), max([len(p) for p in predicates_test]))

dicts = [table2vec, column2vec, op2vec, join2vec]
train_data = [samples_train, predicates_train, joins_train]
test_data = [samples_test, predicates_test, joins_test]

Number of training samples: 900
Number of validation samples: 100


In [None]:
samples_train[0], predicates_train[0], joins_train[0]

In [None]:
# Up to this point, basic encoding
# below part is casting to tensor

In [59]:
sample_masks = []
sample_tensors = []
for sample in samples_train:
    sample_tensor = np.vstack(sample)
    num_pad = max_num_joins + 1 - sample_tensor.shape[0]
    sample_mask = np.ones_like(sample_tensor).mean(1, keepdims=True)
    sample_tensor = np.pad(sample_tensor, ((0, num_pad), (0, 0)), 'constant')
    sample_mask = np.pad(sample_mask, ((0, num_pad), (0, 0)), 'constant')
    sample_tensors.append(np.expand_dims(sample_tensor, 0))
    sample_masks.append(np.expand_dims(sample_mask, 0))
sample_tensors = np.vstack(sample_tensors)
sample_tensors = torch.FloatTensor(sample_tensors)
sample_masks = np.vstack(sample_masks)
sample_masks = torch.FloatTensor(sample_masks)

In [132]:
example = samples_train[3]
example

[array([0., 0., 0., ..., 1., 1., 1.], dtype=float32),
 array([0., 1., 0., ..., 0., 0., 0.], dtype=float32)]

In [133]:
num_pad = 3 + 1 - example_tensor.shape[0]

In [134]:
example_tensor = np.vstack(example)
example_tensor

array([[0., 0., 0., ..., 1., 1., 1.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [135]:
example_mask = np.ones_like(example_tensor).mean(1, keepdims=True)
example_mask

array([[1.],
       [1.]], dtype=float32)

In [136]:
example_tensor = np.pad(example_tensor, ((0, num_pad), (0, 0)), 'constant')
example_tensor

array([[0., 0., 0., ..., 1., 1., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [137]:
np.ones_like(example_tensor).mean(1, keepdims=True)

array([[1.],
       [1.],
       [1.]], dtype=float32)

In [130]:
example_tensors = []

In [138]:
example_tensors.append(np.expand_dims(example_tensor, 0))

In [139]:
example_tensors

[array([[[0., 0., 0., ..., 0., 1., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32),
 array([[[0., 0., 0., ..., 1., 1., 1.],
         [0., 1., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)]

In [142]:
np.vstack(example_tensors)

array([[[0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 1., 1., 1.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [141]:
torch.FloatTensor(example_tensors)

tensor([[[[0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 1., 1., 1.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])

In [58]:
predicate_masks = []
predicate_tensors = []
for predicate in predicates_train:
    predicate_tensor = np.vstack(predicate)
    num_pad = max_num_predicates - predicate_tensor.shape[0]
    predicate_mask = np.ones_like(predicate_tensor).mean(1, keepdims=True)
    predicate_tensor = np.pad(predicate_tensor, ((0, num_pad), (0, 0)), 'constant')
    predicate_mask = np.pad(predicate_mask, ((0, num_pad), (0, 0)), 'constant')
    predicate_tensors.append(np.expand_dims(predicate_tensor, 0))
    predicate_masks.append(np.expand_dims(predicate_mask, 0))
predicate_tensors = np.vstack(predicate_tensors)
predicate_tensors = torch.FloatTensor(predicate_tensors)
predicate_masks = np.vstack(predicate_masks)
predicate_masks = torch.FloatTensor(predicate_masks)

In [57]:
join_masks = []
join_tensors = []
for join in joins_train:
    join_tensor = np.vstack(join)
    num_pad = max_num_joins - join_tensor.shape[0]
    join_mask = np.ones_like(join_tensor).mean(1, keepdims=True)
    join_tensor = np.pad(join_tensor, ((0, num_pad), (0, 0)), 'constant')
    join_mask = np.pad(join_mask, ((0, num_pad), (0, 0)), 'constant')
    join_tensors.append(np.expand_dims(join_tensor, 0))
    join_masks.append(np.expand_dims(join_mask, 0))
join_tensors = np.vstack(join_tensors)
join_tensors = torch.FloatTensor(join_tensors)
join_masks = np.vstack(join_masks)
join_masks = torch.FloatTensor(join_masks)

In [61]:
target_tensor = torch.FloatTensor(labels_train)

dataset.TensorDataset(sample_tensors, predicate_tensors, join_tensors, target_tensor, sample_masks,
                             predicate_masks, join_masks)

<torch.utils.data.dataset.TensorDataset at 0x1234e53c8>

In [None]:
train_dataset = make_dataset(*train_data, labels=labels_train, max_num_joins=max_num_joins,
                                 max_num_predicates=max_num_predicates)
print("Created TensorDataset for training data")
test_dataset = make_dataset(*test_data, labels=labels_test, max_num_joins=max_num_joins,
                            max_num_predicates=max_num_predicates)
print("Created TensorDataset for validation data")
dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test, max_num_joins, max_num_predicates, train_dataset, test_dataset

In [145]:
train_dataset[99]

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 1., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
          0.0000, 1.0000, 0.0000, 1.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          0.0000, 0.0000, 1.0000, 0.9281],
         [0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000, 0.6606],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000, 0.0398],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000]]),
 tensor([[0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]]),
 tensor(0.6623),
 tensor