In [1]:
import moon.data
import moon.problem

In [2]:
# Load the cleaned problem set; later cells iterate over `probs` and take its length.
PROBLEMS_CSV = 'data/cleaned_probs.csv'
probs = moon.data.read_problems(PROBLEMS_CSV)

In [3]:
import numpy as np

# Stack every problem into one feature tensor and one label matrix.
x = np.array([p.array for p in probs])
y = np.array([p.grade.ordinal[1:] for p in probs])  # Convert "ordinal" to "rank"

print(x.shape, y.shape)

# Hold out a fixed fraction of the problems for evaluation.
split = 0.2
# Seed a local generator so the split is the same after every Restart & Run All;
# the global NumPy random state is left untouched.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(len(probs))
n_test = int(len(probs) * split)

# The first `n_test` shuffled indices form the test set, the remainder the training set.
test_idx, train_idx = indices[:n_test], indices[n_test:]
x_train, y_train = x[train_idx], y[train_idx]
x_test, y_test = x[test_idx], y[test_idx]
assert len(x_train) + len(x_test) == len(probs), "train/test split must partition the data"
print((x_train.shape, y_train.shape), (x_test.shape, y_test.shape))

(30991, 18, 11) (30991, 16)
((24793, 18, 11), (24793, 16)) ((6198, 18, 11), (6198, 16))


In [34]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K

def accuracy_k_coral(k):
    """Build a metric: fraction of samples whose predicted rank is within ``k`` of the true rank.

    Ranks are decoded by counting, along the last axis, the entries of ``y_pred``
    above 0.5 and by summing ``y_true``.

    Args:
        k: largest absolute rank difference that still counts as a hit
            (``0`` gives exact accuracy).

    Returns:
        A function ``fn(y_true, y_pred)`` returning the mean hit rate over the batch,
        named ``'acc'`` when ``k == 0`` and ``'CS{k}'`` otherwise.
    """
    def fn(y_true, y_pred):
        # Cast the labels explicitly: integer label arrays passed in directly would
        # otherwise clash with the float32 predicted rank in the subtraction below.
        true_rank = tf.reduce_sum(tf.cast(y_true, 'float32'), axis=-1)
        pred_rank = tf.reduce_sum(tf.cast(y_pred > 0.5, 'float32'), axis=-1)
        within_k = tf.abs(pred_rank - true_rank) <= k
        return tf.reduce_mean(tf.cast(within_k, 'float32'), axis=-1)
    # Keras labels the metric with the function name:
    # https://stackoverflow.com/questions/57910680/how-to-name-custom-metrics-in-keras-fit-output
    fn.__name__ = 'acc' if k == 0 else f'CS{k}'
    return fn

def mae_coral():
    """Build a metric: mean absolute difference between predicted and true rank.

    Ranks are decoded by counting, along the last axis, the entries of ``y_pred``
    above 0.5 and by summing ``y_true``.

    Returns:
        A function ``fn(y_true, y_pred)`` named ``'mae'`` that returns the mean
        absolute rank error over the batch.
    """
    def fn(y_true, y_pred):
        # Cast the labels explicitly: integer label arrays passed in directly would
        # otherwise clash with the float32 predicted rank in the subtraction below.
        true_rank = tf.reduce_sum(tf.cast(y_true, 'float32'), axis=-1)
        pred_rank = tf.reduce_sum(tf.cast(y_pred > 0.5, 'float32'), axis=-1)
        abs_error = tf.abs(pred_rank - true_rank)
        return tf.reduce_mean(abs_error, axis=-1)
    fn.__name__ = 'mae'
    return fn

def rmse_coral():
    """Build a metric: root-mean-square difference between predicted and true rank.

    Ranks are decoded by counting, along the last axis, the entries of ``y_pred``
    above 0.5 and by summing ``y_true``.

    Returns:
        A function ``fn(y_true, y_pred)`` named ``'rmse'`` that returns the RMSE of
        the rank error over the batch it is given.
    """
    def fn(y_true, y_pred):
        # Cast the labels explicitly: integer label arrays passed in directly would
        # otherwise clash with the float32 predicted rank in the subtraction below.
        true_rank = tf.reduce_sum(tf.cast(y_true, 'float32'), axis=-1)
        pred_rank = tf.reduce_sum(tf.cast(y_pred > 0.5, 'float32'), axis=-1)
        squared_error = (pred_rank - true_rank)**2
        # NOTE(review): the square root is taken per batch. If the caller averages
        # these values across batches, the result is a mean of batch RMSEs, which is
        # not the RMSE over the whole dataset -- confirm this is acceptable.
        return tf.math.sqrt(tf.reduce_mean(squared_error, axis=-1))
    fn.__name__ = 'rmse'
    return fn

# see
# https://arxiv.org/abs/1901.07884 and https://github.com/Raschka-research-group/coral-cnn/issues/9
# more background https://arxiv.org/abs/1705.05278

# NOTE(review): CoralOutput is defined a second time further down in this cell; the
# later definition is the one bound once the cell has finished running.
class CoralOutput(tf.keras.layers.Layer):
    """CORAL ordinal-regression head: one shared weight vector, one bias per threshold.

    The incoming features are projected to a single score by ``kernel``. That score
    is repeated ``output_len`` times and shifted by ``output_len`` separate biases,
    giving one logit per output.

    Args:
        output_len: number of outputs (logits) produced per sample.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (for example ``name``).

    Calling the layer returns ``(logits, probits)`` with ``probits = sigmoid(logits)``.
    """

    def __init__(self, output_len, **kwargs):
        super().__init__(**kwargs)
        self.output_len = output_len

    def build(self, input_shape):
        # Weight names are passed by keyword because the position of ``name`` in
        # ``add_weight`` is not the same across Keras releases.
        self.kernel = self.add_weight(
            name='kernel',
            shape=[int(input_shape[-1]), 1],
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True
        )
        self.biases = self.add_weight(
            name='biases',
            shape=[self.output_len,],
            initializer='zeros',
            dtype='float32',
            trainable=True
        )

    def call(self, input):
        # (batch, features) @ (features, 1) -> one shared score per sample.
        fc = tf.matmul(input, self.kernel)
        # Repeat the shared score once per output, then add that output's own bias.
        fc = tf.tile(fc, [1, self.output_len])
        logits = tf.nn.bias_add(fc, self.biases, name='logits')
        probits = tf.math.sigmoid(logits, name='probits')
        return logits, probits

    def get_config(self):
        """Return the layer configuration, including ``output_len``, for serialization."""
        config = super().get_config()
        config['output_len'] = self.output_len
        return config


def _task_importance_weighting(y):
    n_ranks = y.shape[-1]
    ranks = 1 + np.sum(y, axis=-1)
    n_examples = len(y)
    m = np.zeros(n_ranks)
    for k in range(n_ranks):
        s_k = np.sum(ranks > (k + 1))
        m[k] = np.sqrt(max(s_k, n_examples - s_k))
    return (m / np.max(m))


# Adaptation of https://github.com/Raschka-research-group/coral-cnn/blob/master/model-code/resnet34/cacd-coral.py#L326
# The authors describe this as "weighted cross-entropy" of the K-1 binary classifiers.
# Note (1) we calculate on the output logits, (2) the form in code differs from the form
# in the paper for numerical stability, (3) importance weightings are applied when
# `rank_data` is given and are 1 (unweighted) otherwise.
# NOTE(review): coral_loss is defined again further down in this cell; the later
# definition is the one in effect after the cell runs.
def coral_loss(rank_data=None):
    """Build the CORAL loss, evaluated on logits.

    Args:
        rank_data: optional label matrix used to derive per-output importance
            weights via ``_task_importance_weighting``. When ``None`` every output
            is weighted equally.

    Returns:
        A function ``loss_logits(y_true, y_pred)`` that returns the batch mean of the
        weighted binary cross-entropy summed over axis 1. ``y_pred`` must be logits.
    """
    importance = 1 if rank_data is None else _task_importance_weighting(rank_data)

    def loss_logits(y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels may arrive as integers; bring them to the logits' dtype before mixing.
        y_true = tf.cast(y_true, y_pred.dtype)
        log_p = tf.math.log_sigmoid(y_pred)
        # log(1 - sigmoid(z)) is written as log_sigmoid(z) - z.
        unweighted = (log_p * y_true) + (log_p - y_pred) * (1 - y_true)
        return tf.reduce_mean(-1 * tf.reduce_sum(importance * unweighted, axis=1))
    return loss_logits

# see
# https://arxiv.org/abs/1901.07884 and https://github.com/Raschka-research-group/coral-cnn/issues/9
# more background https://arxiv.org/abs/1705.05278

# NOTE(review): this re-defines CoralOutput, which is already defined earlier in this
# cell. This later definition is the one in effect; the earlier copy should be deleted.
class CoralOutput(tf.keras.layers.Layer):
    """CORAL ordinal-regression head: one shared weight vector, one bias per threshold.

    The incoming features are projected to a single score by ``kernel``. That score
    is repeated ``output_len`` times and shifted by ``output_len`` separate biases,
    giving one logit per output.

    Args:
        output_len: number of outputs (logits) produced per sample.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (for example ``name``).

    Calling the layer returns ``(logits, probits)`` with ``probits = sigmoid(logits)``.
    """

    def __init__(self, output_len, **kwargs):
        super().__init__(**kwargs)
        self.output_len = output_len

    def build(self, input_shape):
        # Weight names are passed by keyword because the position of ``name`` in
        # ``add_weight`` is not the same across Keras releases.
        self.kernel = self.add_weight(
            name='kernel',
            shape=[int(input_shape[-1]), 1],
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True
        )
        self.biases = self.add_weight(
            name='biases',
            shape=[self.output_len,],
            initializer='zeros',
            dtype='float32',
            trainable=True
        )

    def call(self, input):
        # (batch, features) @ (features, 1) -> one shared score per sample.
        fc = tf.matmul(input, self.kernel)
        # Repeat the shared score once per output, then add that output's own bias.
        fc = tf.tile(fc, [1, self.output_len])
        logits = tf.nn.bias_add(fc, self.biases, name='logits')
        probits = tf.math.sigmoid(logits, name='probits')
        return logits, probits

    def get_config(self):
        """Return the layer configuration, including ``output_len``, for serialization."""
        config = super().get_config()
        config['output_len'] = self.output_len
        return config


def _task_importance_weighting(y):
    n_ranks = y.shape[-1]
    ranks = 1 + np.sum(y, axis=-1)
    n_examples = len(y)
    m = np.zeros(n_ranks)
    for k in range(n_ranks):
        s_k = np.sum(ranks > (k + 1))
        m[k] = np.sqrt(max(s_k, n_examples - s_k))
    return (m / np.max(m))


# Adaptation of https://github.com/Raschka-research-group/coral-cnn/blob/master/model-code/resnet34/cacd-coral.py#L326
# The authors describe this as "weighted cross-entropy" of the K-1 binary classifiers.
# Note (1) we calculate on the output logits, (2) the form in code differs from the form
# in the paper for numerical stability, (3) importance weightings are applied when
# `rank_data` is given and are 1 (unweighted) otherwise.
# NOTE(review): this re-defines coral_loss, which is already defined earlier in this
# cell. This later definition is the one in effect; the earlier copy should be deleted.
def coral_loss(rank_data=None):
    """Build the CORAL loss, evaluated on logits.

    Args:
        rank_data: optional label matrix used to derive per-output importance
            weights via ``_task_importance_weighting``. When ``None`` every output
            is weighted equally.

    Returns:
        A function ``loss_logits(y_true, y_pred)`` that returns the batch mean of the
        weighted binary cross-entropy summed over axis 1. ``y_pred`` must be logits.
    """
    importance = 1 if rank_data is None else _task_importance_weighting(rank_data)

    def loss_logits(y_true, y_pred):
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels may arrive as integers; bring them to the logits' dtype before mixing.
        y_true = tf.cast(y_true, y_pred.dtype)
        log_p = tf.math.log_sigmoid(y_pred)
        # log(1 - sigmoid(z)) is written as log_sigmoid(z) - z.
        unweighted = (log_p * y_true) + (log_p - y_pred) * (1 - y_true)
        return tf.reduce_mean(-1 * tf.reduce_sum(importance * unweighted, axis=1))
    return loss_logits

In [47]:
# --- Hyper-parameters ---------------------------------------------------------
p = .5  # dropout rate after each hidden layer; values <= 0 skip the Dropout layer
input_shape = moon.problem.Problem.GRID_SHAPE
hiddens = [20]  # units per hidden Dense layer, in order
hidden_activation = 'swish'
output_len = y.shape[-1]  # moon.problem.Grade.N_GRADES - 1
output_activation = 'sigmoid'  # not referenced below; CoralOutput applies the sigmoid itself

# Metrics read the 'probits' output; the loss is computed on the 'logits' output.
metrics = {'probits': [accuracy_k_coral(0), accuracy_k_coral(1), accuracy_k_coral(2), mae_coral(), rmse_coral()]}
# Importance weights are estimated from the training labels only, so the held-out
# split has no influence on the training objective.
loss = {'logits': coral_loss(rank_data=y_train)}
adam_lr = 1e-3
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optim = tf.keras.optimizers.Adam(learning_rate=adam_lr)

# --- Network: flatten -> [Dense -> activation -> dropout]* -> CORAL head -----------
in_x = layers.Input(shape=input_shape)
features = layers.Flatten()(in_x)
for nodes in hiddens:
    features = layers.Dense(nodes)(features)
    features = layers.Activation(hidden_activation)(features)
    if p > 0:
        features = layers.Dropout(p)(features)
out = CoralOutput(output_len)(features)
# The identity Lambda layers only give the two outputs the names that the
# `loss` and `metrics` dicts are keyed on.
logits = layers.Lambda(lambda t: t, name='logits')(out[0])
probits = layers.Lambda(lambda t: t, name='probits')(out[1])

model = tf.keras.Model(in_x, [logits, probits], name='mlp_ord_regress')
model.compile(loss=loss, optimizer=optim, metrics=metrics)
# model.summary()

# --- Training --------------------------------------------------------------------
# NOTE(review): early stopping is driven by the same split that is reported as the
# test set -- confirm a separate validation split is not needed.
callbacks = [tf.keras.callbacks.EarlyStopping(patience=3, verbose=1)]
batch_size = 64
max_epochs = 300

history = model.fit(x_train, y_train, batch_size=batch_size, validation_data=(x_test, y_test), epochs=max_epochs, callbacks=callbacks)

# [64, 64], swish, 1e-4, weighted : e222
# loss: 1.9850 - logits_loss: 1.9850 - probits_acc: 0.3914 - probits_CS1: 0.7283 - probits_CS2: 0.9021 - probits_mae: 1.0237 - probits_rmse: 1.4996 - val_loss: 1.9964 - val_logits_loss: 1.9964 - val_probits_acc: 0.4009 - val_probits_CS1: 0.7228 - val_probits_CS2: 0.8974 - val_probits_mae: 1.0311 - val_probits_rmse: 1.5261

# [32, 32], swish, 1e-4, unweighted
# loss: 2.3904 - logits_loss: 2.3904 - probits_acc: 0.3860 - probits_CS1: 0.7195 - probits_CS2: 0.8967 - val_loss: 2.3429 - val_logits_loss: 2.3429 - val_probits_acc: 0.4045 - val_probits_CS1: 0.7236 - val_probits_CS2: 0.8988

# [32, 32], swish, 1e-3, weighted : e69
# loss: 1.9615 - logits_loss: 1.9615 - probits_acc: 0.3893 - probits_CS1: 0.7298 - probits_CS2: 0.9044 - val_loss: 1.9623 - val_logits_loss: 1.9623 - val_probits_acc: 0.4011 - val_probits_CS1: 0.7304 - val_probits_CS2: 0.9017

# [32, 32], swish, 1e-3, unweighted : e59
# loss: 2.3329 - logits_loss: 2.3329 - probits_acc: 0.3941 - probits_CS1: 0.7271 - probits_CS2: 0.9024 - val_loss: 2.3219 - val_logits_loss: 2.3219 - val_probits_acc: 0.3998 - val_probits_CS1: 0.7296 - val_probits_CS2: 0.9029

# [32, 32], relu, 1e-3, weighted : e39
# loss: 1.9442 - logits_loss: 1.9442 - probits_acc: 0.3801 - probits_CS1: 0.7346 - probits_CS2: 0.9086 - val_loss: 2.0306 - val_logits_loss: 2.0306 - val_probits_acc: 0.3871 - val_probits_CS1: 0.7162 - val_probits_CS2: 0.8927

# [20], swish, 1e-3, weighted : e56
# loss: 2.0464 - logits_loss: 2.0464 - probits_acc: 0.3785 - probits_CS1: 0.7118 - probits_CS2: 0.8934 - probits_mae: 1.0683 - probits_rmse: 1.5502 - val_loss: 2.0134 - val_logits_loss: 2.0134 - val_probits_acc: 0.3918 - val_probits_CS1: 0.7149 - val_probits_CS2: 0.8949 - val_probits_mae: 1.0516 - val_probits_rmse: 1.5428

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300


Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300


Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 00056: early stopping


In [46]:
# Inspect what the last `fit` call recorded: the tracked statistics and the epoch count.
training_log = history.history
print(training_log.keys())  # stats
print(len(training_log['loss']))  # epochs

dict_keys(['loss', 'logits_loss', 'probits_acc', 'probits_CS1', 'probits_CS2', 'probits_mae', 'probits_rmse', 'val_loss', 'val_logits_loss', 'val_probits_acc', 'val_probits_CS1', 'val_probits_CS2', 'val_probits_mae', 'val_probits_rmse'])
222


In [18]:
# Spot-check a handful of held-out problems: true rank, predicted rank, raw probits.
sample_rows = [2, 4, 8, 16, 32, 64, 128]
y_true = y_test[sample_rows]
y_pred = model.predict(x_test[sample_rows])

# The model outputs [logits, probits]; index 1 selects the probits.
probits_pred = y_pred[1]
print(np.sum(y_true, axis=-1) + 1)
print(np.sum(probits_pred > .5, axis=-1) + 1)
print(probits_pred)

[ 5 11  8  5  8 14  6]
[ 6  9  8  5  7 11  5]
[[9.9937040e-01 9.9937040e-01 9.9937040e-01 9.9930549e-01 5.3878462e-01
  3.3787346e-01 1.5452674e-01 5.6651533e-02 2.0606816e-02 1.0650635e-02
  3.8231611e-03 1.0341406e-03 3.6516786e-04 1.3330579e-04 8.4384490e-05
  5.7297973e-05]
 [9.9996597e-01 9.9996597e-01 9.9996597e-01 9.9996245e-01 9.5578682e-01
  9.0424240e-01 7.7180529e-01 5.2636230e-01 2.8024450e-01 1.6612193e-01
  6.6311300e-02 1.8797606e-02 6.7144930e-03 2.4591386e-03 1.5591383e-03
  1.0591745e-03]
 [9.9992853e-01 9.9992853e-01 9.9992853e-01 9.9992108e-01 9.1147768e-01
  8.1810713e-01 6.1699939e-01 3.4611639e-01 1.5644103e-01 8.6663932e-02
  3.2720566e-02 9.0423524e-03 3.2094121e-03 1.1727810e-03 7.4315071e-04
  5.0482154e-04]
 [9.9895829e-01 9.9895829e-01 9.9895829e-01 9.9885082e-01 4.1373944e-01
  2.3563454e-01 9.9435955e-02 3.5009652e-02 1.2551397e-02 6.4615309e-03
  2.3131669e-03 6.2501431e-04 2.2062659e-04 8.0475562e-05 5.0978546e-05
  3.4614979e-05]
 [9.9987292e-01 9.9987

In [15]:
# Show the per-output importance weights derived from the full label matrix.
importance_weights = _task_importance_weighting(y)
print(importance_weights)

[1.         1.         0.99998387 0.99988706 0.81527716 0.73880707
 0.77520879 0.85887471 0.91571489 0.94216616 0.96967674 0.98878876
 0.99519675 0.99830452 0.9991284  0.99959658]
