In [1]:
from statistics import mean

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)
tf.random.set_random_seed(47)

from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [2]:
df = pd.read_csv("data/multilabel_dataset.csv")
df.describe(include="all")

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13
count,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,...,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0,2417.0
mean,0.001173,-0.000436,-0.000257,0.000265,0.001228,0.000475,0.001107,0.00042,0.001076,-9e-06,...,0.298717,0.247,0.177079,0.198593,0.073645,0.104675,0.11957,0.751345,0.744311,0.014067
std,0.097411,0.097885,0.097746,0.096969,0.096909,0.097306,0.09717,0.096803,0.096326,0.096805,...,0.45779,0.431356,0.381815,0.399024,0.261246,0.306198,0.324525,0.432323,0.436338,0.117792
min,-0.371146,-0.472632,-0.339195,-0.467945,-0.367044,-0.509447,-0.319928,-0.594498,-0.369712,-0.767128,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.053655,-0.058734,-0.057526,-0.057149,-0.058461,-0.060212,-0.058445,-0.062849,-0.063472,-0.06501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.003649,-0.003513,0.002892,-0.000153,0.005565,0.000321,0.006179,0.001436,0.003515,0.002432,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,0.057299,0.048047,0.061007,0.054522,0.066286,0.059908,0.068892,0.061418,0.064958,0.063096,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,0.520272,0.614114,0.353241,0.56896,0.307649,0.336971,0.351401,0.454591,0.419852,0.420876,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
X = df.iloc[:, :-14].values
Y = df.iloc[:, -14:].values
cv = KFold(n_splits=5, random_state=37)

# scikit-learn baseline

In [4]:
lr = LogisticRegression(solver="lbfgs")
clf = OneVsRestClassifier(lr)

In [5]:
scores = []

for train, test in cv.split(X, Y):
    clf.fit(X[train], Y[train])
    Y_pred = clf.predict(X[test])
    score = f1_score(Y[test], Y_pred, average="micro")
    scores.append(score)

print(f"Micro-averaged f1 on cross validation: {mean(scores)}")

Micro-averaged f1 on cross validation: 0.6342384766906916


# pure tensorflow

In [6]:
# Model parameters
learning_rate = 0.03
num_epochs = 100

# Dimensions
num_features = len(X[0])
num_labels = len(Y[0])

In [7]:
# Create placeholders for features and labels
X_tensor = tf.placeholder(tf.float32, name="features")
Y_tensor = tf.placeholder(tf.float32, name="labels")

# Create variables for weights and bias
w = tf.get_variable(
    shape=(num_features, num_labels),
    initializer=tf.random_normal_initializer(),
    name="weights",
)
b = tf.get_variable(
    shape=(1, num_labels), initializer=tf.zeros_initializer(), name="bias"
)

# Build a model returning logits
logits = tf.matmul(X_tensor, w) + b

# Define loss function. Unlike the single-label case, we should not output
# a softmax probability distribultion as labels are classified independently.
# Instead we apply a sigmoid on the logits as they are independent logistic regressions.
# Since we treat each logit as an independent logistic regression, we need to sum
# so that the whole model's performance is the sum of its per-class performances
loss = tf.reduce_mean(
    tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=Y_tensor), axis=1
    )
)

# Define training operation
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Make prediction
def multi_label_hot(prediction, threshold=0.5):
    prediction = tf.cast(prediction, tf.float32)
    return tf.cast(tf.greater(prediction, threshold), tf.int64)


prediction = tf.sigmoid(logits)
one_hot_prediction = multi_label_hot(prediction)

In [8]:
scores = []

for train, test in cv.split(X, Y):
    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(num_epochs):
            _, l = sess.run(
                [optimizer, loss], feed_dict={X_tensor: X[train], Y_tensor: Y[train]}
            )

        # Calculate predicted values
        Y_pred = sess.run(one_hot_prediction, {X_tensor: X[test], Y_tensor: Y[test]})

    score = f1_score(Y[test], Y_pred, average="micro")
    scores.append(score)

print(f"Micro-averaged f1 on cross validation: {mean(scores)}")

Micro-averaged f1 on cross validation: 0.6306492826435719


# keras

In [9]:
scores = []

for train, test in cv.split(X, Y):
    # Create and compile model
    model = Sequential()
    model.add(Dense(num_labels, activation="sigmoid", input_shape=(num_features,)))

    adam = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=adam, loss="binary_crossentropy", metrics=["accuracy"])

    # Fit and make prediction
    model.fit(X[train], Y[train], epochs=num_epochs, batch_size=200, verbose=0)
    Y_pred = (model.predict(X[test]) > 0.5).astype(np.uint8)

    score = f1_score(Y[test], Y_pred, average="micro")
    scores.append(score)

print(f"Micro-averaged f1 on cross validation: {mean(scores)}")

Micro-averaged f1 on cross validation: 0.6309615543717694


# keras with nonlinearity

In [10]:
scores = []

for train, test in cv.split(X, Y):
    # Create and compile model
    model = Sequential()
    model.add(Dense(200, activation="relu", input_shape=(num_features,)))
    model.add(Dropout(0.3))
    model.add(Dense(200, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(200, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Fit and make prediction
    es = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=1)
    model.fit(
        X[train],
        Y[train],
        epochs=num_epochs,
        batch_size=200,
        verbose=1,
        validation_split=0.3,
        callbacks=[es]
    )
    Y_pred = (model.predict(X[test]) > 0.5).astype(np.uint8)

    score = f1_score(Y[test], Y_pred, average="micro")
    scores.append(score)

print(f"Micro-averaged f1 on cross validation: {mean(scores)}")

Train on 1353 samples, validate on 580 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 00028: early stopping
Train on 1353 samples, validate on 580 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 00026: early stopping
Train on 1353 samples, validate on 581 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/10