# Imbalanced Classes

In [119]:
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import tensorflow as tf

n_epochs = 300
batch_size = 128

def batch_generator(data, labels, batch_size):
    n_batches = int(np.ceil(data.shape[0] / batch_size))
    start = 0
    for batch in range(n_batches):
        yield data[start:(start+batch_size)], labels[start:(start+batch_size)]
        start += batch_size

X_train, Y_train = make_classification(weights=[0.9], n_samples=50000)
X_dev, Y_dev = make_classification(weights=[0.9], n_samples=10000)
X_test, Y_test = make_classification(weights=[0.9], n_samples=10000)

X_train, X_dev, X_test = X_train.astype('float32'), X_dev.astype('float32'), X_test.astype('float32')

ohe = OneHotEncoder(sparse=False)

y_train = ohe.fit_transform(Y_train.reshape(-1, 1))
y_dev = ohe.fit_transform(Y_dev.reshape(-1, 1))
y_test = ohe.fit_transform(Y_test.reshape(-1, 1))

print(y_train.shape)

(50000, 2)


In [120]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, 20], name='X')
y = tf.placeholder(tf.float32, shape=[None, 2], name='y')

with tf.name_scope('hidden1'):
    hidden1 = tf.layers.dense(inputs=X, units=20, activation=tf.nn.relu, 
                              kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
                              name='hidden1')
    logits = tf.layers.dense(inputs=hidden1, units=2, name='logits')

with tf.name_scope('loss_function'):
    # xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    xentropy = tf.nn.weighted_cross_entropy_with_logits(logits=logits, targets=y, pos_weight=9)
    loss = tf.reduce_mean(xentropy)

with tf.name_scope('training_step'):
    optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
    train_step = optimizer.minimize(loss)
    y_pred = tf.argmax(logits, axis=1)
    # correct = tf.nn.in_top_k(k=1, predictions=logits, targets=y)
    # accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))
    
init = tf.global_variables_initializer()

In [121]:
with tf.Session() as sess:
    sess.run(init)
    n_batches = int(np.ceil(X_train.shape[0]/batch_size))
    for epoch in range(n_epochs):
        train_pred = np.array([])
        batchgen_train = batch_generator(X_train, y_train, batch_size)
        for Xb, yb in batchgen_train:
            _, tmp = sess.run([train_step, y_pred], feed_dict={X: Xb, y: yb})
            train_pred = np.append(train_f1, tmp)
        
        # Full pass on the dev set
        batchgen_dev = batch_generator(X_dev, y_dev, batch_size)
        dev_pred = np.array([])
        for Xbdev, ybdev in batchgen_dev:
            tmp = sess.run(y_pred, feed_dict={X: Xbdev, y:ybdev})
            dev_pred = np.append(dev_pred, tmp)
            
        # print('Train F1: ', f1_score(Y_train, train_pred))
        if epoch % 100 == 0:
            print('Epoch: ', epoch, 'Dev F1: ', f1_score(Y_dev, dev_pred))

Epoch:  0 Dev F1:  0.119009584665
Epoch:  100 Dev F1:  0.16112198304
Epoch:  200 Dev F1:  0.146969183881
Epoch:  300 Dev F1:  0.137552470132


KeyboardInterrupt: 

In [None]:
train_pred