In [1]:
import warnings
# Silence every Python warning for the rest of the session; no category or
# module filter is given, so deprecation notices are hidden as well.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import io

import tensorflow as tf

In [3]:
# Only emit TensorFlow log messages at ERROR severity.
tf.logging.set_verbosity(tf.logging.ERROR)
# Fix TensorFlow's random seed at 1.
# NOTE(review): tf.estimator models build their own graph when training, so
# this seed may not apply to the classifier defined further down -- confirm.
tf.set_random_seed(1)

### Load data

In [4]:
# Download the train/test TFRecord files; each is stored under the last path
# component of its URL and get_file hands back the local path.
train_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/train.tfrecord'
test_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/test.tfrecord'

train_path = tf.keras.utils.get_file(fname=train_url.rsplit('/', 1)[-1], origin=train_url)
test_path = tf.keras.utils.get_file(fname=test_url.rsplit('/', 1)[-1], origin=test_url)

In [5]:
# load data
def parse_fn(record):
    """Decode one serialized example into a ``(features, labels)`` pair.

    Args:
        record: a single serialized example, as yielded by the TFRecord dataset.

    Returns:
        A tuple ``({'terms': terms}, labels)`` where ``terms`` holds the
        variable-length string terms of the example and ``labels`` is the
        float32 label of shape ``[1]``.
    """
    feature_spec = {
        # a variable number of string terms per example
        "terms": tf.VarLenFeature(dtype=tf.string),
        # one float label per example (0 or 1)
        "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32),
    }
    example = tf.parse_single_example(record, feature_spec)

    # Only the `.values` of the parsed variable-length feature are passed on.
    return {'terms': example['terms'].values}, example['labels']

def input_fn(input_filenames, num_epochs=None, batch_size=25, shuffle=True):
    """Build the input pipeline and return its next ``(features, labels)`` batch.

    Args:
        input_filenames: TFRecord file path(s) to read.
        num_epochs: forwarded to ``Dataset.repeat``; ``None`` (the default) is
            that API's "repeat indefinitely" value.
        batch_size: number of examples per padded batch.
        shuffle: when true, shuffle examples with a 100000-element buffer
            before batching.

    Returns:
        The ``get_next()`` tensors of a one-shot iterator over the dataset.
    """
    examples = tf.data.TFRecordDataset(input_filenames).map(parse_fn)

    if shuffle:
        examples = examples.shuffle(100000)

    # Examples carry different numbers of terms, so each batch is padded to
    # the dataset's declared output shapes.
    batches = examples.padded_batch(batch_size, examples.output_shapes)
    repeated = batches.repeat(num_epochs)

    iterator = repeated.make_one_shot_iterator()
    return iterator.get_next()

In [6]:
# download the vocabulary file.
terms_url = 'https://download.mlcc.google.com/mledu-datasets/sparse-data-embedding/terms.txt'
terms_path = tf.keras.utils.get_file(terms_url.split('/')[-1], terms_url)

# Deduplicate the whitespace-separated terms, then sort them. Iterating a set
# of strings yields an order that can differ between Python processes (string
# hash randomization), which would change each term's position in the list --
# and therefore its vocabulary index -- from one run to the next.
with io.open(terms_path, 'r', encoding='utf8') as f:
    vocab = sorted(set(f.read().split()))

### Build model with classifier

In [7]:
# build classifier
# Look each term up in `vocab`, then learn a 10-dimensional embedding per term.
terms_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key='terms',
    vocabulary_list=vocab,
)
terms_embedding_column = tf.feature_column.embedding_column(terms_feature_column, dimension=10)
feature_columns = [terms_embedding_column]

# Adagrad, wrapped so gradients are clipped by norm at 5.0.
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, 5.0)

# An Estimator creates a fresh tf.Graph for every train/evaluate call and seeds
# it from its RunConfig, so the tf.set_random_seed(1) call made on the
# notebook's default graph never reaches the model or the input pipeline's
# shuffle. Pass the same seed through RunConfig to make runs repeatable.
run_config = tf.estimator.RunConfig(tf_random_seed=1)

classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 10],
    dropout=0.3,
    activation_fn=tf.nn.leaky_relu,
    optimizer=optimizer,
    config=run_config,
)

In [8]:
# Train on the training file for 5 epochs, using input_fn's defaults
# (batch_size=25, shuffle=True). The call's return value is the cell output.
classifier.train(input_fn=lambda: input_fn([train_path], num_epochs=5))

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fcf06c24b38>

### Evaluate classifier

In [9]:
# Evaluate on one pass over each split and wrap each metrics dict in a
# single-column frame (metric names form the index).
train_stats = pd.DataFrame.from_dict(
    classifier.evaluate(input_fn=lambda: input_fn([train_path], num_epochs=1)),
    orient='index',
    columns=['train'],
)
test_stats = pd.DataFrame.from_dict(
    classifier.evaluate(input_fn=lambda: input_fn([test_path], num_epochs=1)),
    orient='index',
    columns=['test'],
)

# Side-by-side train/test comparison, displayed as the cell output.
stats = train_stats.join(test_stats)
stats

Unnamed: 0,train,test
accuracy,0.9558,0.87512
accuracy_baseline,0.5,0.5
auc,0.986446,0.945513
auc_precision_recall,0.986157,0.943763
average_loss,0.138444,0.344261
label/mean,0.5,0.5
loss,3.461092,8.606529
precision,0.956787,0.88023
prediction/mean,0.494096,0.48858
recall,0.95472,0.8684
