In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import shutil
import sys

import tensorflow as tf  # pylint: disable=g-bad-import-order

# from official.utils.arg_parsers import parsers
# from official.utils.logs import hooks_helper


In [2]:
_fields = [
 'trafficsource_referralpath',
 'geonetwork_city',
 'geonetwork_country',
 'geonetwork_region',
 'geonetwork_metro',
 'device_browser',
 'trafficsource_campaign',
 'trafficsource_source',
 'trafficsource_medium',
 'device_mobiledevicemodel',
 'device_operatingsystemversion',
 'hits_appinfo_appname',
 'hits_appinfo_appversion',
 'hits_appinfo_screenname',
 'hits_page_pagetitle',
 'hits_page_pagepath'
]

_metrics = [
    'pageviews',
    'screenviews'
]
_label = ['article_consumption_level']

_CSV_COLUMNS = _fields + _metrics + _label

_fields_defaults = [['']] * len(_fields)
_metrics_defaults = [[0]] * len(_metrics)

_CSV_COLUMN_DEFAULTS = _fields_defaults + _metrics_defaults + [[0]]

_target_labels_idx = {
    '50': 0,
    '75': 1,
    '100': 2
}

# TODO: Select small dataset for 
_NUM_EXAMPLES = {
    'train': 32561,
    'validation': 16281,
}


LOSS_PREFIX = {'wide': 'linear/', 'deep': 'dnn/'}

def build_model_columns():
    """
        Builds a set of wide and deep feature columns.
    """
    # Continuous columns

    pageviews = tf.feature_column.numeric_column('pageviews')
    screenviews = tf.feature_column.numeric_column('screenviews')

    geonetwork_country = tf.feature_column.categorical_column_with_hash_bucket(
        'geonetwork_country', hash_bucket_size=200)
    geonetwork_region = tf.feature_column.categorical_column_with_hash_bucket(
        'geonetwork_region', hash_bucket_size=500)
#     hits_hour = tf.feature_column.categorical_column_with_hash_bucket(
#         'hits_hour', hash_bucket_size=20
#     )
    trafficsource_referralpath = tf.feature_column.categorical_column_with_hash_bucket(
        'trafficsource_referralpath', hash_bucket_size=25000)
    geonetwork_city = tf.feature_column.categorical_column_with_hash_bucket(
        'geonetwork_city', hash_bucket_size=5000)
    geonetwork_metro = tf.feature_column.categorical_column_with_hash_bucket(
        'geonetwork_metro', hash_bucket_size=200)
    device_browser = tf.feature_column.categorical_column_with_hash_bucket(
        'device_browser', hash_bucket_size=200)
    trafficsource_campaign = tf.feature_column.categorical_column_with_hash_bucket(
        'trafficsource_campaign', hash_bucket_size=10000)
    trafficsource_source = tf.feature_column.categorical_column_with_hash_bucket(
        'trafficsource_source', hash_bucket_size=200)
    trafficsource_medium = tf.feature_column.categorical_column_with_hash_bucket(
        'trafficsource_medium', hash_bucket_size=300)
    device_mobiledevicemodel = tf.feature_column.categorical_column_with_hash_bucket(
        'device_mobiledevicemodel', hash_bucket_size=500)
    device_operatingsystemversion = tf.feature_column.categorical_column_with_hash_bucket(
        'device_operatingsystemversion', hash_bucket_size=100)
    hits_appinfo_appname = tf.feature_column.categorical_column_with_hash_bucket(
        'hits_appinfo_appname', hash_bucket_size=500)
    hits_appinfo_appversion = tf.feature_column.categorical_column_with_hash_bucket(
        'hits_appinfo_appversion', hash_bucket_size=25)
    hits_appinfo_screenname = tf.feature_column.categorical_column_with_hash_bucket(
        'hits_appinfo_screenname', hash_bucket_size=12000)
    hits_page_pagetitle = tf.feature_column.categorical_column_with_hash_bucket(
        'hits_page_pagetitle', hash_bucket_size=18000)
    hits_page_pagepath = tf.feature_column.categorical_column_with_hash_bucket(
        'hits_page_pagepath', hash_bucket_size=15000)

    # Wide columns and deep columns.
    base_columns = [
        pageviews, screenviews, geonetwork_region, geonetwork_city, geonetwork_metro, geonetwork_country, 
        device_browser, trafficsource_campaign, trafficsource_source, trafficsource_medium, device_mobiledevicemodel,
        device_operatingsystemversion, hits_appinfo_appname, hits_appinfo_appversion, hits_appinfo_screenname,
        hits_page_pagetitle, hits_page_pagepath, trafficsource_referralpath
    ]

    crossed_columns = [
      tf.feature_column.crossed_column(
          ['hits_appinfo_appname', 'hits_appinfo_appversion'], hash_bucket_size=5000),
      tf.feature_column.crossed_column(
          ['geonetwork_city', 'hits_page_pagepath'], hash_bucket_size=25000),
      tf.feature_column.crossed_column(
          ['geonetwork_city', 'geonetwork_metro'], hash_bucket_size=25000)
    ]

    wide_columns = base_columns + crossed_columns

    deep_columns = [
        pageviews,
        screenviews,
        tf.feature_column.embedding_column(geonetwork_metro, dimension=8),
        tf.feature_column.embedding_column(geonetwork_city, dimension=8),
        tf.feature_column.embedding_column(geonetwork_country, dimension=8),
        tf.feature_column.embedding_column(device_browser, dimension=8),
        tf.feature_column.embedding_column(trafficsource_source, dimension=8),
        tf.feature_column.embedding_column(trafficsource_campaign, dimension=8),
        tf.feature_column.embedding_column(device_operatingsystemversion, dimension=8),
        tf.feature_column.embedding_column(device_mobiledevicemodel, dimension=8),
        tf.feature_column.embedding_column(hits_appinfo_appname, dimension=8),
        tf.feature_column.embedding_column(hits_appinfo_appversion, dimension=8),
        tf.feature_column.embedding_column(hits_page_pagetitle, dimension=8),
        tf.feature_column.embedding_column(hits_page_pagepath, dimension=8),
        tf.feature_column.embedding_column(trafficsource_referralpath, dimension=8),
        tf.feature_column.embedding_column(trafficsource_medium, dimension=8)
    ]

    return wide_columns, deep_columns


In [3]:
def build_estimator(model_dir, model_type):
  """Build an estimator appropriate for the given model type."""
  wide_columns, deep_columns = build_model_columns()
  hidden_units = [20, 15, 10, 5]

  # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
  # trains faster than GPU for this model.
  run_config = tf.estimator.RunConfig().replace(
      session_config=tf.ConfigProto(device_count={'GPU': 0}))

  if model_type == 'wide':
    print('wide')
    return tf.estimator.LinearClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=wide_columns,
        config=run_config)
  elif model_type == 'deep':
    print('deep')
    return tf.estimator.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=deep_columns,
        hidden_units=hidden_units,
        config=run_config)
  else:
    print ("wide + deep")
    return tf.estimator.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        n_classes=3,
        linear_optimizer=tf.train.FtrlOptimizer(
            learning_rate=0.01,
            l1_regularization_strength=0.0001,
            l2_regularization_strength=0.01),
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units,
        config=run_config)

In [4]:
def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Generate an input function for the Estimator."""
    assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have run data_download.py and '
      'set the --data_dir argument to the correct path.' % data_file)

    def parse_csv(value):
        print('Parsing', data_file)
        columns = tf.decode_csv(value, field_delim='|', record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))
        labels = features.pop('article_consumption_level')
        return features, labels

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=4)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

In [5]:
batch_size = 64
model_dir = "../model_dir/"
model_type = 'wide+deep'
# model_type = 'wide'
# model_type = 'deep'
data_dir = '../data/'

train_epochs = 20
epochs_between_evals = 10

# Clean up the model directory if present
shutil.rmtree(model_dir, ignore_errors=True)

model = build_estimator(model_dir, model_type)

train_file = os.path.join(data_dir, 'scroll_traffic.train.small')
validate_file = os.path.join(data_dir, 'scroll_traffic.valid.small')
test_file = os.path.join(data_dir, 'scroll_traffic.test.small')

wide + deep
INFO:tensorflow:Using config: {'_model_dir': '../model_dir/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': device_count {
  key: "GPU"
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11e574358>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [6]:

# Train and evaluate the model every `flags.epochs_between_evals` epochs.
def train_input_fn():
    return input_fn(
        train_file, epochs_between_evals, True, batch_size)

def eval_input_fn():
    return input_fn(validate_file, 1, False, batch_size)

def test_input_fn():
    return input_fn(test_file, 1, False, batch_size)

loss_prefix = LOSS_PREFIX.get(model_type, '')
# train_hooks = hooks_helper.get_train_hooks(
#   hooks, batch_size=batch_size,
tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
                  'loss': loss_prefix + 'head/weighted_loss/Sum'}
print(tensors_to_log)

# Train and evaluate the model every `flags.epochs_between_evals` epochs.
for n in range(train_epochs // epochs_between_evals):
    model.train(input_fn=train_input_fn)
    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    print('Results at epoch', (n + 1) * epochs_between_evals)
    print('-' * 60)

    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

#     if model_helpers.past_stop_threshold(
#         stop_threshold, results['accuracy']):
#         break

# pred_and_probs = list(model.predict(input_fn=test_input_fn))

{'average_loss': 'head/truediv', 'loss': 'head/weighted_loss/Sum'}
Parsing ../data/scroll_traffic.train.small
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ../model_dir/model.ckpt.
INFO:tensorflow:loss = 82.11897, step = 1
INFO:tensorflow:global_step/sec: 66.6932
INFO:tensorflow:loss = 48.97396, step = 101 (1.500 sec)
INFO:tensorflow:global_step/sec: 173.347
INFO:tensorflow:loss = 39.52893, step = 201 (0.577 sec)
INFO:tensorflow:global_step/sec: 173.51
INFO:tensorflow:loss = 36.83058, step = 301 (0.576 sec)
INFO:tensorflow:global_step/sec: 112.923
INFO:tensorflow:loss = 41.057983, step = 401 (0.886 sec)
INFO:tensorflow:global_step/sec: 124.125
INFO:tensorflow:loss = 42.43494, step = 501 (0.806 sec)
INFO:tensorflow:global_step/sec: 129
INFO:tensorflow:lo

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-01-14:40:03
INFO:tensorflow:Saving dict for global step 6250: accuracy = 0.7879, average_loss = 0.6271991, global_step = 6250, loss = 39.94899
Results at epoch 20
------------------------------------------------------------
accuracy: 0.7879
average_loss: 0.6271991
global_step: 6250
loss: 39.94899


In [263]:
preds = list(model.predict(input_fn=test_input_fn))

Parsing ../data/scroll.test.small
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../model_dir/model.ckpt-10932
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [264]:
import numpy as np
pred_classes = [x['class_ids'] for x in preds]
pred_probs = [x['probabilities'] for x in preds]
perc_not_2 = sum(np.array(pred_classes) != 2)
perc_not_2/len(pred_classes)

array([0.35904847])

In [250]:
# Definitely the class imbalance is causing the model to predict class 2 in most cases than the other classes
# This can be mitigated with a more balanced training and testing set
len(pred_probs)

8996

In [251]:
np.array(pred_probs)[0:4]

array([[0.32171628, 0.30503032, 0.37325343],
       [0.14107876, 0.27234718, 0.5865741 ],
       [0.46001828, 0.3357787 , 0.20420308],
       [0.13506429, 0.18011568, 0.68482006]], dtype=float32)

In [266]:
from sklearn.metrics import classification_report, confusion_matrix
def get_pred(arr, cutoff=.7):
    if arr[-1] < cutoff:
        return np.argmax(arr[:-1])
    else:
        return np.argmax(arr)
df = pd.read_table("../data/scroll.test.small", sep='|', header=None, names=_CSV_COLUMNS)
df.shape

(8996, 19)

In [253]:
my_preds = [get_pred(x) for x in pred_probs]
len(my_preds)

8996

In [265]:
print(classification_report(np.array(df[_CSV_COLUMNS[-1]]), np.array(pred_classes)))

             precision    recall  f1-score   support

          0       0.34      0.28      0.31      2000
          1       0.34      0.27      0.30      1996
          2       0.66      0.76      0.70      5000

avg / total       0.52      0.54      0.53      8996



In [272]:
print(classification_report(np.array(df[_CSV_COLUMNS[-1]]), np.array(my_preds)))

             precision    recall  f1-score   support

          0       0.26      0.56      0.35      2000
          1       0.26      0.51      0.34      1996
          2       0.92      0.14      0.25      5000

avg / total       0.63      0.32      0.29      8996



In [271]:
print(confusion_matrix(np.array(df[_CSV_COLUMNS[-1]]), np.array(my_preds)))

[[1112  859   29]
 [ 953 1013   30]
 [2263 2016  721]]
