In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import shutil
import sys

import tensorflow as tf  # pylint: disable=g-bad-import-order

# from official.utils.arg_parsers import parsers
# from official.utils.logs import hooks_helper


In [19]:
# String-valued input features, in the order they are zipped against the
# decoded CSV columns.
_fields = [
    'hits_hour',
    'geonetwork_city',
    'geonetwork_metro',
    'device_browser',
    'trafficsource_campaign',
    'trafficsource_source',
    'device_mobiledevicemodel',
    'device_operatingsystemversion',
    'hits_appinfo_appname',
    'hits_appinfo_appversion',
    'hits_appinfo_screenname',
    'hits_page_pagetitle',
    'hits_page_pagepath',
]

# Integer-valued columns that follow the string fields.
_metrics = [
    'pageviews',
    'screenviews',
    'scroll_50_count',
    'scroll_75_count',
    'scroll_100_count',
]

# Full column list: string fields first, then metrics.
_CSV_COLUMNS = _fields + _metrics

# Per-column defaults: empty string for fields, zero for metrics.
_fields_defaults = len(_fields) * [['']]
_metrics_defaults = len(_metrics) * [[0]]

# Scroll-depth threshold (as a string) -> class index.
_target_labels_idx = {
    str(threshold): index for index, threshold in enumerate((50, 75, 100))
}

# TODO: select a small dataset (the original note was left unfinished).
# The 'train' size is used as the shuffle buffer size by input_fn.
_NUM_EXAMPLES = dict(train=32561, validation=16281)

# Name-scope prefix of the loss tensors for the single-tower model types.
LOSS_PREFIX = dict(wide='linear/', deep='dnn/')

def build_model_columns():
    """Build the feature columns for the wide (linear) and deep (DNN) towers.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple of lists:

        * ``wide_columns`` - the two numeric columns, every hashed categorical
          column (each exactly once) and three crossed columns.
        * ``deep_columns`` - the two numeric columns plus 8-dimensional
          embeddings of a subset of the hashed categorical columns.
    """
    fc = tf.feature_column

    # Continuous columns.
    pageviews = fc.numeric_column('pageviews')
    screenviews = fc.numeric_column('screenviews')

    # Hashed categorical columns as (feature name, hash bucket size) pairs.
    hash_bucket_sizes = [
        ('hits_hour', 20),
        ('geonetwork_city', 5000),
        ('geonetwork_metro', 200),
        ('device_browser', 50),
        ('trafficsource_campaign', 3000),
        ('trafficsource_source', 50),
        ('device_mobiledevicemodel', 300),
        ('device_operatingsystemversion', 100),
        ('hits_appinfo_appname', 500),
        ('hits_appinfo_appversion', 25),
        ('hits_appinfo_screenname', 10000),
        ('hits_page_pagetitle', 15000),
        ('hits_page_pagepath', 8000),
    ]
    hashed = dict(
        (name, fc.categorical_column_with_hash_bucket(
            name, hash_bucket_size=size))
        for name, size in hash_bucket_sizes)

    # Wide part: every base column appears once. The previous list named an
    # undefined `hour_buckets` column and repeated `geonetwork_city`; the hour
    # is already covered by the hashed `hits_hour` column.
    base_columns = [pageviews, screenviews]
    base_columns += [hashed[name] for name, _ in hash_bucket_sizes]

    crossed_columns = [
        fc.crossed_column(
            ['hits_appinfo_appname', 'hits_appinfo_appversion'],
            hash_bucket_size=5000),
        fc.crossed_column(
            ['geonetwork_city', 'hits_page_pagepath'],
            hash_bucket_size=25000),
        fc.crossed_column(
            ['hits_hour', 'geonetwork_city'],
            hash_bucket_size=10000),
    ]

    wide_columns = base_columns + crossed_columns

    # Deep part: embeddings of the categorical columns. `device_language` was
    # referenced here but is not a defined column, so it is left out.
    # `hits_hour` and `hits_appinfo_screenname` were not embedded before and
    # still are not.
    embedded_features = [
        'geonetwork_metro',
        'geonetwork_city',
        'device_browser',
        'trafficsource_source',
        'trafficsource_campaign',
        'device_operatingsystemversion',
        'device_mobiledevicemodel',
        'hits_appinfo_appname',
        'hits_appinfo_appversion',
        'hits_page_pagetitle',
        'hits_page_pagepath',
    ]
    deep_columns = [pageviews, screenviews]
    deep_columns += [
        fc.embedding_column(hashed[name], dimension=8)
        for name in embedded_features
    ]

    return wide_columns, deep_columns


In [12]:
def build_estimator(model_dir, model_type, hidden_units=None, n_classes=2):
    """Build an estimator appropriate for the given model type.

    Args:
        model_dir: Directory handed to the estimator as its `model_dir`.
        model_type: 'wide' builds a `LinearClassifier`, 'deep' builds a
            `DNNClassifier`; any other value builds a
            `DNNLinearCombinedClassifier`.
        hidden_units: Optional list of layer sizes for the DNN part. Defaults
            to ``[20, 15, 10, 5]``.
        n_classes: Number of label classes forwarded to the estimator. The
            default of 2 keeps the previous (implicit) behaviour.
            NOTE(review): `input_fn` in this notebook emits depth-3 one-hot
            labels, so this likely needs to be 3 together with integer class
            labels - confirm against the training data.

    Returns:
        The configured `tf.estimator` classifier.
    """
    wide_columns, deep_columns = build_model_columns()
    if hidden_units is None:
        hidden_units = [20, 15, 10, 5]

    # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
    # trains faster than GPU for this model.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0}))

    if model_type == 'wide':
        return tf.estimator.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns,
            n_classes=n_classes,
            config=run_config)
    elif model_type == 'deep':
        return tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=hidden_units,
            n_classes=n_classes,
            config=run_config)
    else:
        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=hidden_units,
            n_classes=n_classes,
            config=run_config)

14

In [18]:
def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Generate an input dataset for the Estimator from a '|'-delimited file.

    Args:
        data_file: Path of the text file to read, one record per line.
        num_epochs: Number of times the dataset is repeated.
        shuffle: When true, records are shuffled (buffer of
            `_NUM_EXAMPLES['train']` lines) before parsing.
        batch_size: Number of records per batch.

    Returns:
        A `tf.data.Dataset` yielding `(features, labels)` batches, where
        `features` maps column name to tensor and `labels` is a depth-3
        one-hot encoding of the label column.
    """
    assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have run data_download.py and '
      'set the --data_dir argument to the correct path.' % data_file)

    # One default per entry of _CSV_COLUMNS (string fields first, then the
    # integer metrics). The name `_CSV_COLUMN_DEFAULTS` used here before is
    # not defined in this notebook.
    record_defaults = _fields_defaults + _metrics_defaults

    def parse_csv(value):
        """Decode one text line into a (features dict, one-hot label) pair."""
        print('Parsing', data_file)
        columns = tf.decode_csv(
            value, field_delim='|', record_defaults=record_defaults)
        features = dict(zip(_CSV_COLUMNS, columns))
        # NOTE(review): 'article_consumption_level' is not one of the names in
        # _CSV_COLUMNS as defined in this notebook, so this pop raises
        # KeyError until the column list (and its defaults) include the label
        # - confirm the data file schema.
        labels = features.pop('article_consumption_level')
        return features, tf.one_hot(indices=labels, depth=3)

    # Extract lines from input files using the Dataset API.
    dataset = tf.data.TextLineDataset(data_file)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])

    dataset = dataset.map(parse_csv, num_parallel_calls=3)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

In [None]:
# Run settings for this notebook.
batch_size = 64
model_dir = "./model_dir/"
model_type = 'wide+deep'
data_dir = './data/'

# The command-line parser used here before (`WideDeepArgParser`, fed from
# `argv`) is not defined or imported in this notebook. A plain namespace
# keeps `flags.<name>` lookups working. The epoch values are placeholders
# to tune; `stop_threshold=None` disables early stopping.
flags = argparse.Namespace(
    model_dir=model_dir,
    train_epochs=40,
    epochs_between_evals=2,
    stop_threshold=None,
)

# Clean up the model directory if present
shutil.rmtree(flags.model_dir, ignore_errors=True)

model = build_estimator(model_dir, model_type)

train_file = os.path.join(data_dir, 'scroll_traffic.train.small')
test_file = os.path.join(data_dir, 'scroll_traffic.test.small')

In [None]:

def train_input_fn():
    """Shuffled training input covering `flags.epochs_between_evals` epochs."""
    return input_fn(
        train_file, flags.epochs_between_evals, True, batch_size)


def eval_input_fn():
    """Single unshuffled pass over the test file."""
    return input_fn(test_file, 1, False, batch_size)


# Tensor names carry a tower prefix for the 'wide' and 'deep' model types and
# no prefix otherwise.
loss_prefix = LOSS_PREFIX.get(model_type, '')
tensors_to_log = {
    'average_loss': loss_prefix + 'head/truediv',
    'loss': loss_prefix + 'head/weighted_loss/Sum',
}

# `hooks_helper` is not imported in this notebook (its import is commented
# out), so the logging hook is created directly.
train_hooks = [
    tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100),
]

# Optional early-stop accuracy; absent or None means "never stop early".
stop_threshold = getattr(flags, 'stop_threshold', None)

# Train and evaluate the model every `flags.epochs_between_evals` epochs.
for n in range(flags.train_epochs // flags.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)
    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    print('Results at epoch', (n + 1) * flags.epochs_between_evals)
    print('-' * 60)

    for key in sorted(results):
        print('%s: %s' % (key, results[key]))

    # Inline replacement for the un-imported `model_helpers` stop check.
    if stop_threshold is not None and results['accuracy'] >= stop_threshold:
        break
