In [1]:
import tensorflow as tf

In [3]:
C_COLUMNS = ['I' + str(i) for i in range(1, 14)]
D_COLUMNS = ['C' + str(i) for i in range(14, 40)]
LABEL_COLUMN = 'is_click'
CSV_COLUMNS = [LABEL_COLUMN] + C_COLUMNS + D_COLUMNS

In [5]:
CSV_COLUMN_DEFAULTS = [[0.0]]

In [7]:
C_COLUMN_DEFAULTS = [[0.0] for i in range(13)]
D_COLUMN_DEFAULTS = [[0] for i in range(26)]
CSV_COLUMN_DEFAULTS = CSV_COLUMN_DEFAULTS + C_COLUMN_DEFAULTS + D_COLUMN_DEFAULTS

In [70]:
def input_fn(filenames, num_epochs, batch_size=1):
    def parse_csv(line):
        print('Parsing', filenames)
        columns = tf.decode_csv(line, record_defaults=CSV_COLUMN_DEFAULTS)
        features = dict(zip(CSV_COLUMNS, columns))
        labels = features.pop(LABEL_COLUMN)
        return features, labels
    dataset = tf.data.TextLineDataset(filenames)
    dataset = dataset.map(parse_csv, num_parallel_calls=10).prefetch(500000)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels

In [71]:
features, labels = input_fn('sample.csv', 1, 1)

Parsing sample.csv


In [72]:
sess = tf.Session()
sess.run(features)

{'I1': array([0.1], dtype=float32),
 'I2': array([0.003322], dtype=float32),
 'I3': array([0.44], dtype=float32),
 'I4': array([0.02], dtype=float32),
 'I5': array([0.001594], dtype=float32),
 'I6': array([0.016], dtype=float32),
 'I7': array([0.02], dtype=float32),
 'I8': array([0.04], dtype=float32),
 'I9': array([0.008], dtype=float32),
 'I10': array([0.166667], dtype=float32),
 'I11': array([0.1], dtype=float32),
 'I12': array([0.], dtype=float32),
 'I13': array([0.08], dtype=float32),
 'C14': array([15], dtype=int32),
 'C15': array([56], dtype=int32),
 'C16': array([137], dtype=int32),
 'C17': array([167], dtype=int32),
 'C18': array([181], dtype=int32),
 'C19': array([196], dtype=int32),
 'C20': array([200], dtype=int32),
 'C21': array([258], dtype=int32),
 'C22': array([275], dtype=int32),
 'C23': array([277], dtype=int32),
 'C24': array([307], dtype=int32),
 'C25': array([408], dtype=int32),
 'C26': array([417], dtype=int32),
 'C27': array([508], dtype=int32),
 'C28': array([60

In [17]:
import pandas as pd
df = pd.read_table('dac_sample.txt', sep='\t', header=None)

In [37]:
df.to_csv('sample.csv', header=0, index=0)

In [29]:
len(D_COLUMN_DEFAULTS)

26

In [31]:
df2 = pd.read_csv('sample.csv')

In [36]:
f = open('sample.csv', 'r')
lines = f.readlines()
line = lines[1]
line

'0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,1.0,2.0,,2.0,68fd1e64,80e26c9b,fb936136,7b4723c4,25c83c98,7e0ccccf,de7995b8,1f89b562,a73ee510,a8cd5504,b2cb9c98,37c9c164,2824a5f6,1adce6ef,8ba8b39a,891b62e7,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16\n'

In [35]:
df.loc[0, :]

0            0
1            1
2            1
3            5
4            0
5         1382
6            4
7           15
8            2
9          181
10           1
11           2
12         NaN
13           2
14    68fd1e64
15    80e26c9b
16    fb936136
17    7b4723c4
18    25c83c98
19    7e0ccccf
20    de7995b8
21    1f89b562
22    a73ee510
23    a8cd5504
24    b2cb9c98
25    37c9c164
26    2824a5f6
27    1adce6ef
28    8ba8b39a
29    891b62e7
30    e5ba7672
31    f54016b9
32    21ddcdc9
33    b1252a9d
34    07b5194c
35         NaN
36    3a171ecb
37    c5c50484
38    e8b83407
39    9727dd16
Name: 0, dtype: object

In [41]:
import sklearn
from sklearn.datasets import load_svmlight_file

In [42]:
data = load_svmlight_file("va.libsvm")

ValueError: Feature indices in SVMlight/LibSVM data file should be sorted and unique.

In [49]:
def input_fn(filenames, batch_size=1, num_epochs=1, shuffle=False):
    def decode_libsvm(line):
        columns = tf.string_split([line], ' ')
        labels = tf.string_to_number(columns.values[0], out_type=tf.float32)
        splits = tf.string_split(columns.values[1:], ':')
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        feat_ids, feat_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(feat_ids, out_type=tf.float32)
        feat_vals = tf.string_to_number(feat_vals, out_type=tf.float32)
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, labels
    ds = tf.data.TextLineDataset(filenames).map(decode_libsvm, num_parallel_calls=10).prefetch(50000)
    if shuffle:
        ds = ds.shuffle(256)
    ds = ds.repeat(num_epochs).batch(batch_size)
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [57]:
features, labels = input_fn('va.libsvm', 1, 1)
sess = tf.Session()
sess.run(features)

{'feat_ids': array([[[  1.],
         [  2.],
         [  3.],
         [  4.],
         [  5.],
         [  6.],
         [  7.],
         [  8.],
         [  9.],
         [ 10.],
         [ 11.],
         [ 12.],
         [ 13.],
         [ 15.],
         [ 56.],
         [137.],
         [167.],
         [181.],
         [196.],
         [200.],
         [258.],
         [275.],
         [277.],
         [307.],
         [408.],
         [417.],
         [508.],
         [609.],
         [639.],
         [652.],
         [705.],
         [758.],
         [780.],
         [806.],
         [815.],
         [823.],
         [860.],
         [875.],
         [908.]]], dtype=float32), 'feat_vals': array([[[0.1     ],
         [0.003322],
         [0.44    ],
         [0.02    ],
         [0.001594],
         [0.016   ],
         [0.02    ],
         [0.04    ],
         [0.008   ],
         [0.166667],
         [0.1     ],
         [0.      ],
         [0.08    ],
         [1.      ],
 

In [53]:
df2 = pd.read_table('va.libsvm', sep=' ', header=None)

In [60]:
for i in range(2, 14):
    df2[i] = df2[i].map(lambda x: x.split(':')[1])

In [65]:
for i in range(14, 40):
    df2[i] = df2[i].map(lambda x: x.split(':')[0])

In [69]:
df2.to_csv('sample.csv', header=0, index=0)

In [73]:
def build_feature():
    deep_cbc = [tf.feature_column.numeric_column(colname) for colname in C_COLUMNS]
    deep_dbc = [tf.feature_column.categorical_column_with_identity(key=colname, num_buckets=10000, default_value=0) for colname in D_COLUMNS]
    deep_emb = [tf.feature_column.embedding_column(c, dimension=100)  for c in deep_dbc]
    wide_columns = deep_cbc + deep_dbc
    deep_columns = deep_cbc + deep_emb
    
    return wide_columns, deep_columns

In [74]:
def build_estimator(wide_columns, deep_columns):
    hidden_units = [128, 64, 32]
    estimator = tf.estimator.LinearClassifier(
        feature_columns=wide_columns)
    return estimator

In [75]:
wide_columns, deep_columns = build_feature()

In [78]:
w_n_d = build_estimator(wide_columns, deep_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x116eec438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [79]:
tf.logging.set_verbosity(tf.logging.INFO)

In [80]:
train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn("sample.csv", 1, batch_size=5))

In [82]:
eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn("sample.csv", 1, batch_size=1))

In [83]:
tf.estimator.train_and_evaluate(w_n_d, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
Parsing sample.csv
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 3.465736, step = 1
INFO:tensorflow:global_step/sec: 50.6594
INFO:tensorflow:loss = 3.4053702, step = 101 (1.975 sec)
INFO:tensorflow:global_step/sec: 231.267
INFO:tensorflow:loss = 2.3031926, step = 201 (0.432 sec)
INFO:tensorflow:global_step/sec: 236.365
INFO:tensorflow:loss = 0.28403622, step = 301 (0.423 sec)
INFO:tensorflow:global_step/sec: 229.551
INFO:tensorflow:loss = 3.3583543, 

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt-4016
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 4017 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 1.6452619, step = 4017
INFO:tensorflow:global_step/sec: 48.9277
INFO:tensorflow:loss = 0.40575135, step = 4117 (2.045 sec)
INFO:tensorflow:global_step/sec: 227.699
INFO:tensorflow:loss = 1.6225551, step = 4217 (0.439 sec)
INFO:tensorflow:global_step/sec: 226.413
INFO:tensorflow:loss = 0.38692692, step = 4317 (0.442 sec)
INFO:tensorflow:global_step/sec: 218.863
INFO:tensorflow:loss = 2.5291603, step = 4417 (0.457 sec)
INFO:tensorflow:global_step/sec: 225.74
INFO:tensorflow:loss = 3.1674142, step = 4517 (0.443 sec)
INFO:tensorflo

INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt-8032
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 8033 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 1.6390296, step = 8033
INFO:tensorflow:global_step/sec: 48.644
INFO:tensorflow:loss = 0.34608966, step = 8133 (2.058 sec)
INFO:tensorflow:global_step/sec: 228.201
INFO:tensorflow:loss = 1.6622076, step = 8233 (0.437 sec)
INFO:tensorflow:global_step/sec: 224.195
INFO:tensorflow:loss = 0.41064298, step = 8333 (0.446 sec)
INFO:tensorflow:global_step/sec: 226.129
INFO:tensorflow:loss = 2.5397441, step = 8433 (0.442 sec)
INFO:tensorflow:global_step/sec: 223.797
INFO:tensorflow:loss = 3.1091402, step = 8533 (0.447 sec)
INFO:tensorflow:global_step/sec: 225.093
INFO:tensorflow:loss = 0.81694734, step = 8633 (0.444 se

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 12049 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 1.6320872, step = 12049
INFO:tensorflow:global_step/sec: 45.6183
INFO:tensorflow:loss = 0.322441, step = 12149 (2.194 sec)
INFO:tensorflow:global_step/sec: 200.238
INFO:tensorflow:loss = 1.6852139, step = 12249 (0.499 sec)
INFO:tensorflow:global_step/sec: 223.021
INFO:tensorflow:loss = 0.42248756, step = 12349 (0.449 sec)
INFO:tensorflow:global_step/sec: 189.146
INFO:tensorflow:loss = 2.548187, step = 12449 (0.528 sec)
INFO:tensorflow:global_step/sec: 152.844
INFO:tensorflow:loss = 3.0808227, step = 12549 (0.654 sec)
INFO:tensorflow:global_step/sec: 189.008
INFO:tensorflow:loss = 0.791618, step = 12649 (0.530 sec)
INFO:tensorflow:global_step/sec: 209.605
INFO:tensorflow:loss = 2.386805, step = 12749 (0.476 sec)
INFO:tensorflow:global_step/sec: 185.507
INFO:tens

INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 16065 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 1.6269758, step = 16065
INFO:tensorflow:global_step/sec: 49.8483
INFO:tensorflow:loss = 0.3095515, step = 16165 (2.008 sec)
INFO:tensorflow:global_step/sec: 221.268
INFO:tensorflow:loss = 1.7002603, step = 16265 (0.452 sec)
INFO:tensorflow:global_step/sec: 226.618
INFO:tensorflow:loss = 0.4297736, step = 16365 (0.441 sec)
INFO:tensorflow:global_step/sec: 218.425
INFO:tensorflow:loss = 2.5535433, step = 16465 (0.458 sec)
INFO:tensorflow:global_step/sec: 223.755
INFO:tensorflow:loss = 3.0643828, step = 16565 (0.447 sec)
INFO:tensorflow:global_step/sec: 220.6
INFO:tensorflow:loss = 0.77356505, step = 16665 (0.453 sec)
INFO:tensorflow:global_step/sec: 226.028
INFO:tensorflow:loss = 2.3324847, step = 16765 (0.442 sec)
INFO:tensorflow:global_step/sec: 228.074
INFO:tensorflow:loss = 3.8232236, step = 16865

INFO:tensorflow:Saving checkpoints for 20081 into /var/folders/0q/hhpzjr8j67l6c15rvglzlh7c0000gn/T/tmple6ntk0x/model.ckpt.
INFO:tensorflow:loss = 1.6232915, step = 20081
INFO:tensorflow:global_step/sec: 50.7498
INFO:tensorflow:loss = 0.30144024, step = 20181 (1.972 sec)
INFO:tensorflow:global_step/sec: 212.57
INFO:tensorflow:loss = 1.7111244, step = 20281 (0.471 sec)
INFO:tensorflow:global_step/sec: 225.638
INFO:tensorflow:loss = 0.43469983, step = 20381 (0.442 sec)
INFO:tensorflow:global_step/sec: 223.197
INFO:tensorflow:loss = 2.5572195, step = 20481 (0.448 sec)
INFO:tensorflow:global_step/sec: 228.765
INFO:tensorflow:loss = 3.0543625, step = 20581 (0.437 sec)
INFO:tensorflow:global_step/sec: 228.06
INFO:tensorflow:loss = 0.7597748, step = 20681 (0.439 sec)
INFO:tensorflow:global_step/sec: 227.058
INFO:tensorflow:loss = 2.2988083, step = 20781 (0.440 sec)
INFO:tensorflow:global_step/sec: 226.865
INFO:tensorflow:loss = 3.8425794, step = 20881 (0.441 sec)
INFO:tensorflow:global_step/se

INFO:tensorflow:loss = 1.6206304, step = 24097
INFO:tensorflow:global_step/sec: 51.0574
INFO:tensorflow:loss = 0.29588506, step = 24197 (1.961 sec)
INFO:tensorflow:global_step/sec: 231.178
INFO:tensorflow:loss = 1.7195324, step = 24297 (0.432 sec)
INFO:tensorflow:global_step/sec: 239.658
INFO:tensorflow:loss = 0.43822888, step = 24397 (0.417 sec)
INFO:tensorflow:global_step/sec: 228.343
INFO:tensorflow:loss = 2.5599532, step = 24497 (0.438 sec)
INFO:tensorflow:global_step/sec: 227.157
INFO:tensorflow:loss = 3.048283, step = 24597 (0.440 sec)
INFO:tensorflow:global_step/sec: 229.815
INFO:tensorflow:loss = 0.74880904, step = 24697 (0.435 sec)
INFO:tensorflow:global_step/sec: 226.914
INFO:tensorflow:loss = 2.2759712, step = 24797 (0.440 sec)
INFO:tensorflow:global_step/sec: 229.649
INFO:tensorflow:loss = 3.8570921, step = 24897 (0.436 sec)
INFO:tensorflow:global_step/sec: 234.813
INFO:tensorflow:loss = 3.9966774, step = 24997 (0.426 sec)
INFO:tensorflow:global_step/sec: 244.916
INFO:tenso

KeyboardInterrupt: 