<a href="https://colab.research.google.com/github/earlSagrada/machine_learning_notes/blob/master/embedding_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Embedding

In [11]:
from __future__ import print_function

# implements specialized container datatypes
import collections
import io
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%tensorflow_version 1.x
import tensorflow as tf
from IPython import display
from sklearn import metrics

tf.logging.set_verbosity(tf.logging.ERROR)
'''
in tensorflow2 we should use:
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
but one is suggested to use python logging module instead:
  import logging
  logging.getLogger("tensorflow").setLevel(logging.ERROR)
before importing tensorflow
'''
train_url = 'https://download.mlcc.google.cn/mledu-datasets/sparse-data-embedding/train.tfrecord'
# Downloads a file from a URL if it not already in the cache
# tf.keras.utils.get_file(fname, origin)
# [-1] the last but one item in the URL
# just a filename
train_path = tf.keras.utils.get_file(train_url.split('/')[-1], train_url)

test_url = 'https://download.mlcc.google.cn/mledu-datasets/sparse-data-embedding/test.tfrecord'
test_path = tf.keras.utils.get_file(test_url.split('/')[-1], test_url)


def _parse_function(record):
  """Extracts features and labels

  Args:
    record: File path to a TFRecord file
  Returns:
    A `tuple` `(labels, features)`:
      features: A dict of tensors representing the features
      labels: A tensor with the corresponding labels.
  """
  features = {
      "terms": tf.VarLenFeature(dtype=tf.string), # terms are strings of varying lengths
      "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32) # labels are 0 or 1
  }
  
  # parse 解析 ???
  parsed_features = tf.parse_single_example(record, features)

  terms = parsed_features['terms'].values
  labels = parsed_features['labels']

  return {'terms':terms}, labels

## Ensure the function works well
# Create the Dataset object
ds = tf.data.TFRecordDataset(train_path)
# Map features and labels with the parse function
# ds(dataset) --> _parse_function --> parsed ???
ds = ds.map(_parse_function)

ds

# Get the first sample from ds
# Use __iter__() in tensorflow2 ???
n = ds.make_one_shot_iterator().get_next()
# Create a Session object
# tf.compat.v1.Session
sess = tf.Session()
sess.run(n)


## Create an input_fn that parses the tf.Examples from the given files,
## and split them into features and targets
def _input_fn(input_filenames, num_epochs=None, shuffle=True):

  # Same code above; create a dataset and map features and labels
  ds = tf.data.TFRecordDataset(input_filenames)
  ds = ds.map(_parse_function)

  if shuffle:
    # Randomly shufles the items in the dataset
    ds = ds.shuffle(10000)
  
  # Our feature data is variable-length, so we pad 填补 and batch
  # each field of the dataset structure to whatever size is necessary
  ds = ds.padded_batch(25, ds.output_shapes) # batch_size, padded_shapes ???

  # Repeats this dataset so each original value is seen count times
  ds = ds.repeat(num_epochs)


  # Return the next batch of data
  features, labels = ds.make_one_shot_iterator().get_next()
  return features, labels


# 50 informative terms that compose our model vocabulary
informative_terms = ("bad", "great", "best", "worst", "fun", "beautiful",
                     "excellent", "poor", "boring", "awful", "terrible",
                     "definitely", "perfect", "liked", "worse", "waste",
                     "entertaining", "loved", "unfortunately", "amazing",
                     "enjoyed", "favorite", "horrible", "brilliant", "highly",
                     "simple", "annoying", "today", "hilarious", "enjoyable",
                     "dull", "fantastic", "poorly", "fails", "disappointing",
                     "disappointment", "not", "him", "her", "good", "time",
                     "?", ".", "!", "movie", "film", "action", "comedy",
                     "drama", "family")

# key: A unique string identifying the input feature
terms_feature_column = tf.feature_column.categorical_column_with_vocabulary_list(key="terms", vocabulary_list=informative_terms)


# Construct LinearClassifier
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)

feature_columns = [terms_feature_column]


classifier = tf.estimator.LinearClassifier(
    feature_columns=feature_columns,
    optimizer=my_optimizer,
)

classifier.train(
    input_fn=lambda: _input_fn([train_path]),
    steps=1000
)

evaluation_metrics = classifier.evaluate(
    input_fn=lambda: _input_fn([train_path]),
    steps=1000
)
print("Training set metrics:")
for m in evaluation_metrics:
  print(m, evaluation_metrics[m])
print("---")

evaluation_metrics = classifier.evaluate(
    input_fn=lambda: _input_fn([train_path]),
    steps=1000
)
print("Training set metrics:")
for m in evaluation_metrics:
  print(m, evaluation_metrics[m])
print("---")



## Use a DNN model
classifier_dnn = tf.estimator.DNNClassifier(
    feature_columns=[tf.feature_column.indicator_column(terms_feature_column)],
    hidden_units=[20,20],
    optimizer=my_optimizer,
)


try:
  classifier_dnn.train(
    input_fn=lambda: _input_fn([train_path]),
    steps=1000
  )
  
  evaluation_metrics = classifier_dnn.evaluate(
    input_fn=lambda: _input_fn([train_path]),
    steps=1
  )
  print("Training set metrics:")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")

  evaluation_metrics = classifier_dnn.evaluate(
    input_fn=lambda: _input_fn([train_path]),
    steps=1
  )
  print("Training set metrics:")
  for m in evaluation_metrics:
    print(m, evaluation_metrics[m])
  print("---")
except ValueError as err:
  print(err)








Training set metrics:
accuracy 0.78764
accuracy_baseline 0.5
auc 0.8718103
auc_precision_recall 0.86407375
average_loss 0.45212543
label/mean 0.5
loss 11.303136
precision 0.7827318
prediction/mean 0.48114967
recall 0.79632
global_step 1000
---
Training set metrics:
accuracy 0.78764
accuracy_baseline 0.5
auc 0.8718103
auc_precision_recall 0.86407375
average_loss 0.45212457
label/mean 0.5
loss 11.303114
precision 0.7827318
prediction/mean 0.48114967
recall 0.79632
global_step 1000
---
Training set metrics:
accuracy 0.84
accuracy_baseline 0.56
auc 0.9577922
auc_precision_recall 0.94508237
average_loss 0.32146832
label/mean 0.44
loss 8.036708
precision 0.7692308
prediction/mean 0.4295966
recall 0.90909094
global_step 1000
---
Training set metrics:
accuracy 0.76
accuracy_baseline 0.56
auc 0.82142854
auc_precision_recall 0.6763135
average_loss 0.51706225
label/mean 0.44
loss 12.926556
precision 0.6923077
prediction/mean 0.47098482
recall 0.8181818
global_step 1000
---
