# Intro to Sparse Data and Embeddings - Solutions

In [None]:
!wget https://storage.googleapis.com/advanced-solutions-lab/mlcc/sparse_data_embedding/test.tfrecord -O /tmp/test.tfrecord
!wget https://storage.googleapis.com/advanced-solutions-lab/mlcc/sparse_data_embedding/train.tfrecord -O /tmp/train.tfrecord

## Solution to Task 3: Use an embedding with a DNN model

**NOTE:** *In practice, we might project to more than 2 dimensions, such as 50 or 100. But for now, 2 dimensions are easy to visualize.*

In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython import display
from sklearn import metrics

# Raise TensorFlow's logging threshold to ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

# Parsing spec for the tf.Examples: maps each feature name to the feature
# type that is used to decode it.
features_to_types_dict = {
    # A variable number of string values per example.
    "terms": tf.VarLenFeature(dtype=tf.string),
    # Exactly one float32 value per example.
    "labels": tf.FixedLenFeature(shape=[1], dtype=tf.float32),
}

# Create an input_fn that parses the tf.Examples from the given file pattern,
# and splits these into features and targets.
def _input_fn(input_file_pattern, batch_size=25):
  """Reads batches of parsed tf.Examples and separates out the labels.

  Args:
    input_file_pattern: File path or pattern of the TFRecord file(s) to read.
    batch_size: Number of examples per batch. Defaults to 25, the value that
      was previously hard-coded, so existing callers are unaffected.

  Returns:
    A `(features, targets)` tuple. `features` is the dict of parsed features
    with the "labels" entry removed; `targets` is that removed "labels" entry.
  """
  features = tf.contrib.learn.io.read_batch_features(
    file_pattern=input_file_pattern,
    batch_size=batch_size,
    features=features_to_types_dict,
    reader=tf.TFRecordReader)
  # pop() takes the labels out of the feature dict so that they are returned
  # only as the targets and not also as an input feature.
  targets = features.pop("labels")
  return features, targets


# Terms used as the vocabulary (`keys`) of the "terms" feature column.
# The order of the entries is kept exactly as-is.
informative_terms = [
    "bad", "great", "best", "worst",
    "fun", "beautiful", "excellent", "poor",
    "boring", "awful", "terrible", "definitely",
    "perfect", "liked", "worse", "waste",
    "entertaining", "loved", "unfortunately", "amazing",
    "enjoyed", "favorite", "horrible", "brilliant",
    "highly", "simple", "annoying", "today",
    "hilarious", "enjoyable", "dull", "fantastic",
    "poorly", "fails", "disappointing", "disappointment",
    "not", "him", "her", "good",
    "time", "?", ".", "!",
    "movie", "film", "action", "comedy",
    "drama", "family", "man", "woman",
    "boy", "girl",
]

# Create a sparse feature column from "terms" using an explicit vocabulary:
# `informative_terms` is passed as the column's keys. (This is a key lookup,
# not feature hashing.)
terms_feature_column = tf.contrib.layers.sparse_column_with_keys(column_name="terms",
                                                                 keys=informative_terms)

########################## SOLUTION CODE ########################################
# Wrap the sparse "terms" column in an embedding column with 2 dimensions.
terms_embedding_column = tf.contrib.layers.embedding_column(
    terms_feature_column,
    dimension=2)
feature_columns = [terms_embedding_column]

# DNN classifier over the embedding: two hidden layers of 10 units each,
# Adagrad with a learning rate of 0.1, and a gradient clip norm of 5.0.
classifier = tf.contrib.learn.DNNClassifier(
    hidden_units=[10, 10],
    feature_columns=feature_columns,
    gradient_clip_norm=5.0,
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.1))
#################################################################################

# Train the classifier on the training records for 1000 steps.
classifier.fit(
  input_fn=lambda: _input_fn("/tmp/train.tfrecord"),
  steps=1000)

# Evaluate on the same training records to see how well the model fits them.
evaluation_metrics = classifier.evaluate(
  input_fn=lambda: _input_fn("/tmp/train.tfrecord"),
  steps=1000)
# Each print(...) call below takes a single string argument, so it produces
# the same text whether `print` is the Python 2 statement or the Python 3
# function. Metric names are sorted so the output order is deterministic.
print("Training set metrics:")
for m in sorted(evaluation_metrics):
  print("%s %s" % (m, evaluation_metrics[m]))
print("---")

# Evaluate on the held-out test records.
evaluation_metrics = classifier.evaluate(
  input_fn=lambda: _input_fn("/tmp/test.tfrecord"),
  steps=1000)

print("Test set metrics:")
for m in sorted(evaluation_metrics):
  print("%s %s" % (m, evaluation_metrics[m]))
print("---")