# Feature Crosses - Solutions

In [None]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf

# Only surface TensorFlow messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)
# Keep DataFrame printouts compact: at most 10 rows, floats with one decimal.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Load the California housing training data from the hosted CSV file.
california_housing_dataframe = pd.read_csv(
    "https://storage.googleapis.com/ml_universities/california_housing_train.csv",
    sep=",")

# Shuffle the rows by reindexing with a random permutation of the index.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))

In [None]:
def preprocess_features(california_housing_dataframe):
  """Builds the model's input features from the California housing data.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A new DataFrame holding the selected raw feature columns plus the
    synthetic `rooms_per_person` column. The input DataFrame is not modified.
  """
  raw_feature_names = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
  ]
  # Copy the selection so adding a column never touches the caller's frame.
  features = california_housing_dataframe[raw_feature_names].copy()
  # Synthetic feature: total rooms divided by population, row by row.
  rooms = california_housing_dataframe["total_rooms"]
  people = california_housing_dataframe["population"]
  features["rooms_per_person"] = rooms / people
  return features

def preprocess_targets(california_housing_dataframe):
  """Builds the label DataFrame from the California housing data.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame with a single `median_house_value` column, expressed in
    thousands of dollars.
  """
  # Dividing by 1000.0 rescales the target to units of thousands of dollars.
  value_in_thousands = (
    california_housing_dataframe["median_house_value"] / 1000.0)
  return pd.DataFrame({"median_house_value": value_in_thousands})

In [None]:
# The first 12000 rows of the shuffled data form the training split.
training_examples, training_targets = (
    preprocess_features(california_housing_dataframe.head(12000)),
    preprocess_targets(california_housing_dataframe.head(12000)))
# The last 5000 rows form the validation split.
validation_examples, validation_targets = (
    preprocess_features(california_housing_dataframe.tail(5000)),
    preprocess_targets(california_housing_dataframe.tail(5000)))

In [None]:
def input_function(examples_df, targets_df, single_read=False):
  """Turns a pair of examples/targets `DataFrame`s into `Tensor`s.

  Every column is converted to a float `Tensor` reshaped to `(N, 1)`, where `N`
  is the number of examples in the `DataFrame`s.

  Args:
    examples_df: A `DataFrame` that contains the input features. Each of its
      columns becomes one input feature `Tensor`.
    targets_df: A `DataFrame` whose first column holds the targets
      corresponding to each example in `examples_df`.
    single_read: A `bool`. When `True`, each feature `Tensor` is wrapped with
      `tf.train.limit_epochs(..., num_epochs=1)` so reading stops after one
      pass over the data; this stop mechanism is used by the estimator's
      `predict()` to limit the number of values it reads. When `False`, the
      feature `Tensor`s are returned as-is.
  Returns:
    A tuple `(input_features, target_tensor)`:
      input_features: A `dict` mapping each feature's column name to the
        `Tensor` holding that feature's values.
      target_tensor: A `Tensor` representing the target values.
  """
  def as_float_column(values):
    # Wrap the raw values in a constant, shape it to (N, 1), cast to float.
    return tf.to_float(tf.reshape(tf.constant(values), [-1, 1]))

  features = {}
  for column_name in examples_df.keys():
    column_tensor = as_float_column(examples_df[column_name].values)
    features[column_name] = (
        tf.train.limit_epochs(column_tensor, num_epochs=1)
        if single_read else column_tensor)

  # The targets always come from the first column of `targets_df`.
  target_column = targets_df.keys()[0]
  target_tensor = as_float_column(targets_df[target_column].values)

  return features, target_tensor

In [None]:
def train_model(
    learning_rate,
    steps,
    feature_columns,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a linear regression model on multiple features.

  In addition to training, this function prints training progress information
  and draws a plot of the training and validation loss over time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    feature_columns: A `set` specifying the input feature columns to use.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.

  Returns:
    A `LinearRegressor` object trained on the training data.
  """

  periods = 10
  # Floor division keeps the per-period step count an integer on Python 3 as
  # well, where `/` would produce a float.
  steps_per_period = steps // periods

  # Create a linear regressor object, optimized with FTRL and with gradients
  # clipped to a norm of 5.0.
  linear_regressor = tf.contrib.learn.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=tf.train.FtrlOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0
  )

  # Training loops over the data; the prediction variants pass
  # single_read=True so that predict() stops after one pass.
  training_input_function = lambda: input_function(
      training_examples, training_targets)
  training_input_function_for_predict = lambda: input_function(
      training_examples, training_targets, single_read=True)
  validation_input_function_for_predict = lambda: input_function(
      validation_examples, validation_targets, single_read=True)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(periods):
    # Train the model, starting from the prior state.
    linear_regressor.fit(
        input_fn=training_input_function,
        steps=steps_per_period
    )
    # Take a break and compute predictions.
    training_predictions = list(linear_regressor.predict(
        input_fn=training_input_function_for_predict))
    validation_predictions = list(linear_regressor.predict(
        input_fn=validation_input_function_for_predict))
    # Compute training and validation loss. Arguments follow scikit-learn's
    # documented (y_true, y_pred) order.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_targets, training_predictions))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_targets, validation_predictions))
    # Print the current training loss for this period.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our lists.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()

  return linear_regressor

In [None]:
# One real-valued (numeric) feature column per raw input feature, created in
# the order the names are listed.
(longitude,
 latitude,
 housing_median_age,
 households,
 median_income,
 rooms_per_person) = map(
    tf.contrib.layers.real_valued_column,
    ["longitude",
     "latitude",
     "housing_median_age",
     "households",
     "median_income",
     "rooms_per_person"])

### Solution to Task 1

You may be wondering how to choose the best number of buckets. That is, of course, data-dependent. Here, we simply picked arbitrary values that keep the model from growing too large.

In [None]:
def get_quantile_based_boundaries(feature_values, num_buckets):
  """Returns bucket boundaries placed at evenly spaced quantiles.

  Args:
    feature_values: A Pandas `Series` of feature values.
    num_buckets: The number of buckets to split the values into.
  Returns:
    A list of `num_buckets - 1` boundary values: the quantiles of
    `feature_values` at 1/num_buckets, 2/num_buckets, and so on.
  """
  # Fractions k/num_buckets for k = 1 .. num_buckets - 1.
  cut_fractions = np.arange(1.0, num_buckets) / num_buckets
  cut_points = feature_values.quantile(cut_fractions)
  return list(cut_points.values)

# Bucket boundaries are computed from the training split only, so that every
# feature is treated the same way and no validation rows influence the
# feature engineering.
bucketized_households = tf.contrib.layers.bucketized_column(
  households, boundaries=get_quantile_based_boundaries(
    training_examples["households"], 7))
bucketized_longitude = tf.contrib.layers.bucketized_column(
  longitude, boundaries=get_quantile_based_boundaries(
    training_examples["longitude"], 10))
bucketized_latitude = tf.contrib.layers.bucketized_column(
  latitude, boundaries=get_quantile_based_boundaries(
    training_examples["latitude"], 10))
bucketized_housing_median_age = tf.contrib.layers.bucketized_column(
  housing_median_age, boundaries=get_quantile_based_boundaries(
    training_examples["housing_median_age"], 7))
bucketized_median_income = tf.contrib.layers.bucketized_column(
  median_income, boundaries=get_quantile_based_boundaries(
    training_examples["median_income"], 7))
bucketized_rooms_per_person = tf.contrib.layers.bucketized_column(
  rooms_per_person, boundaries=get_quantile_based_boundaries(
    training_examples["rooms_per_person"], 7))

# The full set of bucketized columns fed to the linear model.
bucketized_feature_columns = {
  bucketized_longitude,
  bucketized_latitude,
  bucketized_housing_median_age,
  bucketized_households,
  bucketized_median_income,
  bucketized_rooms_per_person,
}

# Train on the bucketized features; the returned regressor is not needed here.
_ = train_model(
    learning_rate=1.5,
    steps=500,
    feature_columns=bucketized_feature_columns,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

### Solution to Task 2

We've added the longitude-latitude feature cross (`long_x_lat`) to the previous set of feature columns.

In [None]:
# Cross the bucketized longitude and latitude columns; the crossed values are
# hashed into 1000 buckets.
long_x_lat = tf.contrib.layers.crossed_column(
  {bucketized_longitude, bucketized_latitude},
  hash_bucket_size=1000)

# The Task 1 bucketized columns plus the new longitude x latitude cross.
feature_columns_with_cross = {
  long_x_lat,
  bucketized_longitude,
  bucketized_latitude,
  bucketized_housing_median_age,
  bucketized_households,
  bucketized_median_income,
  bucketized_rooms_per_person,
}

# Train with the cross included; the returned regressor is not needed here.
_ = train_model(
    learning_rate=3.0,
    steps=500,
    feature_columns=feature_columns_with_cross,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)