In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Quiet TensorFlow's logger and keep pandas output compact.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)

# Fetch the California housing training set.
california_housing_dataframe = pd.read_csv(
    "https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv",
    sep=",",
)

# Put the rows in a random order (the original row labels are kept).
california_housing_dataframe = california_housing_dataframe.reindex(
    index=np.random.permutation(california_housing_dataframe.index))

# Scale the target column down by a factor of 1000.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0)
print(california_housing_dataframe)

# Synthetic feature: total rooms divided by population, row by row.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"].div(
        california_housing_dataframe["population"]))

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
10351     -120.1      36.0                 7.0       2049.0           482.0   
867       -117.1      33.1                19.0       2629.0           494.0   
13236     -121.9      36.6                42.0       2689.0           510.0   
3667      -117.9      33.8                26.0       2737.0           614.0   
6421      -118.3      33.9                24.0       9071.0          1335.0   
...          ...       ...                 ...          ...             ...   
9607      -119.4      36.5                28.0       2201.0           429.0   
76        -115.6      33.0                21.0       2164.0           480.0   
4979      -118.1      34.0                34.0       1723.0           279.0   
239       -116.5      33.8                26.0       5032.0          1229.0   
14943     -122.2      37.5                39.0       5264.0          1259.0   

       population  households  median_income  media

In [2]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Builds a tf.data input pipeline and returns the tensors for the next batch.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays, keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual (features, target) examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle individual examples *before* batching. Shuffling after
    # batch()/repeat() would only reorder whole batches, and the shuffle buffer
    # would then have to hold `buffer_size` batches instead of `buffer_size`
    # examples.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    # batch() sets how many examples are read at a time; repeat() starts over
    # from the beginning once the end of the data is reached.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Create the iterator; get_next() returns the tensors for the next batch.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [3]:
def train_model(learning_rate, steps, batch_size, input_feature):
  """
  Trains a single-feature linear regression model and reports its progress.

  Training is split into 10 periods. After each period the RMSE on the training
  data is printed and the currently learned line is drawn over a 300-row sample
  of `california_housing_dataframe`. After the last period an RMSE-per-period
  curve and summary statistics of predictions and targets are displayed.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch. It is split
      evenly over the periods with integer division, so it should be at least
      the number of periods.
    batch_size: A non-zero `int`, the batch size.
    input_feature: A `string` specifying a column from `california_housing_dataframe`
      to use as input feature.

  Returns:
    A Pandas `DataFrame` containing targets and the corresponding predictions done
    after training the model. Rows are paired by position: row i holds the
    prediction and the target of the i-th row of `california_housing_dataframe`.
  """

  periods = 10
  # Number of training steps per period. Integer division keeps the step count
  # passed to the estimator a whole number.
  steps_per_period = steps // periods

  my_feature = input_feature
  # Read the feature column (as a one-column DataFrame) and cast it to float32.
  my_feature_data = california_housing_dataframe[[my_feature]].astype('float32')
  my_label = "median_house_value"
  targets = california_housing_dataframe[my_label].astype('float32')

  # Create input functions. Wrapping the call in a lambda passes a function to
  # the estimator instead of calling my_input_fn right here.
  training_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(my_feature_data, targets, num_epochs=1,
                                                  shuffle=False)

  # Create feature columns: declares a numeric feature with this name.
  feature_columns = [tf.feature_column.numeric_column(my_feature)]

  # Create a linear regressor object.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  # Clip gradients by norm so gradient descent does not blow up.
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=my_optimizer
  )

  # Set up to plot the state of our model's line each period.
  plt.figure(figsize=(15, 6))
  plt.subplot(1, 2, 1)
  plt.title("Learned Line by Period")
  plt.ylabel(my_label)
  plt.xlabel(my_feature)
  sample = california_housing_dataframe.sample(n=300)
  plt.scatter(sample[my_feature], sample[my_label])
  # Colormaps take floats in [0, 1]; values below 0 are clamped to the first
  # colour, so sample the full [0, 1] range to give each period its own colour.
  colors = [cm.coolwarm(x) for x in np.linspace(0, 1, periods)]

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  root_mean_squared_errors = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute predictions.
    predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    predictions = np.array([item['predictions'][0] for item in predictions])

    # Compute loss. sklearn's signature is (y_true, y_pred).
    root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(targets, predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    root_mean_squared_errors.append(root_mean_squared_error)
    # Finally, track the weights and biases over time.
    # Apply some math to ensure that the data and line are plotted neatly.
    y_extents = np.array([0, sample[my_label].max()])

    weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % input_feature)[0]
    bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

    # Invert y = weight * x + bias to find the x range, clamp it to the range of
    # the sampled feature values, then recompute y for the clamped endpoints.
    x_extents = (y_extents - bias) / weight
    x_extents = np.maximum(np.minimum(x_extents,
                                      sample[my_feature].max()),
                           sample[my_feature].min())
    y_extents = weight * x_extents + bias
    plt.plot(x_extents, y_extents, color=colors[period])
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.subplot(1, 2, 2)
  plt.ylabel('RMSE')
  plt.xlabel('Periods')
  plt.title("Root Mean Squared Error vs. Periods")
  plt.plot(root_mean_squared_errors)
  # Lay out after everything is drawn so the final tick labels are accounted for.
  plt.tight_layout()

  # Create a table with calibration data.
  calibration_data = pd.DataFrame()
  calibration_data["predictions"] = pd.Series(predictions)
  # `targets` keeps the row labels of `california_housing_dataframe`, while
  # `predictions` is positional. Assigning the Series itself would align on
  # those labels and pair predictions with the wrong targets, so assign the raw
  # values instead.
  calibration_data["targets"] = pd.Series(targets.values)
  display.display(calibration_data.describe())

  print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)

  return calibration_data

In [4]:
# First run: rooms_per_person as the only input feature.
calibration_data = train_model(learning_rate=0.001, steps=100, batch_size=5000,
                               input_feature="rooms_per_person")

Training model...
RMSE (on training data):


  period 00 : 237.44


  period 01 : 237.34


  period 02 : 237.23


  period 03 : 237.13


  period 04 : 237.03


  period 05 : 236.93


  period 06 : 236.83


  period 07 : 236.72


  period 08 : 236.62


  period 09 : 236.52
Model training finished.


Unnamed: 0,predictions,targets
count,17000.0,17000.0
mean,1.1,207.3
std,0.5,116.0
min,0.2,15.0
25%,0.9,119.4
50%,1.1,180.4
75%,1.3,265.0
max,25.2,500.0


Final RMSE (on training data): 236.52


In [5]:
# Scatter the model's predictions against the true targets on one full-width
# panel, labelled so the figure can be read on its own.
plt.figure(figsize=(15, 6))
plt.scatter(calibration_data["predictions"], calibration_data["targets"])
plt.xlabel("predictions")
plt.ylabel("targets")
plt.title("Predictions vs. Targets")
plt.show()

In [6]:
# Distribution of the rooms_per_person feature. The histogram is drawn on the
# figure's default full-size axes rather than on the right half of a 1x2 grid
# whose left half would stay empty.
_ = california_housing_dataframe["rooms_per_person"].hist()
plt.xlabel("rooms_per_person")
plt.ylabel("count")
plt.title("Distribution of rooms_per_person")
plt.show()

In [7]:
# Cap rooms_per_person at 5 with a vectorized clip instead of calling a Python
# lambda once per row.
california_housing_dataframe["rooms_per_person"] = (
    california_housing_dataframe["rooms_per_person"].clip(upper=5))

# Distribution of the feature after capping.
_ = california_housing_dataframe["rooms_per_person"].hist()
plt.xlabel("rooms_per_person")
plt.ylabel("count")
plt.title("Distribution of rooms_per_person (capped at 5)")
plt.show()

In [8]:
# Second run on rooms_per_person, with a larger learning rate, more steps and
# small batches.
calibration_data = train_model(learning_rate=0.05, steps=500, batch_size=5,
                               input_feature="rooms_per_person")

Training model...
RMSE (on training data):


  period 00 : 212.82


  period 01 : 189.06


  period 02 : 166.74


  period 03 : 146.40


  period 04 : 130.46


  period 05 : 119.64


  period 06 : 113.30


  period 07 : 110.15


  period 08 : 108.83


  period 09 : 108.19
Model training finished.


Unnamed: 0,predictions,targets
count,17000.0,17000.0
mean,196.0,207.3
std,51.3,116.0
min,45.4,15.0
25%,163.2,119.4
50%,196.0,180.4
75%,223.9,265.0
max,435.6,500.0


Final RMSE (on training data): 108.19


In [9]:
# Scatter the predictions of the latest run against the true targets, labelled
# so the figure can be read on its own.
_ = plt.scatter(calibration_data["predictions"], calibration_data["targets"])
plt.xlabel("predictions")
plt.ylabel("targets")
plt.title("Predictions vs. Targets")
plt.show()