In [186]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
#텐서플로우의 로그는 ( DEBUG, INFO, WARN, ERROR, FATAL ) 5개의 타입으로 이루어 져있는데 왼쪽으로 갈 수록 상위이다. 
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format #pandas 기본 설정 해주고 있다. 

In [187]:
california_housing_dataframe = pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv", sep=",")

In [188]:
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))
#permutation 무작위로 데이터를 바꾸어 준다.   
#reindex는 제일 왼쪽줄의 인덱스를 바꾸어준다. 
california_housing_dataframe["median_house_value"] /= 1000.0
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
8172,-118.4,34.0,16.0,14891.0,3984.0,6270.0,3595.0,5.1,283.2
11571,-121.3,38.8,7.0,9003.0,1739.0,4445.0,1591.0,3.8,147.9
9180,-119.0,36.1,26.0,2185.0,435.0,1108.0,419.0,2.2,78.0
7806,-118.4,34.0,47.0,1621.0,314.0,724.0,311.0,5.8,474.1
2126,-117.3,33.2,23.0,3297.0,728.0,1793.0,622.0,2.6,169.7
...,...,...,...,...,...,...,...,...,...
1103,-117.1,32.7,52.0,280.0,71.0,217.0,71.0,1.5,83.3
13557,-122.0,37.5,31.0,1949.0,344.0,986.0,322.0,4.6,196.2
6669,-118.3,34.1,49.0,2850.0,379.0,1113.0,380.0,13.0,500.0
14690,-122.2,37.7,35.0,504.0,126.0,323.0,109.0,1.8,90.5


In [189]:
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


In [190]:
my_feature = california_housing_dataframe[["total_rooms"]]#[[]]는 객체를 반환해주고 []는 데이터들을 반환해줌 
feature_columns = [tf.feature_column.numeric_column("total_rooms")]#직접적으로 접근하지 하지않고 이러한 데이터들이 있습니다라고 알려줌 
print(feature_columns)

In [191]:
targets = california_housing_dataframe["median_house_value"]

In [192]:
# Use gradient descent as the optimizer for training the model.
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00002) #학습률 0.0000001
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# clip_norm: A 0-D (scalar) Tensor > 0. The clipping ratio.
# Gradient Clipping (그래디언트 클리핑) 
# RNN 같은 깊은 뉴럴 네트워크에서 폭팔하는 것을 방지하는 기법  
# L2 norm이 초과할 때 (threshold/L2 norm)을 곱해주는 것이다. 

# Configure the linear regression model with our feature columns and optimizer.
# Set a learning rate of 0.0000001 for Gradient Descent.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer
)

In [193]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    # 기본값인 num_epochs=None을 repeat()에 전달하면 입력 데이터가 무한정 반복됩니다.
    """Trains a linear regression model of one feature.
  
    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
  
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}                                           
 
    # Construct a dataset, and configure batching/repeating
    #데이터 셋을 만드는 함수이다. 
    ds = Dataset.from_tensor_slices((features, targets)) # warning: 2GB limit
    ds = ds.batch(10).repeat(num_epochs) # num_epochs가 false면 계속 반복 
    #데이터를 읽어올 개수를 지정하는 batch_size
    # Shuffle the data, if specified
    if shuffle:
      ds = ds.shuffle(buffer_size=10000)
    
    # Return the next batch of data
    features, labels = ds.make_one_shot_iterator().get_next() 
    # 다음 항목에 연결되어 있는 tf.Tensor 객체를 리턴합니다.
    # 데이터 셋에 있는 열거가능한 요소들을 이터레이터로 만든다. 
    return features, labels

In [194]:
_ = linear_regressor.train(
    input_fn = lambda:my_input_fn(my_feature, targets),
    steps=1000
)


In [195]:
# Create an input function for predictions.
# Note: Since we're making just one prediction for each example, we don't 
# need to repeat or shuffle the data here.
prediction_input_fn =lambda : my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the linear_regressor to make predictions.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Print Mean Squared Error and Root Mean Squared Error.
mean_squared_error = metrics.mean_squared_error(predictions, targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error (on training data): %0.3f" % mean_squared_error) # MSE
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error) 

Mean Squared Error (on training data): 28809.962
Root Mean Squared Error (on training data): 169.735


In [196]:
min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value

print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)
print("Root Mean Squared Error: %0.3f" % root_mean_squared_error)  # RMSE

Min. Median House Value: 14.999
Max. Median House Value: 500.001
Difference between Min. and Max.: 485.002
Root Mean Squared Error: 169.735


In [197]:
calibration_data = pd.DataFrame()
calibration_data["predictions"] = pd.Series(predictions)
calibration_data["targets"] = pd.Series(targets)
calibration_data

Unnamed: 0,predictions,targets
0,884.5,66.9
1,534.8,80.1
2,129.8,85.7
3,96.3,73.4
4,195.8,65.5
...,...,...
16995,16.6,111.4
16996,115.8,79.0
16997,169.3,103.6
16998,29.9,85.8


In [198]:
sample = california_housing_dataframe.sample(n=300)

In [199]:
# Get the min and max total_rooms values.
x_0 = sample["total_rooms"].min()
x_1 = sample["total_rooms"].max()

# Retrieve the final weight and bias generated during training.
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

# Get the predicted median_house_values for the min and max total_rooms values.
y_0 = weight * x_0 + bias 
y_1 = weight * x_1 + bias

# Plot our regression line from (x_0, y_0) to (x_1, y_1).
plt.plot([x_0, x_1], [y_0, y_1], c='r')

# Label the graph axes.
plt.ylabel("median_house_value")
plt.xlabel("total_rooms")

# Plot a scatter plot from our data sample.
plt.scatter(sample["total_rooms"], sample["median_house_value"])

# Display graph.
plt.show()