In [1]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
#텐서플로우의 로그는 ( DEBUG, INFO, WARN, ERROR, FATAL ) 5개의 타입으로 이루어 져있는데 왼쪽으로 갈 수록 상위이다. 
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format #pandas 기본 설정 해주고 있다. 

In [2]:
california_housing_dataframe = pd.read_csv("https://dl.google.com/mlcc/mledu-datasets/california_housing_train.csv", sep=",")

In [3]:
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))
#permutation 무작위로 데이터를 바꾸어 준다.   
#reindex는 제일 왼쪽줄의 인덱스를 바꾸어준다. 
california_housing_dataframe["median_house_value"] /= 1000.0
california_housing_dataframe

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
4068,-118.0,33.7,22.0,3914.0,600.0,1871.0,607.0,5.9,281.5
6047,-118.2,34.0,47.0,1147.0,297.0,1097.0,307.0,2.6,162.9
5753,-118.2,34.0,34.0,1022.0,286.0,1058.0,275.0,2.6,156.7
3496,-117.9,33.9,27.0,698.0,116.0,391.0,126.0,5.9,267.6
2709,-117.7,36.1,25.0,1709.0,439.0,632.0,292.0,1.8,45.5
...,...,...,...,...,...,...,...,...,...
12245,-121.5,38.6,23.0,7709.0,1279.0,4147.0,1262.0,3.8,96.6
15563,-122.3,38.0,33.0,1817.0,441.0,1220.0,389.0,2.5,103.6
13136,-121.9,36.5,28.0,2830.0,458.0,898.0,370.0,5.8,500.0
6371,-118.3,34.1,45.0,2174.0,627.0,1992.0,557.0,2.5,167.8


In [4]:
california_housing_dataframe.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207.3
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,116.0
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,15.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119.4
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180.4
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500.0


In [5]:
my_feature = california_housing_dataframe[["total_rooms"]]# [[]]는 객체를 반환해주고 []는 데이터들을 반환해줌 
feature_columns = [tf.feature_column.numeric_column("total_rooms"), tf.feature_column.numeric_column("population")]# 직접적으로 접근하지 하지않고 이러한 데이터들이 있습니다라고 알려줌 
print(feature_columns) 
# 텐서들이 그래프를 만들고 tf.session을 만들어서 run을해야 그래프를 갖고 훈련하기 시작한다. 

[_NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


In [6]:
targets = california_housing_dataframe["median_house_value"]

In [7]:
# Use gradient descent as the optimizer for training the model.
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00002) # 학습률 0.0000001
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# clip_norm: A 0-D (scalar) Tensor > 0. The clipping ratio.
# Gradient Clipping (그래디언트 클리핑) 
# RNN 같은 깊은 뉴럴 네트워크에서 폭팔하는 것을 방지하는 기법  
# L2 norm이 초과할 때 (threshold/L2 norm)을 곱해주는 것이다. 

# Configure the linear regression model with our feature columns and optimizer.
# Set a learning rate of 0.0000001 for Gradient Descent.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer
)

In [8]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    # 기본값인 num_epochs=None을 repeat()에 전달하면 입력 데이터가 무한정 반복됩니다.
    """Trains a linear regression model of one feature.
  
    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
  
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}                                           
 
    # Construct a dataset, and configure batching/repeating
    #데이터 셋을 만드는 함수이다. 
    ds = Dataset.from_tensor_slices((features, targets)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs) # num_epochs가 false면 계속 반복 
    #데이터를 읽어올 개수를 지정하는 batch_size
    # Shuffle the data, if specified
    if shuffle:
      ds = ds.shuffle(buffer_size=10000)
    
    # Return the next batch of data
    features, labels = ds.make_one_shot_iterator().get_next() 
    # 다음 항목에 연결되어 있는 tf.Tensor 객체를 리턴합니다.
    # 데이터 셋에 있는 열거가능한 요소들을 이터레이터로 만든다. 
    return features, labels
# 역활분리를 잘해놓은파이썬 랜덤에서 

In [9]:
_ = linear_regressor.train(
    input_fn = lambda: my_input_fn(my_feature, targets),
    # input_fn 데이터를 반환해주는 애이다. 
    # 직접적으로 호출이 아니라 함수를 전달해주는 방식이다. 
    steps=1000
)


In [10]:
# Create an input function for predictions.
# Note: Since we're making just one prediction for each example, we don't 
# need to repeat or shuffle the data here.
prediction_input_fn = lambda : my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the linear_regressor to make predictions.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Print Mean Squared Error and Root Mean Squared Error.
mean_squared_error = metrics.mean_squared_error(predictions, targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error (on training data): %0.3f" % mean_squared_error) # MSE
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error) 

Mean Squared Error (on training data): 28589.500
Root Mean Squared Error (on training data): 169.084


In [11]:
min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value

print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)
print("Root Mean Squared Error: %0.3f" % root_mean_squared_error)  # RMSE

Min. Median House Value: 14.999
Max. Median House Value: 500.001
Difference between Min. and Max.: 485.002
Root Mean Squared Error: 169.084


In [12]:
calibration_data = pd.DataFrame()
calibration_data["predictions"] = pd.Series(predictions)
calibration_data["targets"] = pd.Series(targets)
calibration_data

Unnamed: 0,predictions,targets
0,228.6,66.9
1,67.0,80.1
2,59.7,85.7
3,40.8,73.4
4,99.8,65.5
...,...,...
16995,450.2,111.4
16996,106.1,79.0
16997,165.3,103.6
16998,127.0,85.8


In [13]:
sample = california_housing_dataframe.sample(n=300)

In [14]:
# Get the min and max total_rooms values.
x_0 = sample["total_rooms"].min()
x_1 = sample["total_rooms"].max()

# Retrieve the final weight and bias generated during training.
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

# Get the predicted median_house_values for the min and max total_rooms values.
y_0 = weight * x_0 + bias 
y_1 = weight * x_1 + bias

# Plot our regression line from (x_0, y_0) to (x_1, y_1).
plt.plot([x_0, x_1], [y_0, y_1], c='r')

# Label the graph axes.
plt.ylabel("median_house_value")
plt.xlabel("total_rooms")

# Plot a scatter plot from our data sample.
plt.scatter(sample["total_rooms"], sample["median_house_value"])

# Display graph.
plt.show()