In [1]:
import matplotlib as mpl #画图用的库
import matplotlib.pyplot as plt
#下面这一句是为了可以在notebook中画图
%matplotlib inline
import numpy as np
import sklearn   #机器学习算法库
import pandas as pd #处理数据的库   
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras   #使用tensorflow中的keras
#import keras #单纯的使用keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, sklearn, pd, tf, keras:
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.0
sklearn 0.21.3
pandas 0.25.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
#引用位于sklearn数据集中的房价预测数据集
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
print(housing.DESCR) #数据集的描述
print(housing.data.shape) #相当于 x
print(housing.target.shape) #相当于 y

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
#用sklearn中专门用于划分训练集和测试集的方法
from sklearn.model_selection import train_test_split

#train_test_split默认将数据划分为3:1，我们可以通过修改test_size值来改变数据划分比例(默认0.25，即3:1)
#将总数乘以test_size就表示test测试集、valid验证集数量
#将数据集整体拆分为train_all和test数据集
x_train_all,x_test, y_train_all,y_test = train_test_split(housing.data, housing.target, random_state=7)
#将train_all数据集拆分为train训练集和valid验证集
x_train,x_valid, y_train,y_valid = train_test_split(x_train_all, y_train_all, random_state=11)

print(x_train_all.shape,y_train_all.shape)
print(x_test.shape, y_test.shape)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

(15480, 8) (15480,)
(5160, 8) (5160,)
(11610, 8) (11610,)
(3870, 8) (3870,)


In [4]:
#训练数据归一化处理
# x = (x - u)/std  u为均值，std为方差
from sklearn.preprocessing import StandardScaler #使用sklearn中的StandardScaler实现训练数据归一化

scaler = StandardScaler()#初始化一个scaler对象
x_train_scaler = scaler.fit_transform(x_train)#x_train已经是二维数据了，无需astype转换
x_valid_scaler = scaler.transform(x_valid)
x_test_scaler  = scaler.transform(x_test)

In [5]:
#metric使用

#直接调用均方差函数 MeanSquaredError()
metric=keras.metrics.MeanSquaredError()
print(metric([5.], [2.]))#这里单独输出为 9
print(metric([0.], [1.]))#这里单独输出为 1
print(metric.result())#累加总的结果输出为 1/2 * (9+1) = 5

#不想累加的话调用reset_states
metric.reset_states()
metric([1.],[3.])
print(metric.result())

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)


In [7]:
#1.batch 遍历训练集 metric
#      自动求导
#2. epoch结束 验证集 metric
epochs=100
batch_size=32#batch_size表示一次训练的样本数
steps_per_epoch=len(x_train_scaler) // batch_size # 除以batch_size结果取整，表示每个epoch训练样本的次数
optimizer=keras.optimizers.SGD()# optimizer选择 sgd
metric=keras.metrics.MeanSquaredError()#损失函数 mse均方差

#自定义一次随机训练取出的32个样本数
def random_batch(x, y, batch_size=32):
    idx=np.random.randint(0,len(x),size=batch_size)#从 0 到 len(x)总的数量中 随机取出32个索引
    return x[idx],y[idx]

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu",input_shape=x_train.shape[1:]),
    keras.layers.Dense(1),
])

for epoch in range(epochs):
    metric.reset_states()#防止平方差值累加
    for step in range(steps_per_epoch):
        x_batch, y_batch = random_batch(x_train_scaler, y_train, batch_size)
        with tf.GradientTape() as tape:
            y_pred = model(x_batch)
            loss = tf.reduce_mean(keras.losses.mean_squared_error(y_batch, y_pred))
            metric(y_batch,y_pred)
        grads = tape.gradient(loss, model.variables)
        grads_and_vars = zip(grads, model.variables)
        optimizer.apply_gradients(grads_and_vars)
        print("\rEpoch", epoch, " train mse:", metric.result().numpy(), end="")
    y_valid_pred = model(x_valid_scaler)
    valid_loss = tf.reduce_mean(keras.losses.mean_squared_error(y_valid_pred, y_valid))
    print("\t", "valid mse: ", valid_loss.numpy())

'''
#tf.keras.models.Sequential()建立模型

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu",input_shape=x_train.shape[1:]),
    keras.layers.Dense(1),
])
#编译model。 loss目标函数为均方差，这里表面上是字符串，实际上tensorflow中会映射到对应的算法函数，我们也可以自定义
model.compile(loss="mean_squared_error", optimizer="adam")

#使用监听模型训练过程中的callbacks
logdir='./callbacks_regression'
if not os.path.exists(logdir):
    os.mkdir(logdir)
output_model_file = os.path.join(logdir,"regression_california_housing.h5")

#首先定义一个callback数组
callbacks = [
    keras.callbacks.TensorBoard(logdir),
    keras.callbacks.ModelCheckpoint(output_model_file,save_best_only=True),
    keras.callbacks.EarlyStopping(patience=5,min_delta=1e-3)
]

#查看model的架构
model.summary()

history=model.fit(x_train_scaler,y_train,epochs=100,
                 validation_data=(x_valid_scaler,y_valid),
                 callbacks=callbacks)
'''



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 0  train mse: 2.0847263	 valid mse:  1.4750860630306017
Epoch 1  train mse: 1.3247944	 valid mse:  1.4169727090294006
Epoch 2  train mse: 1.3173337	 valid mse:  1.4008404432441943
Epoch 3  train mse: 1.2927386	 valid mse:  1.393331363303617
Epoch 4  train mse: 1.2736323	 valid mse:  1.3957328946278167
Epoch 5  train mse: 1.2823045	 valid mse:  1.3924527819563814
Epoch 6  train mse: 1.2335141	 valid mse:  1.3989454292405405
Epoch 7  train mse: 1.2695118	 valid mse:  1.387919197915353
Epoch 8  train mse: 1.2374288	 valid mse:  1.389155003960776
Epoch 9  train mse: 1.2795521	 valid mse:  1.3895107881259308
Epoch 10  train mse: 1.2795583	 valid mse:  1.3876356414847781
Epoch 11  train mse

'\n#tf.keras.models.Sequential()建立模型\n\nmodel = keras.models.Sequential([\n    keras.layers.Dense(30, activation="relu",input_shape=x_train.shape[1:]),\n    keras.layers.Dense(1),\n])\n#编译model。 loss目标函数为均方差，这里表面上是字符串，实际上tensorflow中会映射到对应的算法函数，我们也可以自定义\nmodel.compile(loss="mean_squared_error", optimizer="adam")\n\n#使用监听模型训练过程中的callbacks\nlogdir=\'./callbacks_regression\'\nif not os.path.exists(logdir):\n    os.mkdir(logdir)\noutput_model_file = os.path.join(logdir,"regression_california_housing.h5")\n\n#首先定义一个callback数组\ncallbacks = [\n    keras.callbacks.TensorBoard(logdir),\n    keras.callbacks.ModelCheckpoint(output_model_file,save_best_only=True),\n    keras.callbacks.EarlyStopping(patience=5,min_delta=1e-3)\n]\n\n#查看model的架构\nmodel.summary()\n\nhistory=model.fit(x_train_scaler,y_train,epochs=100,\n                 validation_data=(x_valid_scaler,y_valid),\n                 callbacks=callbacks)\n'

In [None]:
history.history

In [None]:
#打印模型训练过程中的相关曲线
def plot_learning_curves(history):
    pd.DataFrame(history.history).plot(figsize=(8,5))
    plt.grid(True)
    plt.gca().set_ylim(0,1)
    plt.show()
plot_learning_curves(history)

In [None]:
model.evaluate(x_test_scaler,y_test)