In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.2.0-dev20200218
sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)
matplotlib 3.0.3
numpy 1.16.4
pandas 0.24.2
sklearn 0.21.2
tensorflow 2.2.0-dev20200218
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

In [3]:
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)


(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

In [5]:
output_dir = "dataset"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    
    for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

# train_filenames = save_to_csv(output_dir, train_data, "train",
#                               header_str, n_parts=20)
# valid_filenames = save_to_csv(output_dir, valid_data, "valid",
#                               header_str, n_parts=10)
# test_filenames = save_to_csv(output_dir, test_data, "test",
#                              header_str, n_parts=10)

In [7]:
import pprint
pprint.pprint('Train file name: ')
pprint.pprint(train_filenames)
pprint.pprint('Valid file name: ')
pprint.pprint(valid_filenames)
pprint.pprint('Test file name: ')
pprint.pprint(test_filenames)

'Train file name: '
['dataset/train_00.csv',
 'dataset/train_01.csv',
 'dataset/train_02.csv',
 'dataset/train_03.csv',
 'dataset/train_04.csv',
 'dataset/train_05.csv',
 'dataset/train_06.csv',
 'dataset/train_07.csv',
 'dataset/train_08.csv',
 'dataset/train_09.csv',
 'dataset/train_10.csv',
 'dataset/train_11.csv',
 'dataset/train_12.csv',
 'dataset/train_13.csv',
 'dataset/train_14.csv',
 'dataset/train_15.csv',
 'dataset/train_16.csv',
 'dataset/train_17.csv',
 'dataset/train_18.csv',
 'dataset/train_19.csv']
'Valid file name: '
['dataset/valid_00.csv',
 'dataset/valid_01.csv',
 'dataset/valid_02.csv',
 'dataset/valid_03.csv',
 'dataset/valid_04.csv',
 'dataset/valid_05.csv',
 'dataset/valid_06.csv',
 'dataset/valid_07.csv',
 'dataset/valid_08.csv',
 'dataset/valid_09.csv']
'Test file name: '
['dataset/test_00.csv',
 'dataset/test_01.csv',
 'dataset/test_02.csv',
 'dataset/test_03.csv',
 'dataset/test_04.csv',
 'dataset/test_05.csv',
 'dataset/test_06.csv',
 'dataset/test_07.csv',

In [11]:
filename_dataset = tf.data.Dataset.list_files(train_filenames)

for filename in filename_dataset:
    print(filename)

tf.Tensor(b'dataset/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'dataset/train_10.csv', shape=(), dtype=strin

In [16]:
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length = n_readers
)

for line in dataset.take(10):
    print(line.numpy())

b'0.401276648075221,-0.9293421252555106,-0.05333050451405854,-0.1865945262276826,0.6545661895448709,0.026434465728210874,0.9312527706398824,-1.4406417263474771,2.512'
b'-1.1157655153587753,0.9930635538078697,-0.33419201318312125,-0.0653521844775239,-0.3289320346639209,0.04343065774347637,-0.12785878480573185,0.30707203993980686,0.524'
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'0.4853051504718848,-0.8492418886278699,-0.06530126513877861,-0.023379656040017353,1.4974350551260218,-0.07790657783453239,-0.9023632702857819,0.7814514907892068,2.956'
b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226'
b'-0.8757754235423053,1.874166156711919,-0.9487499555702599,-0.09657184824705009,-0.7163432355284542,-0.07790191228558485,0.9825753570271144,-1.4206678

In [19]:
# tf.io.decode_csv(str, recode_defaults, )
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32)] *5

parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=4>, <tf.Tensor: shape=(), dtype=int32, numpy=5>]


In [33]:
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

parse_csv_line(b'-0.2980728090942217,0.3522616607867429,-0.10920507530549702,-0.25055520947444,-0.034064024638222286,-0.006034004264459185,1.080554840130013,-1.0611381656679573,1.514', 
              n_fields=9)[0].numpy()


array([-0.29807281,  0.35226166, -0.10920507, -0.25055522, -0.03406402,
       -0.006034  ,  1.0805548 , -1.0611382 ], dtype=float32)

In [38]:
def csv_reader_dataset(filenames, n_readers=5, 
                      batch_size=32, n_parse_threads=5, 
                      shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset .shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, 
                          num_parallel_calls=n_parse_threads)
    
    dataset = dataset.batch(batch_size)
    return dataset
    

batch_size=32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)

# for x_batch, y_batch in train_set.take(2):
#     print('x:')
#     print(x_batch.numpy())
#     print('y: ')
#     print(y_batch.numpy())

In [41]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', 
                      input_shape=[8]),
    keras.layers.Dense(1), 
    
])
model.summary()
model.compile(loss='mean_squared_error', 
             optimizer = keras.optimizers.SGD(0.001),
             )

callbacks = [keras.callbacks.EarlyStopping(patience=5, 
                                           min_delta=1e-2)
]

history = model.fit(train_set, 
                    validation_data= valid_set,
                    steps_per_epoch = 11160//batch_size,
                    validation_steps = 3870//batch_size,
                    epochs = 100,
                    callbacks= callbacks
                   )

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 30)                270       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________
Train for 348 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Ep

In [42]:
model.evaluate(test_set, steps = 5160// batch_size)



0.44464135040407593