In [154]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from tensorflow import keras
from tensorflow.keras import layers

import os
import datetime

# (re)load the TensorBoard notebook extension so the %tensorboard magic is available
%reload_ext tensorboard
# show the TensorBoard UI inline, reading event files from the local "logs" directory
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 78548), started 5:50:37 ago. (Use '!kill 78548' to kill it.)

In [155]:
# read the housing CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='./data/housing.csv')
print(df.head(n=5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [156]:
# bucket the median income into five categories; this helper column is only
# used to stratify the train/test split below
df["income_cat"] = pd.cut(
    df["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

# stratified (i.e., preserving the percentage of samples under each class)
# shuffle split of the dataset
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# NOTE: split() yields *positional* row indices, so rows are selected with
# .iloc; label-based .loc would only be correct for a default RangeIndex
train_index, test_index = next(split.split(df, df["income_cat"]))

# drop the helper column as we do not need it anymore; build new frames
# instead of mutating the selected rows in place
strat_train_set = df.iloc[train_index].drop(columns="income_cat")
strat_test_set = df.iloc[test_index].drop(columns="income_cat")

# separate the training features from the regression target
data = strat_train_set.drop("median_house_value", axis=1)
train_labels = strat_train_set["median_house_value"].to_numpy(copy=True)

# creating a pipeline so that you can chain together multiple steps
# NOTE: remember that the output from one step will be an input to the next
# you can do these steps separately and combine them, but Pipelines are more convenient
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# every column except "ocean_proximity" is treated as numerical
housing_num = data.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# will be applying each transformer object to the given subset of columns
# NOTE: e.g., the "num_pipeline" transformer object will be applied on the numerical columns
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

# sending the data through a chain of transformations
# NOTE: the final dataset is a 2D array containing all numerical data
train_data = full_pipeline.fit_transform(data)

# prepare the test features/labels; the pipeline fitted on the training data
# is only applied (transform, not fit_transform) to avoid leaking test statistics
x_test = strat_test_set.drop("median_house_value", axis=1)
test_labels = strat_test_set["median_house_value"].to_numpy(copy=True)

test_data = full_pipeline.transform(x_test)

# sanity checks: one label per row, and identical feature width for both sets
assert train_data.shape[0] == train_labels.shape[0]
assert test_data.shape[0] == test_labels.shape[0]
assert test_data.shape[1] == train_data.shape[1]

In [157]:
# build the model
def get_model(input_dim=13):
    """Build and compile a fully connected regression network.

    Args:
        input_dim: Number of features per input sample. Defaults to 13, the
            width this notebook was written with; pass e.g.
            ``train_data.shape[1]`` if the preprocessing yields a different
            number of columns.

    Returns:
        A compiled ``keras.Sequential`` model made of two 256-unit ReLU
        hidden layers and one linear output unit, compiled with the
        "rmsprop" optimizer, "mse" loss and an "mae" metric.
    """
    model = keras.Sequential([
        layers.Dense(256, input_shape=(input_dim,), activation="relu"),
        layers.Dense(256, activation="relu"),
        layers.Dense(1)  # default linear activation due to the regression task
    ])

    model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

    return model

In [158]:
# TensorBoard
# create a per-run, timestamped directory to store the callback logs
d = datetime.datetime.today()
timestamp = d.strftime('%Y%m%d_%H%M%S')
tensorlog_folder = os.path.join(os.path.curdir, 'logs', timestamp)
# makedirs (unlike mkdir) also creates the parent 'logs' directory when it is
# missing, and exist_ok=True keeps a re-run within the same second from
# raising FileExistsError
os.makedirs(tensorlog_folder, exist_ok=True)

# callback that writes the training logs into the directory created above
tensorboard = keras.callbacks.TensorBoard(log_dir=tensorlog_folder)

2022-02-16 17:54:28.163598: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2022-02-16 17:54:28.163632: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2022-02-16 17:54:28.163671: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.


In [159]:
num_epochs = 100
batch_size = 16
all_mae_history = []

# get the compiled model
model = get_model()

# NOTE: validation_split=0.5 asks Keras to hold out half of the training data
# for validation; the per-epoch logs are also sent to the TensorBoard callback
history = model.fit(
    train_data,
    train_labels,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.5,
    callbacks=[tensorboard],
    verbose=1,
)

# per-epoch validation MAE of this run
val_mae_history = history.history["val_mae"]
print(val_mae_history)

# record the run so the summary below is not always an empty list
# (one entry per validation run; here there is a single hold-out run)
all_mae_history.append(val_mae_history)

print(f"MAE for all folds:\n{all_mae_history}")

Epoch 1/100


ValueError: in user code:

    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /home/prasadith/miniconda3/envs/csi4106/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:255 assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer sequential_60 is incompatible with the layer: expected axis -1 of input shape to have value 15 but received input with shape (16, 13)


In [None]:
# retrain a fresh model with the chosen hyperparameters, then score it on the test data
model = get_model()
model.fit(
    x=train_data,
    y=train_labels,
    batch_size=16,
    epochs=10,
    verbose=0,
)

# evaluate() returns the loss followed by the compiled metrics
test_mse, test_mae = model.evaluate(x=test_data, y=test_labels)
print(test_mae)

48562.2421875


In [None]:
# run the trained model on the test data, then show the first prediction
# followed by the first true label, one per line
predictions = model.predict(x=test_data)
print(predictions[0], test_labels[0], sep="\n")

[452180.12]
500001.0
