# Keras 101: A simple Neural Network for House Pricing regression

### Importing the dataset





In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd


In [2]:

# Fetch the California housing dataset from scikit-learn.
# The loader's defaults are spelled out: return the full dataset object
# (not an (X, y) tuple) holding NumPy arrays (not a pandas frame).
california_data = fetch_california_housing(return_X_y=False, as_frame=False)


In [3]:
# Inspect the object returned by fetch_california_housing: feature matrix,
# target vector, feature/target names and the dataset description.
california_data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
# Build a DataFrame with one column per feature, then append the regression
# target as a final 'Target' column.
california_df = pd.DataFrame(
    data=california_data.data,
    columns=california_data.feature_names,
).assign(Target=california_data.target)

# Display the frame (pandas truncates the middle rows automatically)
california_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix and regression target taken straight from the loaded dataset
X, y = california_data.data, california_data.target

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both splits, so no
# validation information leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)


In [10]:
from tensorflow import keras

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling (and therefore the reported metrics) repeat across runs.
keras.utils.set_random_seed(42)

# Initialize a sequential model: two hidden ReLU layers followed by a single
# linear output unit for the regression target. The input width is declared
# with an explicit Input object rather than the deprecated `input_shape=`
# layer argument.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

# Compile the model: Adam optimizer, MSE as the training loss, and MAE
# tracked as an additional metric
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])


In [11]:
# Train for 30 epochs in mini-batches of 32, scoring the validation split
# after every epoch; the returned History object holds the per-epoch metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    verbose=1,
    validation_data=(X_val, y_val),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
import plotly.graph_objects as go

# Per-epoch training and validation loss recorded by Keras during fit()
training_log = history.history
loss_values = training_log['loss']
val_loss_values = training_log['val_loss']

# 1-based epoch numbers for the x axis, as a plain list rather than a range
epochs = [epoch for epoch in range(1, len(loss_values) + 1)]

# One line trace per curve
trace0 = go.Scatter(
    x=epochs,
    y=loss_values,
    mode='lines',
    name='Training loss',
)
trace1 = go.Scatter(
    x=epochs,
    y=val_loss_values,
    mode='lines',
    name='Validation loss',
)

# Title and axis labels so the figure can be read on its own
layout = go.Layout(
    title='Training and Validation Loss',
    xaxis=dict(title='Epochs'),
    yaxis=dict(title='Loss'),
)

# Assemble the figure from both traces and render it
fig = go.Figure(data=[trace0, trace1], layout=layout)
fig.show()


In [13]:
# Score the trained network on the validation split. evaluate() returns the
# compiled loss (MSE) followed by the tracked metric (MAE).
loss, mae_nn = model.evaluate(x=X_val, y=y_val, verbose=0)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Baseline 1: ordinary least-squares regression fitted on the training split
# (fit() returns the estimator itself, so construction and training chain)
model_lr = LinearRegression().fit(X_train, y_train)

# Predict the validation targets and score them with mean absolute error
predictions_lr = model_lr.predict(X_val)
mae_lr = mean_absolute_error(y_true=y_val, y_pred=predictions_lr)


In [15]:
from sklearn.tree import DecisionTreeRegressor

# Baseline 2: decision-tree regressor fitted on the training split; the fixed
# random_state keeps the fitted tree identical across runs
model_dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the validation targets and score them with mean absolute error
predictions_dt = model_dt.predict(X_val)
mae_dt = mean_absolute_error(y_true=y_val, y_pred=predictions_dt)


In [16]:
# Report the validation MAE of every model, rounded to four decimal places
for label, mae in (
    ("Neural Network", mae_nn),
    ("Linear Regression", mae_lr),
    ("Decision Tree", mae_dt),
):
    print(f"{label} MAE: {mae:.4f}")


Neural Network MAE: 0.3750
Linear Regression MAE: 0.5332
Decision Tree MAE: 0.4539


### To further improve the neural network
- Regularization: Introduce L1 or L2 regularization to reduce overfitting by penalizing large weights. This is particularly useful if we notice from the loss curves that the model is overfitting to the training data.
- Dropout: Drop random neurons during training to ensure that the network doesn't rely heavily on any particular neuron. This helps in reducing overfitting.
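- Learning Rate Scheduling: Adjust the learning rate during training, typically lowering it as training progresses (for example, whenever the validation loss plateaus) so the optimizer can settle into a good minimum instead of oscillating around it.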
- Early Stopping: Monitor the validation loss and stop training once it has stopped improving for a set number of epochs, to prevent overfitting.

Note that the following implementation combines all four techniques; you can pick and choose the ones you need.

In [17]:
from tensorflow.keras import regularizers, callbacks

# Seed Python, NumPy and the Keras backend so that weight initialisation,
# dropout masks and batch shuffling repeat across runs.
keras.utils.set_random_seed(42)

# Initialize a sequential model with L2 regularization and dropout.
# The input width is declared with an explicit Input object rather than the
# deprecated `input_shape=` layer argument.
model_nn = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.005)),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.005)),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(1)
])

# Compile the model: Adam optimizer, MSE loss, MAE tracked as a metric
model_nn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Learning rate scheduling callback: multiply the learning rate by 0.1 when
# val_loss has not improved for 20 epochs, never going below 1e-5
lr_schedule = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0.00001, verbose=1)

# Early stopping callback: stop after 40 epochs without val_loss improvement
# and roll the model back to the best weights seen. The patience is longer
# than the scheduler's, so the learning rate is reduced before training stops.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True, verbose=1)

# Train the model (up to 200 epochs; early stopping usually ends it sooner).
# The History object is kept so the loss curves can be plotted afterwards.
history_nn = model_nn.fit(
    X_train, y_train, epochs=200,
    validation_data=(X_val, y_val),
    batch_size=32, verbose=1,
    callbacks=[lr_schedule, early_stopping]
)

# Get MAE for Neural Network
# NOTE(review): this is measured on the same validation split that drives the
# callbacks above, so it is likely an optimistic estimate; a separate test
# split would give an unbiased number.
loss, mae_nn = model_nn.evaluate(X_val, y_val, verbose=0)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [18]:
# Validation MAE of the regularized network (model_nn) computed in the previous cell
mae_nn

0.3650054633617401