# Regression Analysis of Small Data Using K-Fold

A Model for predicting house prices in Boston

## Preparation

### Load the Library

In [1]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets.boston_housing import load_data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

from utils.support_tf import LogLevelManager as llm
llm.set(2)

### Prepare Training Data

In [2]:
(train_x, train_y), (test_x, test_y) = load_data()
type(train_x), train_x.shape, test_x.shape, type(test_y), test_y[0]

(numpy.ndarray, (404, 13), (102, 13), numpy.ndarray, 7.2)

#### Preprocessing
* Normalize each column to a value between -1 and 1

In [3]:
normalize_mean = train_x.mean(axis=0)
train_x -= normalize_mean
normalize_std = train_x.std(axis=0)
train_x /= normalize_std

# Normalization of test data is processed using normalized values of training data.
test_x -= normalize_mean
test_x /= normalize_mean

## Modeling

### Model Definition

In [4]:
class SimpleMLPClassification(Model):
    def __init__(self, kargs):
        super(SimpleMLPClassification, self).__init__(kargs)
        self.input_layer = Dense(units=kargs["input_layer_units"], 
                                 activation=kargs["main_af"], 
                                 input_shape=kargs["input_shape"])
        self.middle_layer = Dense(units=kargs["units32"], activation=kargs["main_af"])
        # "activation" is not used in regression
        self.output_layer = Dense(units=kargs["output_layer_units"])
    
    def call(self, inputs):
        layer = self.input_layer(inputs)
        layer = self.middle_layer(layer)
        model = self.output_layer(layer)
        return model

## Training

### Define a Global Variables

In [6]:
K = 4
VALID_SIZE = train_x.shape[0] // K
EPOCHS = 100
BATCH_SIZE = 1

### Define Hyper-Parameter

In [9]:
kargs = {
    "name"                  : "simple_mlp",
    "input_layer_units"     : 64,
    "units32"               : 32,
    "main_af"               : "relu",
    "input_shape"           : train_x.shape[1:],
    "output_layer_units"    : 1,
}

### Training with all data at once

In [11]:
model = SimpleMLPClassification(kargs)
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
model.fit(train_x, train_y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0)
mse_score, mae_score = model.evaluate(test_x, test_y)
mse_score, mae_score



(160.324462890625, 8.51121711730957)

### K-Fold Training

In [12]:
model = SimpleMLPClassification(kargs)
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])

start_train = datetime.now()
print(f"Start Training: {start_train}")

for i in range(K):
    start_fold = datetime.now()
    print(f"{i+1} fold: (Start) {start_fold}, ", end="")
    
    fold_valid_x = train_x[i*VALID_SIZE:(i+1)*VALID_SIZE]
    fold_valid_y = train_y[i*VALID_SIZE:(i+1)*VALID_SIZE]
    fold_train_x = np.concatenate([train_x[:i*VALID_SIZE], train_x[(i+1)*VALID_SIZE:]], axis=0)
    fold_train_y = np.concatenate([train_y[:i*VALID_SIZE], train_y[(i+1)*VALID_SIZE:]], axis=0)
    fold_history = model.fit(fold_train_x, fold_train_y, epochs=EPOCHS, batch_size=BATCH_SIZE,
                             validation_data=(fold_valid_x, fold_valid_y), verbose=0)
    fold_mae = fold_history.history["mae"]
    fold_val_mae = fold_history.history["val_mae"]
    end_fold = datetime.now()
    print(f"(End) {end_fold}, (Processing Time) {end_fold-start_fold}, MAE({np.mean(fold_mae)}/{np.mean(fold_val_mae)})")
    
end_train = datetime.now()
print(f"End Training: {end_train}, Processing Time: {end_train-start_train}")

Start Training: 2022-10-18 22:38:10.228007
1 fold: (Start) 2022-10-18 22:38:10.228359, (End) 2022-10-18 22:38:50.546313, (Processing Time) 0:00:40.317954, MAE(1.937007735967636/2.0749143862724306)
2 fold: (Start) 2022-10-18 22:38:50.546487, (End) 2022-10-18 22:39:30.895254, (Processing Time) 0:00:40.348767, MAE(1.1845612758398056/2.0240016877651215)
3 fold: (Start) 2022-10-18 22:39:30.895427, (End) 2022-10-18 22:40:11.056412, (Processing Time) 0:00:40.160985, MAE(0.9442280167341233/1.8264315509796143)
4 fold: (Start) 2022-10-18 22:40:11.056579, (End) 2022-10-18 22:40:50.780418, (Processing Time) 0:00:39.723839, MAE(0.9721349668502808/1.330679433941841)
End Training: 2022-10-18 22:40:50.780726, Processing Time: 0:02:40.552719


#### Evaluation

In [13]:
mse_score, mae_score = model.evaluate(test_x, test_y)
mse_score, mae_score



(330.0292053222656, 10.352420806884766)