In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd

BATCH_SIZE = 32
TARGET_LABEL_NAME = "SalePrice"
ID_LABEL_NAME = "Id"
train_file = "data/train.csv"
dataset = None
labels = None

print("GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

GPUs Available:  0


In [2]:
def loadDataset(train_file):
    global dataset, labels
    dataset = pd.read_csv(train_file, skipinitialspace=True)
    dataset.drop(ID_LABEL_NAME, axis=1, inplace=True)
    labels = dataset.keys()
# Initial Loading for Cleanup
loadDataset(train_file)
dataset.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
5,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,307000
7,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2008,WD,Abnorml,129900
9,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
# Finding NAs and discarding unproductive labels 
nas = dataset.isna().sum()
length = len(dataset.index)
keep_label = []
drop_label = []

print(f"Dataset is {length} items with {len(labels)} features")
for k, i in enumerate(nas):
    per = (i*100/length)
    if(per < 50):
        if per:
            print(f"\t{labels[k]}: {i}NA [{per:.2f}%]")
        keep_label.append(labels[k])
    else:
        drop_label.append(labels[k])

print(f"Dropping {drop_label}")
print(f"\nFiltered to {len(keep_label)} features")

# Drop discarded columns
dataset = dataset.drop(columns=drop_label)

Dataset is 1460 items with 80 features
	LotFrontage: 259NA [17.74%]
	MasVnrType: 8NA [0.55%]
	MasVnrArea: 8NA [0.55%]
	BsmtQual: 37NA [2.53%]
	BsmtCond: 37NA [2.53%]
	BsmtExposure: 38NA [2.60%]
	BsmtFinType1: 37NA [2.53%]
	BsmtFinType2: 38NA [2.60%]
	Electrical: 1NA [0.07%]
	FireplaceQu: 690NA [47.26%]
	GarageType: 81NA [5.55%]
	GarageYrBlt: 81NA [5.55%]
	GarageFinish: 81NA [5.55%]
	GarageQual: 81NA [5.55%]
	GarageCond: 81NA [5.55%]
Dropping ['Alley', 'PoolQC', 'Fence', 'MiscFeature']

Filtered to 76 features


In [4]:
# Set type of feature for identifying type of normalizartion required
# Normalization Method: 0 -> Numerical, 1 -> Categorical
# Datatype: 0 -> String, 1 -> Integer
# NA Handling: 0 -> Drop, 1 -> Keep as 0
feature_desc = {
    # "Id": [1, 0, 1],
    "MSSubClass": [0, 1, 1],
    "MSZoning": [1, 0, 1],
    # "LotFrontage": [0, 1, 1],
    "LotArea": [0, 1, 1],
    # "Street": [1, 0, 1],
    # "Alley": [1, 0, 1],
    # "LotShape": [1, 0, 1],
    # "LandContour": [1, 0, 1],
    # "Utilities": [1, 0, 1],
    # "LotConfig": [1, 0, 1],
    # "LandSlope": [1, 0, 1],
    # "Neighborhood": [1, 0, 1],
    # "Condition1": [1, 0, 1],
    # "Condition2": [1, 0, 1],
    # "BldgType": [1, 0, 1],
    # "HouseStyle": [1, 0, 1],
    # "OverallQual": [0, 1, 1],
    # "OverallCond": [0, 1, 1],
    # "YearBuilt": [0, 1, 1],
    # "YearRemodAdd": [0, 1, 1],
    # "RoofStyle": [1, 0, 1],
    # "RoofMatl": [1, 0, 1],
    # "Exterior1st": [1, 0, 1],
    # "Exterior2nd": [1, 0, 1],
    # "MasVnrType": [1, 0, 1],
    # "MasVnrArea": [0, 1, 1],
    # "ExterQual": [1, 0, 1],
    # "ExterCond": [1, 0, 1],
    # "Foundation": [1, 0, 1],
    # "BsmtQual": [1, 0, 1],
    # "BsmtCond": [1, 0, 1],
    # "BsmtExposure": [1, 0, 1],
    # "BsmtFinType1": [1, 0, 1],
    # "BsmtFinSF1": [0, 1, 1],
    # "BsmtFinType2": [1, 0, 1],
    # "BsmtFinSF2": [0, 1, 1],
    # "BsmtUnfSF": [0, 1, 1],
    # "TotalBsmtSF": [0, 1, 1],
    # "Heating": [1, 0, 1],
    # "HeatingQC": [1, 0, 1],
    # "CentralAir": [1, 0, 1],
    # "Electrical": [1, 0, 1],
    # "1stFlrSF": [0, 1, 1],
    # "2ndFlrSF": [0, 1, 1],
    # "LowQualFinSF": [0, 1, 1],
    # "GrLivArea": [0, 1, 1],
    # "BsmtFullBath": [0, 1, 1],
    # "BsmtHalfBath": [0, 1, 1],
    # "FullBath": [0, 1, 1],
    # "HalfBath": [0, 1, 1],
    # "BedroomAbvGr": [0, 1, 1],
    # "KitchenAbvGr": [0, 1, 1],
    # "KitchenQual": [1, 0, 1],
    # "TotRmsAbvGrd": [0, 1, 1],
    # "Functional": [1, 0, 1],
    # "Fireplaces": [0, 1, 1],
    # "FireplaceQu": [1, 0, 1],
    # "GarageType": [1, 0, 1],
    # "GarageYrBlt": [0, 1, 1],
    # "GarageFinish": [1, 0, 1],
    # "GarageCars": [0, 1, 1],
    # "GarageArea": [0, 1, 1],
    # "GarageQual": [1, 0, 1],
    # "GarageCond": [1, 0, 1],
    # "PavedDrive": [1, 0, 1],
    # "WoodDeckSF": [0, 1, 1],
    # "OpenPorchSF": [0, 1, 1],
    # "EnclosedPorch": [0, 1, 1],
    # "3SsnPorch": [0, 1, 1],
    # "ScreenPorch": [0, 1, 1],
    # "PoolArea": [0, 1, 1],
    # "PoolQC": [1, 0, 1],
    # "Fence": [1, 0, 1],
    # "MiscFeature": [1, 0, 1],
    # # "MiscVal": [1, 0, 1],
    # "MoSold": [0, 1, 1],
    # "YrSold": [0, 1, 1],
    "SaleType": [1, 0, 1],
    "SaleCondition": [1, 0, 1]
}

In [5]:
# Preprocess dataset using normalizer functions
def numericalNormal(name):
    norm = layers.Normalization(axis=None)
    data = np.array(dataset[name], dtype=np.int64)
    norm.adapt(data)
    return norm

# Datatype: 0 -> String, 1 -> Integer
def categoryNormal(name, dtype):
    if(dtype == 0):
        indices = layers.StringLookup(output_mode="one_hot")
    elif(dtype == 1):
        indices = layers.IntegerLookup()
    else:
        raise Exception("Invalid Datatype in Normalization Layer!!")
    data = np.array(dataset[name], dtype=np.str0)
    indices.adapt(data)
    norm = layers.CategoryEncoding(num_tokens=indices.vocabulary_size())
    return lambda inp: norm(indices(inp))

# Method: 0 -> Numerical, 1 -> Categorical
# NA Handling: 0 -> Drop, 1 -> Keep as 0
def preprocessDataset(dataset, feature_desc):
    input_layers = []
    encoded_features = []
    n = len(feature_desc)
    print("\nStart Normalization...", end="")
    for ind, i in enumerate(feature_desc):
        if i in dataset.keys():
            print(f"\r[{ind*100/n:.2f}%] Normalizing {i}", end="...         ")
            method, dtype, nah = feature_desc[i]
            if(nah == 0):
                dataset.dropna(axis=0, inplace=True, subset=[i])
            if(method == 0):
                # If na, dtype becomes float64, pad NAs and convert back to int64
                if(nah == 1):
                    dataset[i] = dataset[i].fillna(0).astype(np.int64)
                normalLayer = numericalNormal(i)
            elif(method == 1):
                if(nah == 1):
                    dataset[i] = dataset[i].fillna("UNF")
                normalLayer = categoryNormal(i, dtype)
            input_tensor = tf.keras.Input(shape=(1,), name=i)
            encoded_input = normalLayer(input_tensor)
            # print(normalLayer, input_tensor, feature)
            input_layers.append(input_tensor)
            encoded_features.append(encoded_input)
    print("\rNormalization Done!!"+" "*20)
    return input_layers, encoded_features

In [6]:
# Normalize all data and obtain layers for train and validation
input_tensors, encoded_tensors = preprocessDataset(dataset, feature_desc)
print(f"\nGenerated {len(input_tensors)} input layers...")

print("Input", input_tensors, "Encoded", encoded_tensors, sep="\n")


Normalization Done!!                          

Generated 5 input layers...
Input
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'MSSubClass')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'MSZoning')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'LotArea')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'SaleType')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'SaleCondition')>]
Encoded
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization')>, <KerasTensor: shape=(None, 6) dtype=float32 (created by layer 'category_encoding')>, <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_1')>, <KerasTensor: shape=(None, 10) dtype=float32 (created by layer 'category_encoding_1')>, <KerasTensor: shape=(None, 7) dtype=float32 (created by layer 'category_encoding_2')>]


In [7]:
# Split Dataset
dataset_sub = dataset[list(feature_desc.keys())+[TARGET_LABEL_NAME]]
train, val = np.split(dataset_sub.sample(frac=1), [int(0.8*len(dataset_sub))])
print(f"Train: {len(train)}  Val: {len(val)}")

# Prepare TF Dataset Loader
def makeDataset(dataset, output_col_name=TARGET_LABEL_NAME, batch=BATCH_SIZE):
    d = dataset.copy()
    # Get labels, remove ID from input dataset
    label = d.pop(output_col_name)
    df = {k: np.array(v) for k, v in d.items()}
    # print([[df[k].dtype, k] if df[k].dtype != int else None for k in df]) # For datatype debug
    ds = tf.data.Dataset.from_tensor_slices((dict(df), label))
    ds = ds.shuffle(buffer_size=len(d))
    ds = ds.batch(batch)
    ds = ds.prefetch(batch)
    return ds
# dataset[dataset.isnull().any(axis=1)].to_csv("debug.csv")
train_ds, val_ds = makeDataset(train), makeDataset(val)
print("Dataset Created!!")

Train: 1168  Val: 292
Dataset Created!!


In [34]:
# Create Deep Neural Network
features = layers.concatenate(encoded_tensors)
x = layers.Dense(72, activation="relu")(features)
x = layers.Dropout(0.3)(x)
# x = layers.Dense(72, activation="relu")(x)
output = layers.Dense(1)(x)

model = tf.keras.Model(inputs=input_tensors, outputs=output, name="Housing")
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=["accuracy"])

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [33]:
# Train the model
# inp, out = None, None
# for i,o in train_ds.take(1):
#     inp, out = i, o
# norm1
# # norm1(inp['MSSubClass'])
# model.fit(train_ds, epochs=20, validation_data=val_ds)

train_ds = dataset_sub["MSZoning"]
inp = layers.Input(shape=(1,))
norm = categoryNormal("MSZoning", 0)(inp)
# l1 = layers.Dense(32, activation="softmax")(norm)
out = layers.Dense(1)(norm)
model = tf.keras.Model(inp, out)
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
model.fit(train_ds, train_ds, epochs=10)

# dataset_sub

Epoch 1/10


UnimplementedError: Graph execution error:

Detected at node 'model_17/Cast' defined at (most recent call last):
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Gokul\AppData\Local\Temp/ipykernel_18560/1020572921.py", line 16, in <module>
      model.fit(train_ds, train_ds, epochs=10)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 889, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 490, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1014, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\functional.py", line 458, in call
      return self._run_internal_graph(
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\functional.py", line 578, in _run_internal_graph
      y = self._conform_to_reference_input(y, ref_input=x)
    File "c:\Users\Gokul\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\functional.py", line 678, in _conform_to_reference_input
      tensor = tf.cast(tensor, dtype=ref_input.dtype)
Node: 'model_17/Cast'
Cast string to float is not supported
	 [[{{node model_17/Cast}}]] [Op:__inference_train_function_38048]

In [None]:
# Get Accuracy
loss, accuracy = model.evaluate(val_ds)
print("Accuracy", accuracy)