# House Price Prediction With TensorFlow

This example demonstrates how the Unfolded Map SDK can be used in the process of building a machine learning model that predicts median house prices in California.

## Dependencies 

In [None]:
!pip install pandas==1.3.2 numpy==1.19.5 sklearn==0.0 geopy==2.2.0 scipy==1.7.1 seaborn==0.11.2 matplotlib==3.4.3 tensorflow==2.6.0 unfolded.map-sdk==0.3.0

## Imports

In [None]:
from uuid import uuid4
import requests

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from geopy.distance import geodesic
from scipy.cluster.vq import vq
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tensorflow import keras
from tensorflow.keras import Sequential, optimizers
from tensorflow.keras.layers import Dense, Flatten, Softmax

from unfolded.map_sdk import UnfoldedMap

Check package versions: 

In [None]:
# Report the versions of the main libraries in a single print call,
# one entry per output line.
print(
    f"pandas version: {pd.__version__}",
    f"numpy version: {np.__version__}",
    f"seaborn version: {sns.__version__}",
    f"tensorflow: {tf.__version__}",
    sep="\n",
)

## Data Loading

In [None]:
dataset_url = "https://actionengine-public.s3.us-east-2.amazonaws.com/housing.csv"
dataset_path = "housing.csv"

# Download the CSV once and cache it next to the notebook.
# The timeout keeps the cell from hanging forever on a stalled connection.
dataset_data = requests.get(dataset_url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as "housing.csv",
# which would only surface later as a confusing parse failure.
dataset_data.raise_for_status()
with open(dataset_path, "wb") as dataset_file:
    dataset_file.write(dataset_data.content)

# Load the cached file and preview the first rows.
housing = pd.read_csv(dataset_path)
housing.head()

## Feature Engineering

First, let's take a look at the input data and visualize different aspects of it on a map.

### Population Clustering

Here we'll look at cities clustered by population:

In [None]:
# Identifier that ties the dataset to the layer that renders it.
dataset_id = uuid4()

# Cluster layer over the housing rows, coloured by the `population` column.
population_layer = {
    "id": "population_CA",
    "type": "cluster",
    "config": {
        "label": "population in CA",
        "data_id": dataset_id,
        "columns": {"lat": "latitude", "lng": "longitude"},
        "is_visible": True,
        "color_scale": "quantize",
        "color_field": {"name": "population", "type": "real"},
    },
}

population_in_CA = UnfoldedMap()

# The layer is declared explicitly, so automatic layer creation is switched off.
population_in_CA.add_dataset(
    dict(uuid=dataset_id, label="Population_in_CA", data=housing),
    auto_create_layers=False,
)
population_in_CA.add_layer(population_layer)
population_in_CA.set_view_state(
    dict(longitude=-119.417931, latitude=36.778259, zoom=5)
)

# Bare trailing expression: the notebook renders the map as the cell output.
population_in_CA

### Distance To Largest Cities

Here we can represent the closest large city for every location in the input data:

In [None]:
# Reference cities, stored as name -> (latitude, longitude).
big_cities = {
    name: (lat, lon)
    for name, lat, lon in (
        ("Los Angeles", 34.052, -118.244),
        ("San Diego", 32.716, -117.165),
        ("San Jose", 37.339, -121.895),
        ("San Francisco", 37.775, -122.419),
        ("Fresno", 36.748, -119.772),
    )
}

Create a function that finds the nearest big city and the distance to it

In [None]:
def closest_point(location, location_dict):
    """Find the entry of ``location_dict`` that is geodesically nearest to ``location``.

    Args:
        location: Coordinates of the query point, in the form accepted by
            ``geodesic``.
        location_dict: Mapping whose values are candidate coordinates; the
            keys are not used.

    Returns:
        A ``(distance_km, coordinates)`` tuple for the nearest candidate. On a
        tie the candidate that comes first in the mapping wins. If the mapping
        is empty, ``(None, None)`` is returned.
    """
    measured = (
        (geodesic(location, candidate).kilometers, candidate)
        for candidate in location_dict.values()
    )
    # Compare on the distance only, so that ties keep the earliest candidate
    # and the coordinates themselves are never compared.
    return min(measured, key=lambda pair: pair[0], default=(None, None))

Calculate the distance between each location in the input data and its closest big city

In [None]:
# For every row, closest_point yields a (distance_km, coordinates) pair, which
# is parked in a temporary "big_city" column.
housing["big_city"] = housing.apply(
    lambda row: closest_point((row["latitude"], row["longitude"]), big_cities),
    axis=1,
)

# NOTE: the column names read the opposite way round from their contents:
# "closest_location" receives the distance and "big_city_dist" receives the
# nearest city's coordinates. Later cells rely on exactly this layout.
housing["closest_location"] = [distance for distance, _ in housing["big_city"]]
housing["big_city_dist"] = [coords for _, coords in housing["big_city"]]

# The temporary pair column is no longer needed.
housing = housing.drop(columns="big_city")

Make a dataframe from the data obtained above with the coordinates of the nearest big city

In [None]:
# Source point of every arc: the housing row's own position.
lat = housing["latitude"].tolist()
lon = housing["longitude"].tolist()

# Target point of every arc: the coordinates stored in "big_city_dist",
# unpacked into their first and second components.
target_lat = [first for first, _ in housing["big_city_dist"]]
target_lon = [second for _, second in housing["big_city_dist"]]

coords_df = pd.DataFrame(
    dict(lat=lat, lon=lon, target_lat=target_lat, target_lon=target_lon)
)

This map shows the distances between the locations and their nearest big cities

In [None]:
# Identifier that ties the dataset to the layer that renders it.
dist_data_id = uuid4()

# Arc layer: one arc per row, from (lat, lon) to (target_lat, target_lon).
arc_layer = {
    "id": "closest_distance",
    "type": "arc",
    "config": {
        "data_id": dist_data_id,
        "label": "distance to closest big cities",
        "columns": {
            "lng0": "lon",
            "lng1": "target_lon",
            "lat0": "lat",
            "lat1": "target_lat",
        },
        # NOTE(review): this layer spells the key "visConfig" while the other
        # layers in this notebook use "vis_config" - confirm the SDK accepts both.
        "visConfig": {"opacity": 0.8, "thickness": 0.3},
        "is_visible": True,
    },
}

distance_to_big_cities = UnfoldedMap()

# The layer is declared explicitly, so automatic layer creation is switched off.
distance_to_big_cities.add_dataset(
    dict(
        uuid=dist_data_id,
        label="Distance to closest big cities",
        data=coords_df,
    ),
    auto_create_layers=False,
)
distance_to_big_cities.add_layer(arc_layer)
distance_to_big_cities.set_view_state(
    dict(longitude=-119.417931, latitude=36.778259, zoom=4.5)
)

# Bare trailing expression: the notebook renders the map as the cell output.
distance_to_big_cities

## Data Preprocessing

Here we are preparing data for training a TensorFlow model:

In [None]:
# Rows with missing values make up less than 5 % of the data, so they are
# simply discarded (in place).
housing.dropna(axis=0, how="any", inplace=True)

# Model inputs: the raw numeric columns plus the categorical ocean_proximity.
feature_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity",
]
X = housing.reindex(columns=feature_columns)

# Regression target, kept as a single-column frame.
y = housing.reindex(columns=["median_house_value"])

# One-hot encode ocean_proximity into "ocean_proximity_<category>" indicator
# columns. drop_first=True leaves out the indicator of the first category, so
# k categories produce k - 1 columns.
X = pd.get_dummies(
    X,
    columns=["ocean_proximity"],
    prefix=["ocean_proximity"],
    drop_first=True,
)

## Data Splitting

Splitting the data into training, validation and test sets

In [None]:
# Step 1: hold out 20 % of all rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Step 2: take 25 % of the remainder (about 20 % of all rows) for validation,
# which leaves roughly 60 % for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)

# Keep an independent copy of the test features before X_test is reassigned,
# so the original longitude/latitude values remain available afterwards.
start_values = X_test.copy()

## Feature Scaling

Using standard scaling with the mean and standard deviation from the training dataset to avoid data leakage

In [None]:
# feature standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## Price Prediction Model

Here we specify the parameters for the TensorFlow model:

In [None]:
# Width of the network input, taken from the encoded feature matrix.
number_of_features = X.shape[1]

model = Sequential()

# First layer: as wide as the input; input_dim fixes the input size.
model.add(Dense(number_of_features, activation="relu", input_dim=number_of_features))

# Stack of fully connected ReLU layers, added in the listed order.
for units in (512, 512, 256, 128, 64, 32):
    model.add(Dense(units, activation="relu"))

# Single linear output unit for the regression target.
model.add(Dense(1, activation="linear"))

In [None]:
# Train against mean squared error with the Adam optimizer; MSE and MAE are
# additionally reported as metrics.
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mse", "mae"],
)

# Print the layer-by-layer overview of the network.
model.summary()

### Training

Here we are starting the model training:

In [None]:
# Fit on the training split and score the validation split after every epoch.
# The returned History object is kept for plotting the loss curves.
history = model.fit(
    x=X_train,
    y=y_train.to_numpy(),
    validation_data=(X_val, y_val),
    epochs=70,
    batch_size=10,
    verbose=1,
)

### Evaluation

Here we look at how well the model was trained:

In [None]:
# summarize history for loss
# Per-epoch loss values recorded during training.
loss_train = history.history["loss"]
loss_val = history.history["val_loss"]

# Derive the x axis from the number of recorded epochs instead of repeating
# the epoch count as a literal; otherwise changing the number of epochs makes
# the plot calls fail with a length mismatch.
epochs = range(1, len(loss_train) + 1)

plt.figure(figsize=(10, 8))
plt.plot(epochs, loss_train, "g", label="Training loss")
plt.plot(epochs, loss_val, "b", label="Validation loss")
plt.title("Training and Validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In the above chart we can see that the training loss and validation loss are quite close to each other.

Now we can use the model to predict prices on unseen data

In [None]:
# Run the trained network on the scaled test features; the predictions are
# attached to the test coordinates further below.
y_pred = model.predict(X_test)

We can see that the loss function value on the test data is similar to the loss value on the training data

In [None]:
# Score the model on the held-out test split. The result is shown as the cell
# output: the loss followed by the metrics configured at compile time.
model.evaluate(X_test, y_test)

### Prediction

Let's now visualize the predicted numbers on the map

First, create a dataframe with predicted values obtained from the model

In [None]:
# Start from the unscaled test coordinates (a fresh copy, so adding a column
# does not touch start_values) ...
predict_data = start_values.loc[:, ["longitude", "latitude"]].copy()

# ... and attach the model output as the "price" column.
predict_data["price"] = y_pred

### Visualization

This map shows the predicted house prices in CA

In [None]:
# Identifier that ties the dataset to the layer that renders it.
price_data_id = uuid4()

# Ten-step colour ramp handed to the layer as its colour range.
price_palette = [
    "#E6F598",
    "#ABDDA4",
    "#66C2A5",
    "#3288BD",
    "#5E4FA2",
    "#9E0142",
    "#D53E4F",
    "#F46D43",
    "#FDAE61",
    "#FEE08B",
]

# Hexagon layer over the predicted rows, coloured by the "price" column.
price_layer = {
    "id": "housing_prices",
    "type": "hexagon",
    "config": {
        "label": "housing prices",
        "data_id": price_data_id,
        "columns": {"lat": "latitude", "lng": "longitude"},
        "is_visible": True,
        "color_scale": "quantize",
        "color_field": {"name": "price", "type": "real"},
        "vis_config": {"colorRange": {"colors": price_palette}},
    },
}

housing_predict_prices = UnfoldedMap()

# The layer is declared explicitly, so automatic layer creation is switched off.
housing_predict_prices.add_dataset(
    dict(
        uuid=price_data_id,
        label="Predict housing prices in CA",
        data=predict_data,
    ),
    auto_create_layers=False,
)
housing_predict_prices.add_layer(price_layer)
housing_predict_prices.set_view_state(
    dict(longitude=-119.417931, latitude=36.6, zoom=6)
)

# Bare trailing expression: the notebook renders the map as the cell output.
housing_predict_prices

## Clustering Model

Let's cluster the predicted data by price levels using the KMeans algorithm

In [None]:
# Number of price clusters.
k = 5

# A fixed seed makes the cluster assignment reproducible, in line with the
# seeded train/validation/test splits. Without it the cluster ids (and thus
# the colours in the chart and map below) change from run to run.
km = KMeans(n_clusters=k, init="k-means++", random_state=1)

# NOTE: this rebinds X, which held the model's feature matrix. Re-running the
# model-definition cell after this one would therefore see 3 columns.
X = predict_data[["latitude", "longitude", "price"]]

# Cluster on an independent copy so the label column never becomes part of
# the clustering input.
dtf_X = X.copy()
dtf_X["cluster"] = km.fit_predict(X)

# Attach the cluster labels to the prediction table (rows align on the index).
predict_data["cluster"] = dtf_X["cluster"]

### Visualization

Let's show the price clusters in a chart

In [None]:
fig, ax = plt.subplots()

# Longitude goes on the x axis and latitude on the y axis so that the chart
# has the same orientation as the maps (west-east horizontally, south-north
# vertically). The former `size_order` argument was dropped: it only has an
# effect together with `size`, which is not used here.
sns.scatterplot(
    x="longitude",
    y="latitude",
    data=predict_data,
    palette=sns.color_palette("bright", k),
    hue="cluster",
    ax=ax,
).set_title(f"Clustering (k={k})")

This map shows the same clusters in the geographic context

Here we can see that the prices for cities close to the largest cities are the highest, in contrast to those that are far from them and, moreover, far from the ocean.

In [None]:
# Identifier that ties the dataset to the layer that renders it.
prices_dataset_id = uuid4()

# Five colours handed to the layer as its colour range.
cluster_palette = ["#7FFFD4", "#8A2BE2", "#00008B", "#FF8C00", "#FF1493"]

# Point layer over the predicted rows, coloured by the "cluster" column.
cluster_layer = {
    "id": "prices_CA",
    "type": "point",
    "config": {
        "data_id": prices_dataset_id,
        "label": "clustering of prices",
        "columns": {"lat": "latitude", "lng": "longitude"},
        "is_visible": True,
        "color_scale": "quantize",
        "color_field": {"name": "cluster", "type": "real"},
        "vis_config": {"colorRange": {"colors": cluster_palette}},
    },
}

unfolded_map_prices = UnfoldedMap()

# The layer is declared explicitly, so automatic layer creation is switched off.
unfolded_map_prices.add_dataset(
    dict(uuid=prices_dataset_id, label="Prices", data=predict_data),
    auto_create_layers=False,
)
unfolded_map_prices.add_layer(cluster_layer)
unfolded_map_prices.set_view_state(
    dict(longitude=-119.417931, latitude=36.778259, zoom=4)
)

# Bare trailing expression: the notebook renders the map as the cell output.
unfolded_map_prices