# Utah Home Value Predictor
This is a regression ML project that inputs an image of a house in the Wasatch Front and outputs an estimated home value as of 2023.
The training data is based on assessor-provided images of single-family homes in Davis County, UT.
Test data should be valid for homes in non-rural regions of Weber, Davis, Salt Lake, and Utah Counties, UT, although predictions may vary slightly with location.

In [60]:
# Non-ML imports
import pandas as pd
import numpy as np
import random
import requests
import base64
import os
from PIL import Image

# Column dtypes used when reading the parcel master CSVs (raw and filtered).
# IDs and ZIP codes are kept as strings so that leading zeros (as in the IDs
# listed in blacklist_parcels below) are not lost to numeric parsing.
dtype = {
  "PARCEL ID": str,
  "PARCEL ZIP CODE": str
}
# Column dtypes used when re-loading property_attributes.csv into main_df.
dtype_main = {
  "Parcel ID": str,
  "Assessed Value": np.float32
}
# Property types (the "propertyType" field of the Davis County parcel system)
# that are accepted into the dataset; everything else is skipped.
allowed_prop_types = ['Residential']  # allowed property types on the Davis County parcel system
# Parcel IDs that are never added to the dataset, even when all their data is present.
blacklist_parcels = [  # parcels with invalid images or too much foliage blocking the house
  '010450014',
  '012590319',
  '030810075',
  '050330045',
  '050460021',
  '050780007',
  '050790009',
  '051130041',
  '060140083',
  '060920057',
  '070130004',
  '070140068',
  '073010128',
  '080120020',
  '080450003',
  '080980011',
  '081690013',
  '082280005',
  '084470305',
  '085850315',
  '090480038',
  '090600006',
  '091010078',
  '093380401',
  '100810002',
  '111870221',
  '114710015',
  '114930076',
  '116520007',
  '117750010',
  '127180016',
  '130170032',
  '130760106',
  '131630030',
  '140430050',
  '140560003',
  '140630013',
  '140650050',
  '140680016',
  '143430059',
  '143510048',
  '144450025',
  '145480125',
  '150400102',
]
# Seed for shuffling the parcel list, so the same parcels are visited in the same order.
# Changing it selects different parcels, which may include unusable homes that then
# have to be added to blacklist_parcels by hand.
random_seed = '70f2f796-b097-4215-b2a8-aa54cd499bbf'  # you can change this but don't be surprised if you get invalid homes that you have to filter through; this seed has been checked up to 1300 parcels
# Number of usable parcels to collect in Stage 1 for the combined training/validation set.
train_count = 1000  # how many instances to pull for the training/validation
# Default number of passes over the training data.
epochs = 20  # how many epochs
# When True, Stage 1 prints which parcel is being tried and why parcels are skipped.
debug = True  # some print statements

## Stage 0 (optional): Download/Filter Parcel Master
You can download the Parcel Master from https://opendata.gis.utah.gov. There is no direct download URL, so export the CSV manually and save it as `./parcel_list/parcels_raw.csv`.

In [41]:
# Parse the raw parcel master; the file only needs to stay open while it is read
with open('parcel_list/parcels_raw.csv', 'r') as f:
  df = pd.read_csv(f, dtype=dtype)

# One row per parcel: when an ID is repeated, the last occurrence wins
df = df.drop_duplicates(subset='PARCEL ID', keep="last")

# Keep only privately owned parcels under 10000 sqm (~2.5 acres);
# larger lots would mess up the data collection
is_private = df['OWNERSHIP TYPE'] == 'Private'
is_small_lot = df['Shape__Area'] < 10000

# Retain just the parcel ID (primary key), its ZIP code and the lot area
df = df.loc[is_private & is_small_lot, ['PARCEL ID', 'PARCEL ZIP CODE', 'Shape__Area']]

df.to_csv('parcel_list/parcels_filtered.csv', index=False)

## Stage 1: Get Data from DC Parcel Search
This stage downloads the property data and images from the Davis County Parcel Search. If `property_attributes.csv` already exists (alongside the files in `images/`), the download is skipped and `main_df` is simply loaded from that file.

In [16]:
# get list of filtered parcels
with open('parcel_list/parcels_filtered.csv', 'r') as f:
  df = pd.read_csv(f, dtype=dtype, index_col='PARCEL ID')

# Shuffle the parcel IDs with a fixed seed so every run visits the same
# parcels in the same order
parcels = df.index.values.tolist()
random.seed(random_seed)
random.shuffle(parcels)

api_root = 'https://webportal.daviscountyutah.gov/App/PropertySearch/api'
request_timeout = 30  # seconds; without a timeout a stalled connection would hang the download forever


def fetch_parcel_json(endpoint, parcel_id):
  """Request one Parcel Search API endpoint for `parcel_id` and return the decoded JSON body.

  `endpoint` is the path below the API root, e.g. 'parcel/buildings'.
  Raises whatever `requests` raises on connection problems or timeouts.
  """
  return requests.get(f'{api_root}/{endpoint}/{parcel_id}', timeout=request_timeout).json()


try:
  with open('property_attributes.csv', 'r') as f:
    # if file already exists, automatically load it
    main_df = pd.read_csv(f, dtype=dtype_main, index_col='Parcel ID')
except (FileNotFoundError, pd.errors.EmptyDataError):
  # Main data collection, only when the attributes file is missing or empty.
  # Any other failure (bad columns, Ctrl-C, ...) is deliberately NOT caught, so
  # it cannot silently trigger a full re-download.
  os.makedirs('images', exist_ok=True)
  blacklisted = set(blacklist_parcels)
  records = []  # one (parcel id, type, size, year built, value) tuple per accepted parcel

  for i, parcel in enumerate(parcels):
    if len(records) >= train_count:
      break

    if debug:
      print(f'Trying iloc={i} ({parcel})')

    # Blacklisted parcels are rejected up front so no requests are spent on them
    if parcel in blacklisted:
      if debug:
        print(f'Skipped {parcel} (REASON: data_image)')
      continue

    data_core = fetch_parcel_json('parcel/buildings', parcel)

    # Blank parcel (no property)
    if len(data_core) == 0:
      if debug:
        print(f'Skipped {parcel} (REASON: data)')
      continue

    data_core = data_core[0]

    # Wrong property type or missing physical information
    if data_core.get("propertyType") not in allowed_prop_types or\
      'bltasYearBuilt' not in data_core or\
      'landGrossAcres' not in data_core or\
      data_core['landGrossAcres'] == 0:
      if debug:
        print(f'Skipped {parcel} (REASON: data_core)')
      continue

    data_value = fetch_parcel_json('taxrecord', parcel)

    # Missing market value information
    if len(data_value) == 0 or\
      'marketImproveValue' not in data_value[0] or\
      'marketLandValue' not in data_value[0] or\
      data_value[0]['marketImproveValue'] == 0 or\
      data_value[0]['marketLandValue'] == 0:
      if debug:
        print(f'Skipped {parcel} (REASON: data_value)')
      continue

    data_image = fetch_parcel_json('parcel/images', parcel)

    # Missing image
    if len(data_image) == 0:
      if debug:
        print(f'Skipped {parcel} (REASON: data_image)')
      continue

    # export image: the API returns a base64 data URI, so strip the prefix and decode
    raw_image = str.encode(data_image[0].replace('data:image/jpeg;base64,', ''))

    with open(f'images/{parcel}.jpg', 'wb') as b:
      b.write(base64.decodebytes(raw_image))

    # Assessed value = improvement (building) value + land value
    records.append((
      parcel,
      data_core["propertyType"],
      data_core["landGrossAcres"],
      data_core["bltasYearBuilt"],
      data_value[0]['marketImproveValue'] + data_value[0]['marketLandValue']
    ))

  if len(records) < train_count:
    # Ran out of candidate parcels: keep what was collected instead of crashing
    print(f'Warning: only {len(records)} of the requested {train_count} parcels were usable')

  main_df = pd.DataFrame(records, columns=[
    'Parcel ID',
    'Property Type',
    'Property Size',
    'Year Built',
    'Assessed Value'
  ]).set_index('Parcel ID')

  main_df.to_csv('property_attributes.csv')

# Work in millions of dollars from here on (the CSV on disk keeps plain dollars)
main_df["Assessed Value"] = main_df["Assessed Value"] / 1000000
main_df["Assessed Value"] = main_df["Assessed Value"].astype('float32')

In [40]:
# Resize all images to be 640x480 ~ 4:3 ratio
def convert_image(i, size=(640, 480)):
  """Center-crop the image at path `i` to the aspect ratio of `size`, resize it, and overwrite the file.

  Args:
    i: path of the image file; it is modified in place.
    size: target (width, height) in pixels. Defaults to 640x480 (4:3).

  Returns:
    None. Images that already have exactly the target size are left untouched.
  """
  target_w, target_h = size

  # Image.open as a context manager closes the source file before it is
  # overwritten below, instead of writing to a path that is still open for reading.
  with Image.open(i) as img:
    w, h = img.size

    if w == target_w and h == target_h:
      return

    if w * target_h >= h * target_w:
      # Too wide for the target ratio: trim equal margins off the left and right
      kept_w = h * target_w / target_h
      margin = (w - kept_w) / 2
      box = (margin, 0, w - margin, h)
    else:
      # Too tall for the target ratio: trim equal margins off the top and bottom
      kept_h = w * target_h / target_w
      margin = (h - kept_h) / 2
      box = (0, margin, w, h - margin)

    # crop() loads the pixel data, so the result no longer depends on the open file
    converted = img.crop(box).resize((target_w, target_h))

  converted.save(i)

# Normalise every downloaded image in place. Match on the file extension rather
# than a substring, so names such as 'x.jpg.bak' are not treated as images.
for i in os.listdir('images'):
  if i.endswith('.jpg'):  # valid image
    convert_image(f"images/{i}")


## Stage 2: Time to Train!
Let's now train our model on the data, using a convolutional neural network (CNN) built on a pretrained ResNet-18.

In [58]:
# ML imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms

# Preprocessing pipeline: turns a PIL image into a torch tensor
to_tensor = transforms.ToTensor()
transform_func = transforms.Compose([to_tensor])

# Pick the compute device in order of priority: NVIDIA CUDA, Apple MPS, CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
elif torch.backends.mps.is_available():
  device = torch.device("mps")
else:
  device = torch.device("cpu")

# Dataset class: automatically partitions into 80% train/20% validation
class HomeValueDataset(Dataset):
  """Image/value pairs for the parcels in `main_df`, split 80% train / 20% validation.

  The first 80% of the rows of `main_df` form the training partition and the
  last 20% the validation partition. The split point is derived from the
  number of rows actually present, and each instance loads only the images of
  its own partition, so building both datasets holds every image in memory
  once rather than twice.

  Attributes:
    train: True for the training partition, False for validation.
    index_modifier: row offset of this partition's first item within `main_df`.
    imgs: image tensors of this partition, in `main_df` row order.
    labels: the matching "Assessed Value" entries, snapshotted at construction.
  """

  def __init__(self, train=True):
    parcel_ids = main_df.index.values
    total = len(parcel_ids)
    split = total - (total // 5)  # first row of the validation partition
    start, stop = (0, split) if train else (split, total)

    self.train = train
    self.index_modifier = start
    self.imgs = [self._load_image(x) for x in parcel_ids[start:stop]]
    self.labels = main_df["Assessed Value"].iloc[start:stop].values

  @staticmethod
  def _load_image(parcel_id):
    """Read images/<parcel_id>.jpg and return it as a 3-channel tensor."""
    # The context manager closes the file; convert("RGB") guarantees the three
    # channels ResNet expects even if a JPEG happens to be grayscale or CMYK.
    with Image.open(f"images/{parcel_id}.jpg") as img:
      return transform_func(img.convert("RGB"))

  def __getitem__(self, index):
    # Refuse out-of-range indices so one partition can never hand out rows of
    # the other, and so plain iteration over the dataset terminates.
    if not 0 <= index < len(self.imgs):
      raise IndexError(f"index {index} out of range for dataset of size {len(self.imgs)}")

    return self.imgs[index], self.labels[index]

  def __len__(self):
    return len(self.imgs)

class HomeValueCNN(nn.Module):
  """Pretrained ResNet-18 feature extractor followed by a linear regression head.

  Args:
    num_outputs: number of values predicted per image (default 1).
  """

  def __init__(self, num_outputs=1):
    super().__init__()

    # ResNet-18 backbone with pretrained weights
    backbone = models.resnet18(pretrained=True)

    # New regression head, sized from the backbone's original classifier input
    head = nn.Linear(backbone.fc.in_features, num_outputs)

    # Disable the original classifier so the backbone emits raw features
    backbone.fc = nn.Identity()

    # backbone -> head, applied in sequence
    self.model = nn.Sequential(backbone, head)

  def forward(self, x):
    """Run a batch of images `x` through the backbone and the regression head."""
    return self.model(x)

# Instantiate the model with its default single regression output.
# Construction requests the pretrained ResNet-18 weights.
model = HomeValueCNN()



In [61]:
# Regression objective: mean squared error between prediction and target
criterion = nn.MSELoss()

# Adam over every model parameter, learning rate 1e-4
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# main train loop
def train_model(model, train_loader, val_loader, epochs=epochs):
  """Train `model` and print the train/validation loss after every epoch.

  Uses the module-level `optimizer`, `criterion` and `device`.

  Args:
    model: network to train; it is moved to `device`.
    train_loader: DataLoader yielding (images, labels) training batches.
    val_loader: DataLoader yielding (images, labels) validation batches.
    epochs: number of passes over the training data.

  Returns:
    None. The per-sample average losses are only printed.
  """
  model.to(device)

  for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_run_loss = 0.0

    for batch_images, batch_targets in train_loader:
      batch_images = batch_images.to(device)
      # Targets become float with shape [batch_size, 1] to match the model output
      batch_targets = batch_targets.to(device).float().unsqueeze(1)

      optimizer.zero_grad()
      batch_loss = criterion(model(batch_images), batch_targets)
      batch_loss.backward()
      optimizer.step()

      # Weight the batch mean by the batch size so the epoch figure is a per-sample average
      train_run_loss += batch_loss.item() * batch_images.size(0)

    training_loss = train_run_loss / len(train_loader.dataset)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_run_loss = 0.0

    with torch.no_grad():
      for batch_images, batch_targets in val_loader:
        batch_images = batch_images.to(device)
        batch_targets = batch_targets.to(device).float().unsqueeze(1)

        batch_loss = criterion(model(batch_images), batch_targets)
        val_run_loss += batch_loss.item() * batch_images.size(0)

    val_loss = val_run_loss / len(val_loader.dataset)

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {training_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Build the training and validation partitions
train_dataset = HomeValueDataset(train=True)
val_dataset = HomeValueDataset(train=False)

# Both loaders share the batch size and pinned memory; only the training data is shuffled
loader_options = dict(batch_size=5, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Run the training loop with the default number of epochs
train_model(model, train_loader, val_loader)

Epoch 1/20, Train Loss: 0.0792, Validation Loss: 0.1483
Epoch 2/20, Train Loss: 0.0348, Validation Loss: 0.0572
Epoch 3/20, Train Loss: 0.0219, Validation Loss: 0.0513
Epoch 4/20, Train Loss: 0.0166, Validation Loss: 0.0311
Epoch 5/20, Train Loss: 0.0124, Validation Loss: 0.0256
Epoch 6/20, Train Loss: 0.0103, Validation Loss: 0.0305
Epoch 7/20, Train Loss: 0.0098, Validation Loss: 0.0398
Epoch 8/20, Train Loss: 0.0088, Validation Loss: 0.0278
Epoch 9/20, Train Loss: 0.0094, Validation Loss: 0.0356
Epoch 10/20, Train Loss: 0.0084, Validation Loss: 0.0382
Epoch 11/20, Train Loss: 0.0101, Validation Loss: 0.0301
Epoch 12/20, Train Loss: 0.0073, Validation Loss: 0.0297
Epoch 13/20, Train Loss: 0.0084, Validation Loss: 0.0956
Epoch 14/20, Train Loss: 0.0082, Validation Loss: 0.0339
Epoch 15/20, Train Loss: 0.0092, Validation Loss: 0.0344
Epoch 16/20, Train Loss: 0.0084, Validation Loss: 0.0380
Epoch 17/20, Train Loss: 0.0106, Validation Loss: 0.0267
Epoch 18/20, Train Loss: 0.0086, Validat

In [65]:
# Persist the model's parameters (its state_dict, not the whole module object) to utsfhval.pth
torch.save(model.state_dict(), 'utsfhval.pth')

## Stage 3: The Tenuous Test
How did our model perform? We'll evaluate it with the mean absolute error between the predicted and the actual home values, expressed in millions of dollars. Multiply the reported figure by 1 million to get the average error in dollars.

In [82]:
import re

# Inference mode: ResNet's batch-norm layers must use their running statistics,
# which matters most when predicting one image at a time.
model.eval()

test_data = []  # (actual value in $, predicted value in $) per test image

for i in os.listdir("test"):
  if not i.endswith('.jpg'):
    continue

  # NOTE(review): assumes the first run of digits in the file name is the
  # actual price in thousands of dollars - confirm the naming convention.
  price_match = re.search(r'\d+', i)
  if price_match is None:
    print(f"Skipped {i} (no price in file name)")
    continue
  msrp = float(price_match.group()) * 1000

  # Bring the test image to the same size as the training images (in place)
  convert_image(f"test/{i}")
  with Image.open(f"test/{i}") as img:
    test_image_tensor = transform_func(img.convert("RGB")).unsqueeze(0)
  test_image_tensor = test_image_tensor.to(device)

  with torch.no_grad():
    prediction = model(test_image_tensor)

  # The model predicts in millions of dollars; convert back to dollars
  test_data.append(
    (msrp, prediction.item() * 1e6)
  )

if test_data:
  # Mean absolute error, reported in millions of dollars
  mae = sum([abs((x - y)/1e6) for x, y in test_data])/len(test_data)
  print(f"Current MAE (in $ millions): {mae}")
else:
  print("No test images found in test/")

Current MSE: 0.2174291739463806
