# Utah Home Value Predictor
This is a regression ML project that inputs an image of a house in the Wasatch Front and outputs an estimated home value as of 2023.
The training data is based on assessor-provided images of single-family homes in Davis County, UT.
Test data should be valid for homes in non-rural regions of Weber, Davis, Salt Lake, and Utah County, UT, but there may be slight variations due to location.

In [34]:
# Non-ML imports
import pandas as pd
import numpy as np
import random
import requests
import base64
import os
from PIL import Image

dtype = {
  "PARCEL ID": str,
  "PARCEL ZIP CODE": str
}
dtype_main = {
  "Parcel ID": str,
  "Assessed Value": np.float32
}
allowed_prop_types = ['Residential']  # allowed property types on the Davis County parcel system
blacklist_parcels = [  # parcels with invalid images or too much foliage blocking the house
  '010450014',
  '012590319',
  '030810075',
  '050330045',
  '050460021',
  '050780007',
  '050790009',
  '051130041',
  '060140083',
  '060920057',
  '070130004',
  '070140068',
  '073010128',
  '080120020',
  '080450003',
  '080980011',
  '081690013',
  '082280005',
  '084470305',
  '085850315',
  '090480038',
  '090600006',
  '091010078',
  '093380401',
  '100810002',
  '111870221',
  '114710015',
  '114930076',
  '116520007',
  '117750010',
  '127180016',
  '130170032',
  '130760106',
  '131630030',
  '140430050',
  '140560003',
  '140630013',
  '140650050',
  '140680016',
  '143430059',
  '143510048',
  '144450025',
  '145480125',
  '150400102',
]
random_seed = '70f2f796-b097-4215-b2a8-aa54cd499bbf'  # you can change this but don't be surprised if you get invalid homes that you have to filter through; this seed has been checked up to 1300 parcels
train_count = 1000  # how many instances to pull for the training/validation
epochs = 5  # how many epochs
debug = True  # some print statements

## Stage 0 (optional): Download/Filter Parcel Master
You can download the Parcel Master at https://opendata.gis.utah.gov. There is no direct URL for this so just save it as `./parcel_list/parcels_raw.csv`.

In [41]:
with open('parcel_list/parcels_raw.csv', 'r') as f:
  df = pd.read_csv(f, dtype=dtype)

  # Remove duplicate parcel IDs
  df = df.drop_duplicates(subset='PARCEL ID', keep="last")

  # Don't include any parcels that are not Private ownership
  df = df[df['OWNERSHIP TYPE'] == 'Private']

  # Don't include any parcels that have more than 10000 sqm (~2.5 acres) as that will mess up the data collection
  df = df[df['Shape__Area'] < 10000]

  # Only keep the Parcel IDs and ZIPs as a primary key
  df = df[['PARCEL ID','PARCEL ZIP CODE','Shape__Area']]

  df.to_csv('parcel_list/parcels_filtered.csv', index=False)

## Stage 1: Get Data from DC Parcel Search
This will get the data + images from the Davis County Parcel Search. If you already have files in `images/` and `property_attributes.csv` the download script will not be invoked and only `main_df` needs be instantiated.

In [16]:
# get list of filtered parcels
with open('parcel_list/parcels_filtered.csv', 'r') as f:
  df = pd.read_csv(f, dtype=dtype, index_col='PARCEL ID')

# set seed for parcel randomization
parcels = df.index.values.tolist()
random.seed(random_seed)
random.shuffle(parcels)

try:
  with open('property_attributes.csv', 'r') as f:
    # if file already exists, automatically load it
    main_df = pd.read_csv(f, dtype=dtype_main, index_col='Parcel ID')
except:
  # Main Data collection if missing
  i = -1
  i_success = 0
  main_df = pd.DataFrame(columns=[
    'Parcel ID',
    'Property Type',
    'Property Size',
    'Year Built',
    'Assessed Value'
  ], dtype=dtype_main)
  main_df = main_df.set_index('Parcel ID')

  while i_success < train_count:
    i += 1

    if debug:
      print(f'Trying iloc={i} ({parcels[i]})')

    data_core = requests.get(f'https://webportal.daviscountyutah.gov/App/PropertySearch/api/parcel/buildings/{parcels[i]}').json()

    # Blank parcel (no property)
    if len(data_core) == 0:
      if debug:
        print(f'Skipped {parcels[i]} (REASON: data)')
      continue

    data_core = data_core[0]

    # Missing physical information
    if data_core["propertyType"] not in allowed_prop_types or\
      'bltasYearBuilt' not in data_core or\
      'landGrossAcres' not in data_core or\
      data_core['landGrossAcres'] == 0:
      if debug:
        print(f'Skipped {parcels[i]} (REASON: data_core)')
      continue
    
    data_value = requests.get(f'https://webportal.daviscountyutah.gov/App/PropertySearch/api/taxrecord/{parcels[i]}').json()

    # Missing market value information
    if len(data_value) == 0 or\
      'marketImproveValue' not in data_value[0] or\
      'marketLandValue' not in data_value[0] or\
      data_value[0]['marketImproveValue'] == 0 or\
      data_value[0]['marketLandValue'] == 0:
      if debug:
        print(f'Skipped {parcels[i]} (REASON: data_value)')
      continue

    data_image = requests.get(f'https://webportal.daviscountyutah.gov/App/PropertySearch/api/parcel/images/{parcels[i]}').json()

    # Missing image
    if len(data_image) == 0 or parcels[i] in blacklist_parcels:
      if debug:
        print(f'Skipped {parcels[i]} (REASON: data_image)')
      continue
    
    main_df.loc[parcels[i]] = [
      data_core["propertyType"],
      data_core["landGrossAcres"],
      data_core["bltasYearBuilt"],
      data_value[0]['marketImproveValue'] + data_value[0]['marketLandValue']
    ]

    # export image
    raw_image = str.encode(data_image[0].replace('data:image/jpeg;base64,', ''))
    
    with open(f'images/{parcels[i]}.jpg', 'wb') as b:
      b.write(base64.decodebytes(raw_image))
    
    i_success += 1
    
  main_df.to_csv('property_attributes.csv')
main_df["Assessed Value"] = main_df["Assessed Value"] / 1000000
main_df["Assessed Value"] = main_df["Assessed Value"].astype('float32')

In [40]:
# Resize all images to be 640x480 ~ 4:3 ratio
def convert_image(i):
  with open(i, 'rb') as f:
    img = Image.open(f)
    w = img.width
    h = img.height

    if w == 640 and h == 480:
      return
    
    # other aspect ratio in landscape mode
    elif w >= h * 4/3:
      target_w = h * 4/3
      biaxial_cropped_w = (w - target_w) / 2

      img = img.crop((biaxial_cropped_w, 0, w - biaxial_cropped_w, h))
    
    # other aspect ratio in portrait mode
    else:
      target_h = w / (4/3)
      biaxial_cropped_h = (h - target_h) / 2

      img = img.crop((0, biaxial_cropped_h, w, h - biaxial_cropped_h))
    
    img = img.resize((640, 480))
    img.save(i)

for i in os.listdir('images'):
  if '.jpg' in i:  # valid image
    convert_image(f"images/{i}")


## Stage 2: Time to Train!
Let's train our data now using a simple convolutional neural network (CNN).

In [41]:
# ML imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms

# function to transform PIL image into np tensor
transform_func = transforms.Compose([
  transforms.ToTensor()
])

# order of priority: NVIDIA Cuda, Apple MPS, CPU.
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))

# Dataset class: automatically partitions into 80% train/20% validation
class HomeValueDataset(Dataset):
  def __init__(self, train=True):
    self.imgs = [transform_func(Image.open(f"images/{x}.jpg")) for x in main_df.index.values]
    self.index_modifier = 0 if train else train_count - (train_count // 5)
    self.train = train

  def __getitem__(self, index):
    input = self.imgs[index+self.index_modifier]
    output = main_df["Assessed Value"].iloc[index+self.index_modifier]

    return input, output

  def __len__(self):
    return len(self.imgs) - (len(self.imgs) // 5) if self.train else len(self.imgs) // 5

class HomeValueCNN(nn.Module):
  def __init__(self, num_outputs=1):
    super(HomeValueCNN, self).__init__()
    
    # Load the VGG16 model pre-trained on ImageNet
    vgg = models.vgg16(pretrained=True)
    
    # Modify the classifier part of VGG to suit 640x480 images
    # Remove the last classifier layers and adapt them to regression
    
    # The VGG architecture for reference
    self.features = vgg.features  # Convolutional part remains the same
    
    # Define custom classifier
    self.regressor = nn.Sequential(
        nn.Linear(512 * 20 * 15, 4096),  # Adjust input size to match the output from conv layers for 640x480
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, num_outputs)  # Output size for regression
    )
  
  def forward(self, x):
    # Forward through the VGG feature extractor
    x = self.features(x)
    
    # Flatten the output from the conv layers
    x = torch.flatten(x, 1)
    
    # Forward through the regressor
    x = self.regressor(x)
    
    return x

# Instantiate the model
model = HomeValueCNN()

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /Users/darrenrs/.cache/torch/hub/checkpoints/vgg16-397923af.pth
Python(92395) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|██████████| 528M/528M [00:18<00:00, 30.2MB/s] 


In [43]:
# Define loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# main train loop
def train_model(model, train_loader, val_loader, epochs=epochs):
  model.to(device)

  for epoch in range(epochs):
    model.train()
    training_run_loss = 0.0
    
    for images, labels in train_loader:
      images, labels = images.to(device), labels.to(device)
      
      # Convert labels to float for regression and adjust shape
      labels = labels.float().unsqueeze(1)  # Change shape from [batch_size] to [batch_size, 1]

      # Zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(images)
      loss = criterion(outputs, labels)

      # Backward pass and optimize
      loss.backward()
      optimizer.step()

      training_run_loss += loss.item() * images.size(0)

    training_loss = training_run_loss / len(train_loader.dataset)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
      for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
      
        # Convert labels to float for regression and adjust shape
        labels = labels.float().unsqueeze(1)  # Change shape from [batch_size] to [batch_size, 1]

        outputs = model(images)
        loss = criterion(outputs, labels)
        
        val_loss += loss.item() * images.size(0)

    val_loss /= len(val_loader.dataset)
    
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {training_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Example usage (assuming train_loader and val_loader are defined DataLoader objects)
train_dataset = HomeValueDataset(train=True)
val_dataset = HomeValueDataset(train=False)

# Initialize DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, pin_memory=True)

train_model(model, train_loader, val_loader)

RuntimeError: MPS backend out of memory (MPS allocated: 15.77 GB, other allocations: 82.47 MB, max allowed: 18.13 GB). Tried to allocate 2.34 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Stage 3

In [31]:
convert_image("test/TestHome700k.jpg")
test_image_tensor = transform_func(Image.open("test/TestHome700k.jpg")).unsqueeze(0)
test_image_tensor = test_image_tensor.to(device)

with torch.no_grad():
  prediction = model(test_image_tensor)
  predicted_value_700k = prediction.item() * 1e6

In [32]:
convert_image("test/TestHome400k.jpg")
test_image_tensor = transform_func(Image.open("test/TestHome400k.jpg")).unsqueeze(0)
test_image_tensor = test_image_tensor.to(device)

with torch.no_grad():
  prediction = model(test_image_tensor)
  predicted_value_400k = prediction.item() * 1e6

In [33]:
convert_image("test/TestHome1.3M.jpg")
test_image_tensor = transform_func(Image.open("test/TestHome1.3M.jpg")).unsqueeze(0)
test_image_tensor = test_image_tensor.to(device)

with torch.no_grad():
  prediction = model(test_image_tensor)
  predicted_value_1300k = prediction.item() * 1e6

In [38]:
model.fc_input_size

1228800