# Download Zillow Images

While I was writing this, Zillow's API was down, so I instead used a URL on their website that doesn't seem to go through the API.

**NOTE:** I do not recommend using this method to get results from Zillow.com; their API is the better route.

In [6]:
from collections import namedtuple
from time import sleep
from lxml import html

import requests
import logging
import json

In [7]:
# Module-wide logger; DEBUG level so the per-listing progress messages are emitted.
logger = logging.getLogger("Download Zillow Images")
logger.setLevel(logging.DEBUG)

# Search rectangle for Portland, OR according to Zillow.com (sent as the "rect" parameter).
PORTLAND = "-123014946,45489020,-122293968,45594822"

# One scraped photo: the listing ID it belongs to, the photo URL and the listing price.
Result = namedtuple("Result", "guid image_url price")

In [8]:
def get_images_from_detail_zillow_listing(zp_id):
    """
    Open a Zillow detail page and scrape it for any photo URLs.

    Parameters
    ----------
    zp_id : str
        Zillow listing ID.

    Returns
    -------
    list(str) or None
        The "src" (falling back to "href") attribute of every photo
        ``<img>`` found on the page. An entry is None when the element has
        neither attribute.

    Notes
    -----
    Returns None (after logging an error) when the response status is not
    200. Exceptions raised by ``requests`` itself are not caught here.
    """
    response = requests.get(
        "http://www.zillow.com/homedetails/{zp_id}_zpid/".format(
            zp_id=zp_id))

    # Compare by value: ``is`` tests object identity, which only happens to
    # work for small integers as an interpreter implementation detail.
    if response.status_code != 200:
        logger.error("Trouble hitting zillow.com: %i", response.status_code)
        return None

    tree = html.fromstring(response.text)

    # Build a real list (``map`` is a lazy iterator on Python 3) so the
    # result matches the documented return type.
    return [
        image.get("src") or image.get("href")
        for image in tree.xpath("//ol[@class='photos']/li/a/img")
    ]


def get_page_of_zillow_articles(page, rect=PORTLAND):
    """
    Request a page of search results from Zillow's "GetResults.htm" webpage.

    Parameters
    ----------
    page : int
        Page number to request, can be used in pagination.
    rect : str
        Comma-separated coordinates of the map rectangle to search,
        defaults to the Portland, OR region.

    Returns
    -------
    list or None
        The ``["map"]["properties"]`` entry of the JSON response, i.e. the
        Articles found in the search. Callers in this file read the listing
        ID from index 0 and the price from index 3 of each Article.

    Notes
    -----
    Returns None (after logging an error) when the response status is not
    200. Exceptions raised by ``requests`` itself are not caught here.
    """
    url = 'http://www.zillow.com/search/GetResults.htm'

    response = requests.get(
        url,
        params={
            "spt": "homes",  # We ignore any foreclosure or rental properties
            "status": 100000,
            "zoom": 10,
            "rect": rect,
            "sort": "days",
            "search": "maplist",
            "p": page,
        })

    # Compare by value: ``is`` tests object identity, which only happens to
    # work for small integers as an interpreter implementation detail.
    if response.status_code != 200:
        logger.error("Trouble hitting zillow.com: %i", response.status_code)
        return None

    results = response.json()

    return results["map"]["properties"]


def get_zillow_results_from_web(max_articles=None):
    """
    Get a single page of results from Zillow's search and then slowly request their corresponding
    property detail pages to get images.

    Each listing's results are also written to "./details/<zp_id>.json";
    the "./details" directory must already exist.

    Parameters
    ----------
    max_articles : int, optional
        Maximum number of listings to process. Once this many have been
        yielded, no further detail pages are requested. None means no limit.

    Yields
    ------
    list(Result)
        One Result per image found for a Zillow listing; the list is empty
        when no images were found or the detail page could not be fetched.
    """
    articles = get_page_of_zillow_articles(1, rect=PORTLAND)
    if articles is None:
        # The search request failed (already logged); nothing to iterate.
        return

    for counter, article in enumerate(articles):
        # ``counter`` is zero-based, so ``>=`` stops after exactly
        # ``max_articles`` listings and before another detail request.
        if max_articles is not None and counter >= max_articles:
            logger.info("Stopped downloading, hit max number of articles.")
            break

        zp_id = article[0]
        price = article[3]

        logger.debug("Getting images for %s.", zp_id)
        # The detail scraper returns None on an HTTP error; treat that the
        # same as a listing without images instead of failing on iteration.
        images = get_images_from_detail_zillow_listing(zp_id) or []
        article_results = [Result(zp_id, image, price) for image in images]

        if not article_results:
            logger.info("No images found for %s, recording an empty list.", zp_id)

        # Result is a namedtuple, so each entry is serialised as a JSON array.
        with open("./details/{zp_id}.json".format(zp_id=zp_id), "w") as f:
            json.dump(
                {"results": article_results},
                f,
                indent=4,
                sort_keys=True)

        yield article_results

        # Add in a sleep timer if you actually use something like this, please never request too often.
        # sleep(0.5)


def download_image(result, image_dir):
    """
    Download a Zillow Result.

    Parameters
    ----------
    result : Result
        Result with an #image_url to download.
    image_dir : str
        Directory to store images, they are saved with their web filename
        (the last "/"-separated segment of the URL). The directory must
        already exist.

    Notes
    -----
    Nothing is written when the response status is not 200; the failure is
    logged instead. Exceptions raised by ``requests`` are not caught here.
    """
    logger.info("Downloading %s.", result)
    response = requests.get(result.image_url, stream=True)

    try:
        # Compare by value: ``is`` tests object identity, which only happens
        # to work for small integers as an interpreter implementation detail.
        if response.status_code != 200:
            logger.error(
                "Trouble downloading %s: %i",
                result.image_url, response.status_code)
            return

        filename = result.image_url.split("/")[-1]
        with open(
                "{base}/{image_filename}".format(
                    base=image_dir,
                    image_filename=filename),
                "wb") as f:
            # iter_content() defaults to 1-byte chunks; ask for larger blocks
            # so the image is not written one byte at a time.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    finally:
        # A streamed response holds its connection until closed.
        response.close()

In [9]:
# Process at most ~10 listings from the first page of search results.
for images in get_zillow_results_from_web(max_articles=10):
    # Each yielded item holds the results for a single listing; fetch them
    # one at a time into the local images directory.
    for result in images:
        download_image(result=result, image_dir="./images")

logger.info("Completed Downloads.")

DEBUG:Download Zillow Images:Getting images for 53929464.
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos3.zillowstatic.com/p_h/ISz0m8d8ev9jmr.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos2.zillowstatic.com/p_h/ISz0m8d4gt1foj.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos2.zillowstatic.com/p_h/ISz0nw15g3fvz7.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos3.zillowstatic.com/p_h/ISz0nw19e5nzxf.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos3.zillowstatic.com/p_h/ISz0m8cwkol7s3.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464, image_url='http://photos3.zillowstatic.com/p_h/ISz214hfj2k7tv.jpg', price=u'$299K').
INFO:Download Zillow Images:Downloading Result(guid=53929464