# Transform Downloaded Zillow Details

We expect the previously downloaded information to be a set of image URLs together with their corresponding listing details. This data needs to be transformed into a format which can be used for further categorization or fed straight into Caffe.

In [17]:
from collections import namedtuple

import json
import glob
import csv

In [18]:
# One parsed row of a detail file: fields are guid, image_url and price, in that order.
Result = namedtuple("Result", "guid image_url price")

In [19]:
def parse_image_detail_files(image_detail_files="./details/*.json"):
    """
    Parse the image detail json files.

    Parameters
    ----------
    image_detail_files : str
        Glob pattern to search for Zillow detail pages for images.

    Yields
    ------
    Result
        Each Zillow result from the different Zillow files, one file at a
        time, with the files visited in sorted filename order.

    Notes
    -----
    Every matched file must contain a JSON object with a "results" entry
    holding a list of rows; each row is unpacked positionally into a
    ``Result``. A pattern that matches no files yields nothing.
    """
    # glob.glob returns matches in arbitrary order; sorting keeps the output
    # (and any file generated from it) reproducible between runs.
    for image_detail_filename in sorted(glob.glob(image_detail_files)):
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(image_detail_filename, "r", encoding="utf-8") as image_detail_file:
            raw_results = json.load(image_detail_file)["results"]

        # The file is already closed here, so a slow consumer of this
        # generator does not keep file handles open.
        for raw_result in raw_results:
            yield Result(*raw_result)

In [20]:
def mturk_variables(output_filename, parser=parse_image_detail_files):
    """
    Write a list of images as a file full of variables for mturk to create as
    separate HITs.

    The details are from the script to get a set of images from Zillow, when the
    script runs it downloads the JSON for that listing including image URLs.

    Parameters
    ----------
    output_filename : str
        File to be used by mturk, in TSV format. It is overwritten if it
        already exists.
    parser : function
        Function which yields one Result as it goes. It is called with no
        arguments; only the ``guid`` and ``image_url`` attributes of each
        yielded item are used.

    Notes
    -----
    The output has a header row with the columns ``id`` and ``url``, followed
    by one row per yielded result.
    """
    # newline="" hands line-ending control to the csv module; without it the
    # writer's "\r\n" terminator becomes "\r\r\n" on platforms that translate
    # newlines in text mode.
    with open(output_filename, "w", newline="") as images_file:
        writer = csv.DictWriter(
            images_file,
            delimiter="\t",
            fieldnames=["id", "url"])
        writer.writeheader()
        writer.writerows(
            {"id": result.guid, "url": result.image_url}
            for result in parser())

In [21]:
def web_categorization(output_filename, parser=parse_image_detail_files):
    """
    Write a TSV which can be used with the webapp's `python manage.py import_uncategorized_images` command.

    Parameters
    ----------
    output_filename : str
        Location to write the TSV, the webapp will need access to it at least once.
        It is overwritten if it already exists.
    parser : function
        Function which yields one Result as it goes. It is called with no
        arguments; only the ``guid`` and ``image_url`` attributes of each
        yielded item are used.

    Notes
    -----
    This is practically the same as `mturk_variables` (only the id column is
    named ``zillow_id`` instead of ``id``) but has changed a few times.
    """
    # newline="" hands line-ending control to the csv module; without it the
    # writer's "\r\n" terminator becomes "\r\r\n" on platforms that translate
    # newlines in text mode.
    with open(output_filename, "w", newline="") as images_file:
        writer = csv.DictWriter(
            images_file,
            delimiter="\t",
            fieldnames=["zillow_id", "url"])
        writer.writeheader()
        writer.writerows(
            {"zillow_id": result.guid, "url": result.image_url}
            for result in parser())

In [22]:
# Generate the mturk input TSV from the default detail files (./details/*.json).
# NOTE: open(..., "w") does not create directories, so ./mturk/input must already exist.
mturk_variables("./mturk/input/images_with_ids.tsv")

In [23]:
# Generate the webapp import TSV from the default detail files (./details/*.json).
# NOTE: open(..., "w") does not create directories, so ./categorization must already exist.
web_categorization("./categorization/uncategorized_images.tsv")