# Building Searchable Documentation for Zillow's API Codes on Quandl's API Client
## *Edward Krueger*

The Quandl documentation for the Zillow data at https://www.quandl.com/data/ZILLOW-Zillow-Real-Estate-Research/documentation/data-organization, isn't quite correct or complete. Even if it were, a searchable dictionary would be useful. Also, Quandl's search isn't completely useful. Fortunately, there is a zipped file with all of the codes and their descriptions.

I will build my own data dictionary and search for the data.

## Setup

### Import and configure packages

In [1]:
# import packages to unzip the list of Quandl Codes
from io import BytesIO
from zipfile import ZipFile
import urllib.request

# import and configure pandas
import pandas as pd
pd.options.display.max_colwidth = 100
pd.set_option('display.max_rows', 1000)

# import quandl and configure API key
import quandl
quandl.ApiConfig.api_key = "qTtZzvXVi7jgbHx5tng8"

### Build functions to create and search the documentation

In [2]:
# strips characters that are not in the alphabet from a string
def strip_non_alpha(s):
    return "".join(filter(lambda x : x.isalpha(), s))

# converts an area type code into its area type
def area_type_code_to_area_type(s):
    if s =="S":
        return "State"
    elif s == "CO":
        return "County"
    elif s == "M":
        return "Greater Metropolitan Area"
    elif s == "C":
        return "City"
    elif s == "N":
        return "Neighborhood"
    elif s == "Z":
        return "Zip Code"
    else:
        return ""

# loads and formats the documentation from the zip file
def load_documentation(url_str = "https://www.quandl.com/api/v3/databases/ZILLOW/codes?api_key=qTtZzvXVi7jgbHx5tng8"):
    
    # open the zip file as a ZipFile
    url = urllib.request.urlopen(url_str)
    zip_file = ZipFile(BytesIO(url.read()))
    
    # find the name of the only file and open it
    names = zip_file.namelist()
    name = names[0]
    
    # load into pandas
    df = pd.read_csv(zip_file.open(name), header = None, low_memory = False)
    
    # rename to columns
    df.columns = ["code", "description"]
    
    # get the area type and area code info
    df["zillow_code"] = df["code"].apply(lambda x : x.split("/")[1])
    df["area_cat_code"] = df["zillow_code"].apply(lambda x : x.split("_")[0])
    df["area_type_code"] = df["area_cat_code"].apply(lambda x : strip_non_alpha(x[0:2]))
    df["area_type"] = df["area_type_code"].apply(area_type_code_to_area_type)
    
    # drop duplicate area_cat_codes
    df = df.drop_duplicates("area_cat_code")
    
    # strip the first part of the description
    df["description"] = df["description"].apply(lambda x : x.split(":")[1])
    df["area"] = df["description"].apply(lambda x : x.split("-")[-1].strip())
    
    # drop unessesary variables
    df = df[["area", "area_cat_code", "area_type_code", "area_type"]]
    
    df["area_type_code"] = pd.Categorical(df["area_type_code"], ["S", "CO", "M", "C", "N", "Z"])
    df.sort_values("area_type_code")
    return df

def filter_documentation(df, area_type_code):
    # subset by area_cat
    df = df[df["area_type_code"] == area_type_code]
    df = df.set_index("area").sort_index()
    return df
    
def search_documentation(df, term):
    df = df[df["area"].str.lower().str.contains(term.lower())]
    df = df[["area", "area_type", "area_cat_code"]]
    return df

## Explore the Documentation

### Load the documentation

In [3]:
df = load_documentation()

### Display the shape, head and tail of the documentation

In [4]:
df.shape

(74410, 4)

In [5]:
df.head()

Unnamed: 0,area,area_cat_code,area_type_code,area_type
0,Arkansas,S4,S,State
1,Colorado,S7,S,State
2,Alaska,S2,S,State
3,Alabama,S3,S,State
4,United States,S1,S,State


In [6]:
df.tail()

Unnamed: 0,area,area_cat_code,area_type_code,area_type
1423949,"Woodville, MS",C24926,C,City
1427109,"Middlesboro, KY",C24924,C,City
1427136,"Pomfret Center, CT",C24925,C,City
1427999,"Clarkton, MO",C24928,C,City
1428160,"New Rockford, ND",C24929,C,City


### Create a DataFrame for each type of area

In [7]:
# find and show the unique area categories and thier codes
df[["area_type_code", "area_type"]].drop_duplicates().reset_index(drop = True)

Unnamed: 0,area_type_code,area_type
0,S,State
1,M,Greater Metropolitan Area
2,CO,County
3,N,Neighborhood
4,C,City
5,Z,Zip Code


In [8]:
# create a DataFrame for each type
S_df = filter_documentation(df, "S")
M_df = filter_documentation(df, "M")
C_df = filter_documentation(df, "C")
CO_df = filter_documentation(df, "CO")
N_df = filter_documentation(df, "N")
Z_df = filter_documentation(df, "Z")

## Search the documentation for Austin

In [9]:
result_df = search_documentation(df, "austin")
result_df

Unnamed: 0,area,area_type,area_cat_code
4018,"Austin, TX",Greater Metropolitan Area,M31
13351,"Austin, MN",Greater Metropolitan Area,M893
42197,"Austin, TX",County,CO1749
121430,"South Austin, Chicago, IL",Neighborhood,N111
121685,"North Austin, Chicago, IL",Neighborhood,N363
123005,"Barton Creek, Austin, TX",Neighborhood,N1683
126939,"West Oak Hill, Austin, TX",Neighborhood,N2706
127010,"East Oak Hill, Austin, TX",Neighborhood,N2736
135535,"Austin Avenue, Waco, TX",Neighborhood,N10157
137280,"Austin's Colony, Bryan, TX",Neighborhood,N11707


## Search the documentation for Meyerland

In [10]:
result_df = search_documentation(df, "meyerland")
result_df

Unnamed: 0,area,area_type,area_cat_code
137888,"Meyerland Area, Houston, TX",Neighborhood,N11814
