In [198]:
from bs4 import BeautifulSoup
import urllib2
import sys
import csv
import untangle
import re
import pandas as pd
# Zillow API key (ZWS-ID). GetZip and getRentPrice use sys.argv[1] as the default
# value of their KEY parameter, so the key is stored in that slot.
# Slice assignment replaces argv[1] when it exists and appends it when sys.argv has
# only one element; plain `sys.argv[1] = ...` raises IndexError in that case.
sys.argv[1:2] = ['YOUR ZILLOW API KEY']

In [199]:
# Transfer list of list to csv file to save it
'''
Take list of list as input
Generate csv file and save
'''

def listToCsv(putListHere, filename="zillow_citydata-1.csv"): #list is a list of lists [[],[],[],[]]
    """Write a list of rows to a CSV file.

    putListHere -- list of lists; each inner list becomes one CSV row.
    filename    -- path of the file to write. Defaults to "zillow_citydata-1.csv",
                   the path this function always used before the parameter existed.

    The file is overwritten if it already exists.
    """
    # The csv module needs a binary handle on Python 2 and a text handle opened
    # with newline="" on Python 3; the wrong one yields blank rows or a TypeError.
    if sys.version_info[0] < 3:
        data = open(filename, "wb")
    else:
        data = open(filename, "w", newline="")
    with data:
        csv.writer(data).writerows(putListHere)

In [200]:
# Use the Zillow API to get the list of zip codes for the input city/state
'''
Input: city and state
Returns a list of zip codes
'''

# KEY is the Zillow API key (ZWS-ID). Do not paste real keys into the notebook; it is read from sys.argv[1] by default.
def GetZip(city, state, KEY = sys.argv[1]): #given a city and state, returns a list of zip codes from API GetRegionChildren
    """Return the zip codes Zillow's GetRegionChildren API lists for a city.

    city  -- city name; spaces are escaped as %20 before the request
    state -- state name or abbreviation; spaces are escaped the same way
    KEY   -- Zillow API key (zws-id); defaults to sys.argv[1] as it was when
             this function was defined

    Returns a list with the `name` text of every `region` element in the response.
    Errors from urllib2.urlopen are not caught here, and neither are errors
    raised while navigating a response that lacks the expected elements.
    """
    QuoteURL = "http://www.zillow.com/webservice/GetRegionChildren.htm?zws-id=%s&state=%s&city=%s&childtype=zipcode"

    # Only spaces are escaped; other reserved characters are passed through unchanged.
    city = city.replace(' ', '%20')
    state = state.replace(' ', '%20')
    URL = QuoteURL % (KEY, state, city)

    # Close the connection as soon as the body is read, even if read() fails.
    response = urllib2.urlopen(URL)
    try:
        xmldata = response.read()
    finally:
        response.close()

    data = untangle.parse(xmldata)
    regions = data.RegionChildren_regionchildren.response.list.region
    return [zipcode.name.cdata for zipcode in regions]

In [201]:
# Get property Id and property detail (square footage)
'''
getZPID: get property ID (scrape data from searching result of zillow)
getPropertyData: get number of square foot information for particular property
'''

def getZPID(property): # Get Property ID given an html and store in a list
    """Return the Zillow property id stored on the first <a> inside `property`.

    property -- element exposing find(name); the first 'a' child is inspected

    Returns the value of the anchor's 'data-fm-zpid' attribute. Returns 0 when
    there is no anchor, or when the anchor has no (or an empty) 'data-fm-zpid'
    attribute, so callers can treat 0 as "no id" in every case.
    """
    anchor = property.find('a')  # looked up once instead of twice
    if anchor is None:
        return 0
    # .get avoids the KeyError that indexing raised for anchors without the attribute
    return anchor.get('data-fm-zpid', 0) or 0

def getPropertyData(property): # Given the property, get the bath, bed and sqft info
    """Return the square-footage digits from a listing's info text, or None.

    property -- element whose `.text` holds fields separated by a middle dot
                (U+00B7); may be None

    Every field is reduced to its digits. Among the first three fields (fewer
    if the text has fewer) the longest digit string is chosen, the earliest one
    on ties. It is returned only when it has more than two digits; otherwise
    None is returned. None is also returned when `property` is None.
    """
    if property is None:
        return None

    # Keep digits only: a non-numeric field such as "studio" becomes ''.
    digitFields = [re.sub(r"\D", "", field) for field in property.text.split(u'\u00b7')]

    # split() always yields at least one field, so max() never sees an empty list.
    # Slicing instead of indexing [0], [1], [2] tolerates text with fewer fields.
    longest = max(digitFields[:3], key=len)
    if len(longest) > 2:
        return longest
    return None

In [202]:
# Use Zillow API to get property estimated price and rent
'''
getRentPrice: get zillow estimated rent and price from API
getListPrice: get zillow listed price from API
'''

def getRentPrice(zpid, KEY = sys.argv[1]): # Giving a property ID, return the price and rent
    """Fetch Zillow's estimated price and rent for one property.

    zpid -- Zillow property id, inserted into the GetZestimate request URL
    KEY  -- Zillow API key (zws-id); defaults to sys.argv[1] as it was when
            this function was defined

    Returns (zestimatePrice, zestimateRent) as the text found in the response.
    If either amount cannot be read from the response, BOTH are returned as "".
    Errors from urllib2.urlopen are not caught here.
    """
    QuoteURL = "http://www.zillow.com/webservice/GetZestimate.htm?zws-id=%s&zpid=%s&rentzestimate=true"
    URL = QuoteURL % (KEY, zpid)

    # Close the connection as soon as the body is read, even if read() fails.
    response = urllib2.urlopen(URL)
    try:
        xmldata = response.read()
    finally:
        response.close()

    # Untangle and get the rent/price for each property
    data = untangle.parse(xmldata)
    try:
        zestimateRent = data.Zestimate_zestimate.response.rentzestimate.amount.cdata
        zestimatePrice = data.Zestimate_zestimate.response.zestimate.amount.cdata
    except Exception:
        # Best effort: a response without these elements yields empty values.
        # `Exception` rather than a bare except, so Ctrl-C and SystemExit still propagate.
        zestimatePrice = ""
        zestimateRent = ""
    return zestimatePrice, zestimateRent

def getListPrice(property): # get property price
    """Return the digits of the listed price as a string.

    property -- element whose `.text` holds the price; any falsy value
                (for example None) yields ''.
    """
    if not property:
        return ''
    # Drop every non-digit character (currency sign, separators, words).
    return re.sub("\D", "", property.text)

In [203]:
# From http://www.city-data.com/, get each zip code information
'''
input: zip code number
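output: tuple of strings returned in the order (income, prate, edd, col, age), which differs from the numbering below; a value is '' when it is not found on the page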
1. income: Estimated median household income in 2015
2. prate: Residents with income below the poverty level in 2015
3. age: Median resident age:
4. edd: Bachelor's degree or higher
5. col: Mar. 2016 cost of living index in this zip code
'''

def cdScrape(zipCode):
    """Scrape five statistics for one zip code from city-data.com.

    zipCode -- zip code inserted into http://www.city-data.com/zips/<zipCode>.html

    Returns the tuple (income, prate, edd, col, age) of strings, each taken from
    the page text by a regular expression:
      income -- digits after "income in 2015:  This zip code: $", commas removed
      prate  -- number after "poverty level in 2015: ... This zip code: "
      edd    -- number after "Bachelor's degree or higher:  "
      col    -- number after "living index in zip code NNNNN:  "
      age    -- number after "resident age: This zip code: "
    A value is '' when its pattern does not match. Errors from urllib2.urlopen
    are not caught here.
    """
    myurl = urllib2.urlopen("http://www.city-data.com/zips/%s.html" % (zipCode))
    try:
        whole = BeautifulSoup(myurl, "html.parser")
    finally:
        # The page is fully parsed (or parsing failed); release the connection either way.
        myurl.close()
    html_text = whole.get_text(' ', strip=False)

    def firstMatch(pattern):
        # Text matched by `pattern` in the page, or '' when there is no match.
        found = re.search(pattern, html_text)
        return found.group(0) if found else ''

    # The patterns rely on the exact spacing get_text(' ') produces, so they are kept verbatim.
    income = firstMatch('(?<=income in 2015:  This zip code: \$)[\d,]{3,}').replace(',', '')
    prate = firstMatch('(?<=poverty level in 2015:  \n This zip code: )[\d\.]+')
    age = firstMatch('(?<=resident age: This zip code: )[\d\.]+')
    edd = firstMatch("(?<=Bachelor's degree or higher:  )[\d\.]+")
    col = firstMatch("(?<=living index in zip code [\d]{5}:  )[\d\.]+")
    return income, prate, edd, col, age

In [204]:
# Combine functions above, scrape data from Zillow and city data
'''
input: zip code number, city and state
output: list of tuples (city, state, zip, 'zillow ID', 'lat', 'lon', 'sqft', 'price', 'zillow estimated price', 'zillow estimated rent', 'income', 'poverty', 'degree', 'cost of living', 'age')
Flip page according to the number of result, and get all property data from Zillow
Get city data from citydata.com
'''

def _fetchZillowSoup(url):
    """Download `url` with the scraper's User-Agent header and return it parsed by BeautifulSoup."""
    req = urllib2.Request(url, headers={'User-Agent': 'Resistance Ha'})
    response = urllib2.urlopen(req)
    try:
        return BeautifulSoup(response, "html.parser")
    finally:
        response.close()


def HouseGrabber(zip, cityyy, stateee):  #given a zip, returns house stuff
    """Scrape the for-sale listings of one zip code from Zillow's search pages.

    zip     -- zip code to search (the name shadows the builtin, but is kept so
               existing callers keep working)
    cityyy  -- city label copied into every returned row
    stateee -- state label copied into every returned row

    Returns a list of tuples
        (cityyy, stateee, zip, zpid, lat, lon, sqft, price, zprice, zrent,
         income, prate, edd, col, age)
    with one tuple per listing card that has an actions block and a property id.
    Returns [] when the search page shows the 'zsg-content_collapsed' banner
    or reports zero results.

    Changes from the earlier version:
      * the result count is read from this zip's page, not a fixed 94105 URL;
      * a page with the banner returns [] instead of raising NameError;
      * the count is compared as an integer (the string comparison was always
        true on Python 2, so 20 pages were always requested);
      * the page count is rounded up, so a last partial page is not dropped;
      * city-data.com is queried once per zip code rather than once per page.
    """
    resultsPerPage = 25   # listings per search page, as assumed by the original paging maths
    maxPages = 20         # hard cap on pages requested (the original cut-off: > 500 results)

    baseURL = "http://www.zillow.com/homes/for_sale/%s/house,condo,apartment_duplex,mobile,townhouse_type/" % zip

    html = _fetchZillowSoup(baseURL + "1_rs/1_fr/")
    # Some ZIPs don't exist; Zillow then shows "nearby" homes under this banner.
    # We DON'T want those, so such a zip yields no rows.
    if html.find_all('h3', {'class': 'zsg-content_collapsed'}):
        return []

    # The third <meta> tag's content carries the result count as its third word.
    rawCount = html.findAll('meta')[2].attrs[u'content'].split(" ")[2]
    countDigits = re.sub(r"\D", "", rawCount)  # tolerate separators such as "1,234"
    if countDigits:
        numResult = int(countDigits)
        numPages = min(maxPages, (numResult + resultsPerPage - 1) // resultsPerPage)
    else:
        # Count not parseable: scan up to the cap, which is what the old code
        # effectively always did.
        numPages = maxPages
    if numPages == 0:
        return []

    # Zip-level statistics are identical for every listing, so fetch them once.
    income, prate, edd, col, age = cdScrape(zip)

    zipdetail = []
    # NOTE(review): pages are requested as 1_p .. N_p on the assumption that Zillow
    # numbers result pages from 1 — confirm; the old loop started at 0_p.
    for pageNumber in range(1, numPages + 1):
        html = _fetchZillowSoup(baseURL + "%s_p/1_rs/1_fr/" % pageNumber)

        if html.find_all('h3', {'class': 'zsg-content_collapsed'}):
            continue  # "nearby homes" fallback page: skip it

        for item in html.find_all(class_="zsg-photo-card-content zsg-aspect-ratio-content"):
            actions = item.find(class_="zsg-photo-card-actions")
            if actions is None:
                continue
            zpid = getZPID(actions)
            if zpid == 0:
                continue  # card without a property id
            sqft = getPropertyData(item.find(class_="zsg-photo-card-info"))
            zprice, zrent = getRentPrice(zpid)
            price = getListPrice(item.find(class_="zsg-photo-card-price"))
            # The first two <meta> tags of a card carry latitude and longitude.
            metas = item.findAll('meta')
            lat = metas[0].attrs[u'content']
            lon = metas[1].attrs[u'content']

            zipdetail.append((cityyy, stateee, zip, zpid, lat, lon, sqft, price, zprice, zrent, income, prate, edd, col, age))

    return zipdetail

In [205]:
# Get all properties data 
'''
Get Zip code list and return fulllist of all properties
Save the result(list of list) to csv file
'''

# Header row first; the rows HouseGrabber returns use the same column order.
fulllist = [['City', 'State', 'ZIP', 'ZillowID','latitude', 'lontitude', 'sqft','Price', 'ZPrice', 'ZRent', 'Income', 'Poverty','Degree', 'CostofLiving', 'Age']]
# The zip codes can also be fetched from the API instead of being listed by hand:
#ziplist = GetZip('San Francisco', 'CA')
ziplist = ['94130', '94133', '94129', '94123', '94111', '94108', '94105', '94104', '94103', '94114', '94107', '94131', '94116', '94124', '94127', '94132', '94134', '94128']
# Only ziplist[1:3] (two zip codes) is scraped.
# NOTE(review): this looks like a leftover testing limit — confirm before a full run.
# The loop variable is not named `zip`, so the builtin zip() stays usable in later cells.
for zipCode in ziplist[1:3]:
    fulllist.extend(HouseGrabber(zipCode, 'San Francisco', 'CA'))

In [206]:
# Save the header row plus every scraped row; listToCsv writes them to "zillow_citydata-1.csv".
listToCsv(fulllist)

In [207]:
# Get average rent, price per zipcode
'''
Transfer list of list to panda dataframe and aggregate by zip code to get average estimated price/rent
Average data of following list:
1. 'Price': listed property price
2. 'ZPrice': estimated zillow price
3. 'ZRent': zillow estimated rent
4. 'RentSqft': rent per square footage
5. 'Income': average Estimated median household income in 2015 per zip code
6. 'Poverty': average percentage of residents with income below the poverty level in 2015
7. 'Degree': average percentage of Bachelor's degree or higher
8. 'CostofLiving': average of Mar. 2016 cost of living index per zip code
'''

# Split the header from the data rows WITHOUT mutating fulllist: pop(0) removed a
# row on every execution, so re-running this cell silently dropped a listing.
headers, rows = fulllist[0], fulllist[1:]
df = pd.DataFrame(rows, columns=headers)

# The scraped values are strings (or None); convert the numeric columns before doing maths.
numericColumns = ['sqft','Price', 'ZPrice', 'ZRent', 'Income', 'Poverty','Degree', 'CostofLiving', 'Age']
for c in numericColumns:
    df[c] = pd.to_numeric(df[c])

# Estimated rent per square foot for every listing.
df['RentSqft'] = df['ZRent']/df['sqft']

# Mean of each metric per (City, State, ZIP). The columns are selected with a list:
# tuple-style selection on a GroupBy is deprecated and rejected by newer pandas.
averagedColumns = ['Price', 'ZPrice', 'ZRent', 'RentSqft','Income', 'Poverty','Degree', 'CostofLiving', 'Age']
dfgroup = df.groupby(['City', 'State', 'ZIP'])[averagedColumns].mean()

dfgroup.to_csv('zillow_citydata_condensed-1.csv')