In [24]:
from __future__ import print_function, division

In [25]:
import random
import uuid
import os
import sys

### Helper Functions

Import functions for creating nicely formatted output.

In [3]:
# %load 'utils.py'
from IPython.display import display, HTML

def pp_bold(str):
    """Render the given value as bold HTML in the notebook output."""
    markup = '<b>{}</b>'.format(str)
    display(HTML(markup))


def pp_listOflist(l):
    """Display a list of lists as an HTML table, one inner list per table row."""
    table_rows = []
    for sublist in l:
        cells = [u'<td>{}</td>'.format(v) for v in sublist]
        table_rows.append(u'<tr>{}</tr>'.format(u''.join(cells)))
    display(HTML(u'<table>{}</table>'.format(u''.join(table_rows))))
    

def pp_dict(d, rows=None):
    """Display a dictionary as an HTML table.

    Args:
        d: mapping to display; iterated in its own key order.
        rows: if falsy, non-positive, or at least ``len(d)``, each item gets
            its own table row (bold key, then value). Otherwise the items are
            packed into ``rows`` table rows, filling column by column.
    """
    if not rows or rows < 1 or rows >= len(d):
        display(HTML(
            u'<table>{}</table>'.format(
                u''.join(u'<tr><td><b>{}</b></td><td>{}</td></tr>'.format(k, d[k]) for k in d))))
    else:
        # Materialize the items once: indexing d.items() inside the loop
        # rebuilt the list on every access in Python 2 and is not supported
        # at all by Python 3 dict views.
        items = list(d.items())
        list_ = [[] for _ in range(rows)]
        for i, (k, v) in enumerate(items):
            # Column-major fill: item i lands in row i % rows.
            list_[i % rows].extend(['<b>{}</b>'.format(k), v])
        pp_listOflist(list_)


def pp_dictOflist(d):
    """Display a dict of lists as an HTML table: bold key, then one cell per list element."""
    table_rows = []
    for k in d.keys():
        value_cells = u''.join(u'<td>{}</td>'.format(v) for v in d[k])
        table_rows.append(u'<tr><td><b>{}</b></td>{}</tr>'.format(k, value_cells))
    display(HTML(u'<table>{}</table>'.format(u''.join(table_rows))))
    

def pp_dfinfo(df, width=4):
    """Print a compact summary of a DataFrame's columns.

    Prints ``'<rows> entries, <cols> columns'`` and then displays a table in
    which every column contributes three cells: bold name, non-null count and
    dtype. Columns are laid out column-major across at most ``width`` groups.

    Args:
        df: the DataFrame to summarize.
        width: maximum number of column groups per table row.
    """
    ncols = len(df.columns)
    print('{} entries, {} columns'.format(len(df), ncols))

    width = min(width, ncols)
    if width < 1:
        # No columns (or a non-positive width): there is nothing to tabulate,
        # and the depth computation below would divide by zero.
        return

    depth = -(-ncols // width)  # ceiling division
    counts = df.count()         # non-null count per column, in column order
    dtypes = df.dtypes
    list_ = [[] for _ in range(depth)]
    for i, col in enumerate(df.columns):
        # Positional (.iloc) access keeps this correct regardless of the
        # column labels' type; item i lands in row i % depth.
        list_[i % depth].extend(
            ['<b>{}</b>'.format(col), counts.iloc[i], dtypes.iloc[i]])

    pp_listOflist(list_)


def pp_counts(series, rows=1, caption=None):
    """Display a series' values, formatted to four decimals, sorted by key.

    Args:
        series: object exposing ``to_dict()`` whose values accept the
            ``'{:.4f}'`` format.
        rows: forwarded to ``pp_dict`` to control the table layout.
        caption: optional heading shown in bold above the table.
    """
    # Imported locally: nothing in the visible notebook imports OrderedDict,
    # so the original call raised NameError.
    from collections import OrderedDict

    if caption:
        pp_bold(caption)
    list_ = [(k, '{:.4f}'.format(v)) for k, v in series.to_dict().items()]
    dict_ = OrderedDict(sorted(list_, key=lambda x: x[0]))
    pp_dict(dict_, rows)


def pp_progress(s):
    """Overwrite the current output line with ``s`` (carriage return, no newline)."""
    out = sys.stdout
    out.write('\r{}'.format(s))
    out.flush()


Implement a simple web page cache.

In [4]:
import os
import pickle
import requests

_cache = None                 # in-memory index, loaded lazily by cache_init()
_cache_dir = "./cache"        # directory holding the index file
_cache_index = os.path.join(_cache_dir, 'index.pkl')
_cache_counter = 0            # number of cache_add() calls so far

def cache_init():
    """Return the in-memory cache index, loading it on first use.

    On the first call the index is unpickled from ``_cache_index`` if that
    file exists; otherwise an empty dict is created. Later calls return the
    same dict object.
    """
    global _cache
    if _cache is None:
        if os.path.exists(_cache_index):
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # load an index file that this notebook wrote itself.
            with open(_cache_index, 'rb') as fd:
                _cache = pickle.load(fd)
        else:
            _cache = {}
    return _cache

def cache_get(key):
    """Look up ``key`` in the cache index; return '' when it is absent."""
    index = cache_init()
    return index.get(key, '')

def cache_add(key, value):
    """Store ``value`` under ``key`` and persist the index on every 100th call."""
    global _cache_counter
    cache_init()[key] = value
    _cache_counter += 1
    # Flush to disk periodically rather than on each insertion.
    if not _cache_counter % 100:
        cache_commit()

def cache_commit():
    """Write the in-memory cache index to ``_cache_index``.

    Does nothing if the index has never been initialized. The directory that
    holds the index is created when missing, so a commit issued before any
    page has been saved no longer fails on a non-existent directory.
    """
    if _cache is None:
        return
    index_dir = os.path.dirname(_cache_index)
    if index_dir and not os.path.isdir(index_dir):
        os.makedirs(index_dir)
    with open(_cache_index, 'wb') as fd:
        pickle.dump(_cache, fd)

StreetEasy aggressively blocks robots, so we use Selenium with the Chrome driver (https://sites.google.com/a/chromium.org/chromedriver) to scrape listings. These functions download a page.

In [11]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
driver = None  # shared Selenium driver, created lazily by get_driver()

def get_driver(chromedriver=None):
    """Return the shared Chrome WebDriver, starting it on first use.

    Args:
        chromedriver: path to the chromedriver executable. When omitted, the
            ``CHROMEDRIVER`` environment variable is used if set, otherwise
            the notebook's original default path. Only consulted when the
            driver has not been created yet.
    """
    global driver
    if driver is None:
        if chromedriver is None:
            chromedriver = os.environ.get(
                "CHROMEDRIVER", "/home/dexter/bin/chromedriver")
        os.environ["webdriver.chrome.driver"] = chromedriver
        driver = webdriver.Chrome(chromedriver)
    return driver

def get_page(url, wait_for_element='pagination', timeout=20):
    """Use Selenium driver to download a web page, caching it on disk.

    Args:
        url: address of the page to fetch.
        wait_for_element: class name of an element that must be present
            before the page source is read.
        timeout: value passed to ``WebDriverWait`` as the wait timeout.

    Returns:
        The page's HTML. It is read from the cached file when the cache index
        has an entry for ``url`` and that file exists; otherwise it is
        downloaded, saved under ``_cache_dir`` and registered in the index.

    Callers in this notebook catch ``TimeoutException`` around this call, for
    the case where the awaited element never appears.
    """

    # Check if we have this page

    filename = cache_get(url)
    if filename and os.path.exists(filename):
        with open(filename, 'rb') as fd:
            return fd.read().decode('utf-8')

    # Otherwise, download the page ...

    # Use the driver returned by get_driver() rather than relying on the
    # module-level global having been set as a side effect.
    browser = get_driver()
    browser.get(url)
    wait = WebDriverWait(browser, timeout)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, wait_for_element)))
    html_doc = browser.page_source

    # ... and cache it

    if not os.path.isdir(_cache_dir):
        os.makedirs(_cache_dir)

    # Reuse the indexed filename if the entry exists but its file is gone.
    if not filename:
        filename = os.path.join(_cache_dir, uuid.uuid4().hex)

    with open(filename, 'wb') as fd:
        fd.write(html_doc.encode('utf-8'))

    cache_add(url, filename)

    return html_doc

The StreetEasy 'for rent' section has multiple listings per page. This function parses one such listing.

In [6]:
from bs4 import BeautifulSoup
import re
import datetime
from geojson import Point

def _node_text(node):
    """Return a node's text: its ``.string`` when set, else ``get_text()``."""
    text = node.string
    if text is None:
        # .string is None when the node does not wrap exactly one string.
        text = node.get_text()
    return text


def get_listing(element):
    """
    Parse a listing from StreetEasy's 'for rent' page.

    Args:
        element: the parsed element for one listing.

    Returns:
        A ``(listing, unknown)`` tuple. ``listing`` is a dict that always has
        'created', 'bedrooms', 'furnished', 'listing_id', 'url' and
        'street_address'; 'latitude', 'longitude', 'loc', 'price',
        'bathrooms' and 'size' are added only when present on the page.
        ``unknown`` is a set of detail strings that were not recognized.
    """

    # Text type on both Python 2 (unicode) and Python 3 (str).
    text_type = type(u'')

    unknown = set()
    listing = {
        'created' : text_type(datetime.datetime.utcnow()).split('.')[0],
        'bedrooms' : 0.0,
        'furnished' : 0
    }

    # Get id, url and address

    details = element.find('div', class_='details row')
    href = details.find('a', {'data-gtm-listing-id' : True})
    listing['listing_id'] = int(href['data-gtm-listing-id'])
    listing['url'] = text_type(href['href'].split('?')[0])
    listing['street_address'] = text_type(_node_text(href))

    # Get GPS coordinates (skipped when the attribute is missing or incomplete)

    loc = (element.get('se:map:point') or '').split(',')
    if len(loc) >= 2 and loc[0] and loc[1]:
        listing['latitude'] = float(loc[0])
        listing['longitude'] = float(loc[1])
        listing['loc'] = Point((listing['longitude'], listing['latitude']))

    # Get price; a price element without digits leaves the listing unpriced

    price = element.find('span', class_='price')
    if price is not None:
        digits = re.sub('[^0-9]', '', _node_text(price))
        if digits:
            listing['price'] = int(digits)

    # Get size and whether furnished

    for detail in details.find_all('span', {'class' : re.compile('.*?_detail_cell')}):
        text = _node_text(detail)
        # The number must be directly followed by a space, so strings such as
        # '1+ bath' or 'studio' do not match and are reported via `unknown`.
        m = re.search('([0-9.,]+) (bed|bath|ft)', text)
        if not m:
            if 'Furnished' == text:
                listing['furnished'] = 1
            else:
                unknown.add(text)
        elif 'bed' == m.group(2):
            listing['bedrooms'] = float(m.group(1))
        elif 'bath' == m.group(2):
            listing['bathrooms'] = float(m.group(1))
        elif 'ft' == m.group(2):
            listing['size'] = int(m.group(1).replace(',', ''))

    return listing, unknown

### Web Scraping

Connect to the MongoDB database.

In [7]:
import pymongo
# Open the connection and pick the database and collection that hold the listings.
client = pymongo.MongoClient(
    host='ec2-34-198-246-43.compute-1.amazonaws.com',
    port=27017,
)
db = client['streeteasy']
collection = db['listings']

# Show how many listings are stored so far.
pp_bold('{} listings'.format(collection.count()))

In [8]:
# URL template for the 'for rent' result pages; '{}' is filled in with the
# page number via str.format().
URL_STREETEASY = 'http://streeteasy.com/for-rent/nyc?page={}'

Scrape the pages and add new listings.

In [239]:
# Build set of existing listings

seen = set(r['listing_id'] for r in collection.find({}, {'_id':0, 'listing_id':1}))

# Bulk add new listings.

all_unrecog = set() # track unrecognized listing details
bulk = collection.initialize_ordered_bulk_op()
try:
    for pageno in range(1, 2170):
        pp_progress('Scraping page {}'.format(pageno))

        # Download the page

        url = URL_STREETEASY.format(pageno)
        html_doc = get_page(url)
        soup = BeautifulSoup(html_doc, 'lxml')

        # Extract the listings

        listing_elements = soup.find_all('div', class_='item')
        for element in listing_elements:

            # Get next listing. If it has no price, punt.

            listing, unrecog = get_listing(element)
            all_unrecog |= unrecog
            if 'price' not in listing:
                print('skipping unpriced listing on page {}'.format(pageno), file=sys.stderr)
                continue

            # Add new listings

            listing_id = listing['listing_id']
            if listing_id not in seen:
                seen.add(listing_id)
                bulk.insert(listing)

        # Clean up
        del soup, html_doc, listing_elements
finally:
    # cache_add() only persists the index on every 100th call, so flush the
    # remaining entries here, even if the loop was interrupted.
    cache_commit()

# Commit new listings

try:
    print('')
    print(bulk.execute())
    collection.create_index([("loc", pymongo.GEOSPHERE)])
except pymongo.errors.InvalidOperation as e:
    # An empty batch is expected when nothing new was found; any other
    # InvalidOperation is a real failure and must not be swallowed.
    if str(e) != 'No operations to execute':
        raise

# Report unrecognized listing details

print('unrecognized details: {}'.format(', '.join(all_unrecog)), file=sys.stderr)

Scraping page 667

skipping unpriced listing on page 665


Scraping page 2169


unrecognized details: 7+ baths, 1+ bath, 4+ baths, 5+ baths, studio, 2+ baths, 3+ baths, 6+ baths


In [35]:
# Listing ids that are skipped when downloading the detail pages.
removed_listings = [
    2030929, 2029118, 2029115, 2028448, 2028443, 2028441, 2028423,
    2028418, 2028416, 2027868, 2027264, 1978969, 1978963, 1978923,
    1978905, 1978896, 1978890, 1978882, 1978883, 1978877, 1978878,
    1978876, 1978873, 1978875, 1978856, 1978880, 1978816, 1978817,
    1978796, 1978772, 1978738, 1978727, 1978724, 1978708, 1978681,
    1978660, 1978611, 1978599, 1978598, 1978595, 1978566, 1978779,
    1978705, 2024881, 2020587, 1978480, 1978417, 1978415, 1978411,
    1978409, 1978410, 1978407, 1978395, 1978248, 1976817, 2026352,
    1976530, 2016048,
    ]
# Set copy for constant-time membership tests inside the loop; the list itself
# is kept unchanged for any other cell that uses it.
removed_ids = set(removed_listings)

listings = list(collection.find({}, {'_id':0, 'listing_id':1, 'url':1}))
timed_out = []  # listing ids whose page did not load in time
try:
    for i, listing in enumerate(listings):
        url = 'http://www.streeteasy.com' + listing['url']
        listing_id = listing['listing_id']
        # Only fetch pages that are neither skipped nor already in the cache index.
        if listing_id not in removed_ids and not cache_get(url):
            try:
                pp_progress('Getting page {} of {}, {}, {}'.format(i + 1, len(listings), listing_id, url))
                # Called only for its side effect of caching the page.
                get_page(url, wait_for_element='details_info')
            except TimeoutException:
                # Best effort: remember the failure and move on.
                timed_out.append(listing_id)
finally:
    # Persist the cache index even if the loop is interrupted.
    cache_commit()
    if timed_out:
        print('\ntimed out on {} listing page(s): {}'.format(
            len(timed_out), ', '.join(str(x) for x in timed_out)), file=sys.stderr)

Getting page 10770 of 25709, 2015923, http://www.streeteasy.com/building/7221-17-avenue-brooklyn/rental/2015923

KeyboardInterrupt: 

In [34]:
# Shut the browser down explicitly: dropping the reference does not call
# quit(), so the browser session would otherwise be left running.
if driver is not None:
    driver.quit()
driver = None