In [1]:
from __future__ import print_function, division

In [2]:
import random
import uuid
import os
import sys

### Utilities

#### Pretty printers

In [3]:
# %load 'utils.py'
from IPython.display import display, HTML

def pp_bold(str):
    """Display the given value in bold as rich HTML output.

    The value is interpolated with ``str.format`` and is not HTML-escaped.
    """
    markup = '<b>{}</b>'.format(str)
    display(HTML(markup))


def pp_listOflist(l):
    """Display a list of lists as an HTML table, one inner list per row.

    Each element of an inner list becomes one ``<td>`` cell; values are
    interpolated as-is (no HTML escaping).
    """
    table_rows = []
    for sublist in l:
        cells = u''.join(u'<td>{}</td>'.format(v) for v in sublist)
        table_rows.append(u'<tr>{}</tr>'.format(cells))
    table = u'<table>{}</table>'.format(u''.join(table_rows))
    display(HTML(table))
    

def pp_dict(d, rows=None):
    """Display a mapping as an HTML table.

    Parameters
    ----------
    d : mapping
        Keys are rendered in bold, values as plain cells.
    rows : int, optional
        Maximum number of table rows. When falsy, negative, or not smaller
        than ``len(d)``, every item gets its own row. Otherwise the items are
        distributed over ``rows`` rows (item ``i`` lands in row ``i % rows``),
        so each row holds several key/value pairs side by side.
    """
    if not rows or rows < 0 or rows >= len(d):
        display(HTML(
            u'<table>{}</table>'.format(
                u''.join(u'<tr><td><b>{}</b></td><td>{}</td></tr>'.format(k, d[k]) for k in d))))
        return

    # Materialise the items once: dict views are not subscriptable on
    # Python 3, and re-building the list per item was quadratic on Python 2.
    items = list(d.items())
    list_ = [[] for _ in range(rows)]
    for i, (k, v) in enumerate(items):
        list_[i % rows].extend(['<b>{}</b>'.format(k), v])
    pp_listOflist(list_)


def pp_dictOflist(d):
    """Display a mapping of lists as an HTML table.

    Each key produces one row: the key in bold in the first cell, followed by
    one cell per element of the associated list.
    """
    table_rows = []
    for k in d.keys():
        value_cells = u''.join(u'<td>{}</td>'.format(v) for v in d[k])
        table_rows.append(u'<tr><td><b>{}</b></td>{}</tr>'.format(k, value_cells))
    display(HTML(u'<table>{}</table>'.format(u''.join(table_rows))))
    

def pp_dfinfo(df, width=4):
    """Print a compact summary of a DataFrame's columns.

    Prints ``'<rows> entries, <cols> columns'`` and then displays a table in
    which every column contributes three cells: its name (bold), its
    ``count()`` and its dtype.

    Parameters
    ----------
    df : DataFrame-like
        Must provide ``columns``, ``dtypes``, ``__len__`` and ``df[col].count()``.
    width : int, default 4
        Maximum number of columns described per table row. Values below 1 are
        treated as 1.
    """
    ncols = len(df.columns)
    if ncols == 0:
        # Nothing to tabulate; also avoids dividing by a zero width below.
        print('{} entries, {} columns'.format(len(df), ncols))
        return

    width = max(1, min(width, ncols))
    depth = -(-ncols // width)  # ceiling division: number of table rows
    list_ = [[] for _ in range(depth)]

    # Pair names and dtypes positionally. Indexing ``df.dtypes[i]`` is a
    # label lookup, which picks the wrong entry for integer column labels.
    for i, (col, dtype) in enumerate(zip(df.columns, df.dtypes)):
        list_[i % depth].extend(['<b>{}</b>'.format(col), df[col].count(), dtype])

    print('{} entries, {} columns'.format(len(df), ncols))
    pp_listOflist(list_)


def pp_counts(series, rows=1, caption=None):
    """Display the values of a series as a key-sorted table.

    Parameters
    ----------
    series : object with ``to_dict()``
        Its items are shown with values formatted to four decimal places.
    rows : int, default 1
        Passed through to ``pp_dict`` as the row limit.
    caption : optional
        When truthy, shown in bold above the table.
    """
    # Imported locally: the notebook never imports OrderedDict at top level.
    from collections import OrderedDict

    if caption:
        pp_bold(caption)
    formatted = [(k, '{:.4f}'.format(v)) for k, v in series.to_dict().items()]
    ordered = OrderedDict(sorted(formatted, key=lambda kv: kv[0]))
    pp_dict(ordered, rows)


def pp_progress(s):
    """Write ``s`` to stdout preceded by a carriage return, then flush.

    No newline is emitted, so successive calls write over the same line on
    terminals that honour the carriage return.
    """
    stream = sys.stdout
    line = '\r{}'.format(s)
    stream.write(line)
    stream.flush()


#### A simple URL cache

In [4]:
import os
import pickle
import requests

# In-memory index mapping a key (get_page uses the URL) to a value (the cached
# file's path); stays None until cache_init() loads or creates it.
_cache = None
# Directory that holds the index file; get_page also stores page files here.
_cache_dir = "./cache"
# Pickled copy of the index: read by cache_init(), rewritten by cache_add().
_cache_index = os.path.join(_cache_dir, 'index.pkl')

def cache_init():
    """Return the in-memory cache index, loading it on first use.

    On the first call the index is unpickled from ``_cache_index`` if that
    file exists; otherwise an empty dict is created. Later calls return the
    same object without touching the disk.

    Returns
    -------
    dict
        The module-level cache index.
    """
    global _cache
    if _cache is None:
        if os.path.exists(_cache_index):
            # NOTE(security): pickle.load executes arbitrary code from the
            # file; only safe because the index is written by this notebook.
            with open(_cache_index, 'rb') as fd:
                _cache = pickle.load(fd)
        else:
            _cache = {}
    return _cache

def cache_get(key):
    """Look up ``key`` in the cache index; return ``''`` when it is absent."""
    index = cache_init()
    return index.get(key, '')

def cache_add(key, value):
    """Store ``key -> value`` in the cache index and persist the index.

    The whole index is re-pickled to ``_cache_index`` on every call. The
    cache directory is created if it does not exist yet, so this function
    no longer depends on a caller having created it beforehand.
    """
    cache = cache_init()
    cache[key] = value
    if not os.path.isdir(_cache_dir):
        os.makedirs(_cache_dir)
    with open(_cache_index, 'wb') as fd:
        pickle.dump(cache, fd)

def cache_term():
    """Flush the in-memory cache index to ``_cache_index``.

    Does nothing when the index was never initialised, since there is
    nothing to write. (The previous body referenced an undefined name
    ``cache`` and always raised NameError.)
    """
    if _cache is None:
        return
    if not os.path.isdir(_cache_dir):
        os.makedirs(_cache_dir)
    with open(_cache_index, 'wb') as fd:
        pickle.dump(_cache, fd)

### Web Scraping

In [35]:
def get_page(url, driver, timeout=5):
    """Return the HTML of ``url`` as text, using the on-disk cache if possible.

    Parameters
    ----------
    url : str
        Page to fetch; also used as the cache key.
    driver : selenium WebDriver
        Used to download the page when it is not cached.
    timeout : int or float, default 5
        Seconds to wait for an element with class ``pagination`` to appear
        before giving up (``WebDriverWait.until`` raises on timeout).

    Returns
    -------
    str
        The page source. Cached pages are decoded from UTF-8, so cached and
        freshly downloaded results have the same type.
    """

    # Check if we have this page

    filename = cache_get(url)
    if filename and os.path.exists(filename):
        with open(filename, 'rb') as fd:
            # Pages are stored UTF-8 encoded (see the write below); decode so
            # the cached path returns text like the download path does.
            return fd.read().decode('utf-8')

    # Otherwise, download the page ...

    driver.get(url)
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'pagination')))
    html_doc = driver.page_source

    # ... and cache it

    if not os.path.isdir(_cache_dir):
        os.mkdir(_cache_dir)

    # Reuse the indexed filename when the entry exists but its file is gone.
    if not filename:
        filename = os.path.join(_cache_dir, uuid.uuid4().hex)

    with open(filename, 'wb') as fd:
        fd.write(html_doc.encode('utf-8'))

    cache_add(url, filename)

    return html_doc

In [30]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

# Path to the chromedriver binary. Set the CHROMEDRIVER environment variable
# to override it; the default is the original author's local install.
chromedriver = os.environ.get("CHROMEDRIVER", "/home/dexter/bin/chromedriver")
# Exported under this name as well — presumably for tooling that looks the
# driver path up in the environment; TODO confirm it is still needed.
os.environ["webdriver.chrome.driver"] = chromedriver
# Launches a Chrome browser session controlled through chromedriver.
driver = webdriver.Chrome(chromedriver)

In [31]:
# Load the first listings page and block (timeout: 5) until an element with
# class 'pagination' is present; only then report that the page is ready.
driver.get('http://streeteasy.com/for-rent/nyc')
wait = WebDriverWait(driver, 5)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'pagination')))
print("Page is ready!")

Page is ready!


In [32]:
# URL template for the paginated listings; `{}` is filled in with the page
# number via URL_STREETEASY.format(pageno).
URL_STREETEASY = 'http://streeteasy.com/for-rent/nyc?page={}'

In [36]:
# Warm the cache: walk pages 1..2169 and download only those not yet on disk.
for pageno in range(1, 2170):
    pp_progress('Getting page {}'.format(pageno))
    url = URL_STREETEASY.format(pageno)
    filename = cache_get(url)
    if filename and os.path.exists(filename):
        continue  # already cached; skip without reading the file
    html_doc = get_page(url, driver)
    del html_doc  # only the caching side effect is wanted here

Getting page 2169

In [92]:
from bs4 import BeautifulSoup

In [93]:
def scrape_page(url):
    """Parse the page at ``url`` with BeautifulSoup (work in progress).

    The page is obtained through ``get_page`` using the notebook's
    module-level ``driver``. Extraction is not implemented yet, so the
    function currently returns ``None``.
    """
    result = []
    # get_page() requires the WebDriver as its second argument; calling it
    # with the URL alone raised TypeError.
    soup = BeautifulSoup(get_page(url, driver), 'lxml')
    # TODO: extract listing data from `soup` into `result` and return it.
    pass