## Objective

**Predict the price of a New York City apartment.**

This project originally used data from renthop.com, an apartment listing website, provided by a Kaggle competition. This notebook scrapes the listings directly from renthop.com.

In [2]:
from __future__ import print_function, division

In [2]:
import random
import uuid
import os
import sys

### Helper Functions

Import functions for creating nicely formatted output.

In [3]:
# %load 'utils.py'
from IPython.display import display, HTML

def pp_bold(str):
    """Render the given value as bold text in the notebook output area."""
    markup = '<b>{}</b>'.format(str)
    display(HTML(markup))


def pp_listOflist(l):
    """Display a list of lists as an HTML table, one inner list per table row."""
    row_markup = []
    for sublist in l:
        cells = [u'<td>{}</td>'.format(v) for v in sublist]
        row_markup.append(u'<tr>{}</tr>'.format(u''.join(cells)))
    table = u'<table>{}</table>'.format(u''.join(row_markup))
    display(HTML(table))
    

def pp_dict(d, rows=None):
    """Display a dictionary as an HTML table.

    Args:
        d: mapping to display; keys are rendered in bold.
        rows: number of table rows to spread the entries over.  When unset,
            zero, or at least ``len(d)``, every entry gets its own row.
            Otherwise entries are laid out column by column across ``rows``
            rows, each entry occupying a key cell and a value cell.
    """
    if not rows or rows >= len(d):
        display(HTML(
            u'<table>{}</table>'.format(
                u''.join(u'<tr><td><b>{}</b></td><td>{}</td></tr>'.format(k, d[k]) for k in d))))
        return

    # Materialise the items once: dict views cannot be indexed on Python 3,
    # and rebuilding the list for every entry on Python 2 was quadratic.
    items = list(d.items())
    list_ = [[] for _ in range(rows)]
    for i, (k, v) in enumerate(items):
        # Column-major fill: entry i lands in row i % rows.
        # Unicode template so non-ASCII keys do not fail on Python 2.
        list_[i % rows].extend([u'<b>{}</b>'.format(k), v])
    pp_listOflist(list_)


def pp_dictOflist(d):
    """Display a dict of lists as an HTML table.

    Each key becomes one row: the key in bold, followed by one cell per
    element of its list.
    """
    row_markup = []
    for k in d.keys():
        cells = u''.join(u'<td>{}</td>'.format(v) for v in d[k])
        row_markup.append(u'<tr><td><b>{}</b></td>{}</tr>'.format(k, cells))
    table = u'<table>{}</table>'.format(u''.join(row_markup))
    display(HTML(table))
    

def pp_dfinfo(df, width=4):
    """Print a compact, multi-column summary of a DataFrame.

    Prints the entry and column counts, then displays a table in which each
    column of ``df`` contributes three cells: its name in bold, its non-null
    count, and its dtype.

    Args:
        df: the DataFrame to summarise.
        width: maximum number of column summaries placed side by side.
    """
    ncols = len(df.columns)
    # Clamp to at least one so a frame without columns (or width <= 0)
    # cannot cause a division by zero below.
    width = max(1, min(width, ncols))
    depth = -(-ncols // width)  # ceiling division
    list_ = [[] for _ in range(depth)]
    for i, col in enumerate(df.columns):
        # Column-major fill: summary i lands in table row i % depth.
        # .iloc makes the dtype lookup explicitly positional.
        list_[i % depth].extend(
            [u'<b>{}</b>'.format(col), df[col].count(), df.dtypes.iloc[i]])

    print('{} entries, {} columns'.format(len(df), ncols))
    pp_listOflist(list_)


def pp_counts(series, rows=1, caption=None):
    """Display the values of a series as a key-sorted table.

    Args:
        series: object exposing ``to_dict()`` (e.g. a pandas Series) whose
            values can be formatted as floats.
        rows: number of table rows passed on to ``pp_dict``.
        caption: optional heading shown in bold above the table.
    """
    # Imported locally: no module-level import of OrderedDict is visible in
    # this notebook, so the bare name would raise NameError.
    from collections import OrderedDict

    if caption:
        pp_bold(caption)
    # Values are rendered with four decimal places; entries are ordered by key.
    list_ = [(k, '{:.4f}'.format(v)) for k, v in series.to_dict().items()]
    dict_ = OrderedDict(sorted(list_, key=lambda x: x[0]))
    pp_dict(dict_, rows)


def pp_progress(s):
    """Write ``s`` after a carriage return (no newline) and flush stdout.

    The leading carriage return moves the cursor back to the start of the
    line, so successive calls overwrite each other on a terminal.
    """
    out = sys.stdout
    out.write('\r{}'.format(s))
    out.flush()


Implement a simple web page cache.

In [26]:
import os
import pickle
import requests

# In-memory index mapping a URL to the file its page was saved in.
# Stays None until cache_init() runs for the first time.
_cache = None
# Directory that get_page() writes downloaded pages into.
_cache_dir = "./cache"
# Pickle file, inside the cache directory, that persists the index.
_cache_index = os.path.join(_cache_dir, 'index.pkl')
# Number of cache_add() calls so far; every 100th call triggers cache_commit().
_cache_counter = 0

def cache_init():
    """Return the URL -> filename index, loading it on first use.

    The index is read from ``_cache_index`` when that file exists, otherwise
    it starts out empty.  Later calls return the same in-memory dictionary.
    """
    global _cache
    if _cache is not None:
        return _cache

    if not os.path.exists(_cache_index):
        _cache = {}
        return _cache

    # NOTE: unpickling can execute arbitrary code; the index file must only
    # ever be one that this notebook wrote itself.
    with open(_cache_index, 'rb') as index_file:
        _cache = pickle.load(index_file)
    return _cache

def cache_get(key):
    """Look up ``key`` in the cache index; return '' when it is not present."""
    index = cache_init()
    return index.get(key, '')

def cache_add(key, value):
    """Record ``key -> value`` in the index, committing on every 100th call."""
    global _cache_counter
    index = cache_init()
    index[key] = value
    _cache_counter = _cache_counter + 1
    flush_due = (_cache_counter % 100 == 0)
    if flush_due:
        cache_commit()

def cache_commit():
    """Persist the in-memory index to ``_cache_index``.

    Does nothing when the cache has never been initialised.  The directory
    that holds the index file is created if it does not exist yet, so a
    commit no longer fails when it happens before any page was downloaded.
    """
    if _cache is None:
        return

    index_dir = os.path.dirname(_cache_index)
    if index_dir and not os.path.isdir(index_dir):
        os.makedirs(index_dir)

    with open(_cache_index, 'wb') as fd:
        pickle.dump(_cache, fd)

In [437]:
def get_page(url, useCache=True):
    """Get a web page, downloading it only when no cached copy is usable.

    Args:
        url: address of the page to fetch.
        useCache: when false, the page is always downloaded again; a cache
            file already recorded for ``url`` is overwritten.

    Returns:
        The raw page body as bytes.  The same type is returned whether the
        page came from the cache file or from the network (the download path
        used to return decoded text, so results differed between a cold and
        a warm cache).

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Check if we have this page

    filename = cache_get(url)
    if useCache and filename and os.path.exists(filename):
        with open(filename, 'rb') as fd:
            return fd.read()

    # Otherwise, download the page ...

    r = requests.get(url, timeout=10)
    r.raise_for_status()
    body = r.content

    # ... and cache it

    if not os.path.isdir(_cache_dir):
        os.mkdir(_cache_dir)

    # Reuse the file already recorded for this URL, if any.
    if not filename:
        filename = os.path.join(_cache_dir, uuid.uuid4().hex + '.html')

    with open(filename, 'wb') as fd:
        fd.write(body)

    cache_add(url, filename)

    # Exactly the bytes a later cache hit will read back from the file.
    return body

In [422]:
import datetime
import re
import time
from bs4 import BeautifulSoup
from geojson import Point

### Web Scraping

Connect to the MongoDB database.

In [439]:
import pymongo
# Connection settings default to the original server but can be overridden
# through the environment without editing the notebook.
MONGO_HOST = os.environ.get('RENTHOP_MONGO_HOST', 'ec2-34-198-246-43.compute-1.amazonaws.com')
MONGO_PORT = int(os.environ.get('RENTHOP_MONGO_PORT', 27017))

client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
db = client.renthop2
collection = db.listings
# Show how many listings are already stored before scraping more.
pp_bold('{} listings'.format(collection.count()))

Scrape the pages and add new listings.

In [440]:
def _leading_number(text):
    """Return the first whitespace-separated token of ``text`` as a float, or None."""
    tokens = text.strip().split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None


def extract_details(listing):
    """Fetch a listing's detail page and copy selected fields into ``listing``.

    Reads ``listing['url']`` and, for each section found on the page, sets
    ``description``, ``features`` (one feature per line), ``bedrooms`` and
    ``bathrooms`` on the dictionary in place.  Sections that are missing or
    do not have the expected structure are skipped instead of aborting the
    whole scrape.

    Args:
        listing: dictionary with at least a ``url`` key; modified in place.

    Returns:
        None.  A warning is printed to stderr when the details container is
        not present on the page.
    """
    html_doc = get_page(listing['url'])
    soup = BeautifulSoup(html_doc, 'lxml')
    body = soup.find(id='listing-details-body-premap')
    if not body:
        print('warning: body not found: {}'.format(listing['url']), file=sys.stderr)
        return

    for section in body.find_all('div', recursive=False):
        desc = section.find('div', id='description')
        if desc:
            # The description text lives in the element after the heading.
            desc_body = desc.find_next_sibling()
            if desc_body is not None:
                listing['description'] = desc_body.text.strip()
            continue

        features_div = section.find('div', text=re.compile('Features & Amenities'))
        if features_div:
            features = []
            features_body = features_div.find_next_sibling()
            feature_divs = features_body.find_all('div') if features_body is not None else []
            # A distinct loop name: the original reused the outer loop's
            # variable here.
            for feature_div in feature_divs:
                table = feature_div.find('table')
                if table is None or table.tr is None:
                    continue
                cells = table.tr.find_all('td')
                if len(cells) < 2:
                    continue
                # The feature name is read from the second cell of the row.
                feature = cells[1].text.strip()
                if 'Featured' != feature:
                    features.append(feature)
            listing['features'] = '\n'.join(features)
            continue

        for table in section.find_all('table'):
            for tr in table.find_all('tr'):
                for td in tr.find_all('td'):
                    if 'Studio' in td.text:
                        listing['bedrooms'] = 0.0
                    elif 'Bed' in td.text:
                        # Only accept cells that start with a number, so
                        # unrelated text containing "Bed" cannot crash.
                        count = _leading_number(td.text)
                        if count is not None:
                            listing['bedrooms'] = count
                    elif 'Bath' in td.text:
                        count = _leading_number(td.text)
                        if count is not None:
                            listing['bathrooms'] = count


In [443]:
URL_RENTHOP = 'https://www.renthop.com/search/nyc?min_price=0&max_price=50000&sort=hopscore&page={}'

# Range of search-result pages to scrape (the end is exclusive).
FIRST_PAGE = 925
LAST_PAGE = 4000

# Identify old listings

seen = set([r['listing_id'] for r in collection.find({}, {'_id':0, 'listing_id':1})])

# Scrape new listings

for pageno in range(FIRST_PAGE, LAST_PAGE):
    # Download page

    url = URL_RENTHOP.format(pageno)
    html_doc = get_page(url)
    soup = BeautifulSoup(html_doc, 'lxml')
    pp_progress('Scraping page {} '.format(url))

    # Verify page number: stop when the page reports a different number than
    # the one requested, or has no page box at all.

    page_box = soup.find('input', id='page_input_box')
    page_value = page_box.get('value') if page_box is not None else None
    if page_value != str(pageno):
        print('error: incorrect page_input_box value: {}'.format(page_value), file=sys.stderr)
        break

    # Find listings

    listings_divs = soup.find_all('div', class_='search-listing')
    if not listings_divs:
        print('error: no listings on page {}'.format(pageno), file=sys.stderr)
        break

    # Extract listings

    bulk = collection.initialize_ordered_bulk_op()
    pending = 0  # number of inserts queued on the bulk operation
    for div in listings_divs:
        listing_id = int(div['listing_id'])
        if listing_id in seen:
            continue

        listing = {
            # UTC timestamp without fractional seconds; strftime works on
            # both Python 2 and Python 3.
            'created' : datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        }
        listing['listing_id'] = listing_id
        listing['latitude'] = float(div['latitude'])
        listing['longitude'] = float(div['longitude'])
        listing['loc'] = Point((listing['longitude'], listing['latitude']))

        title = div.find('a', class_='listing-title-link')
        listing['url'] = title['href']
        listing['title'] = title.string

        price = div.find(id='listing-{}-price'.format(listing_id))
        listing['price'] = int(re.sub('[^0-9]', '', price.string))

        # Extract listing details

        pp_progress('Scraping page {}: {}'.format(pageno, listing['url']))
        extract_details(listing)
        if 'bedrooms' in listing: # ignore shares
            bulk.insert(listing)
            pending += 1
            seen.add(listing_id)
#            time.sleep(0.25)

    # Save listings.  Executing an empty bulk operation raises
    # InvalidOperation, so only execute when something was queued; any other
    # database error now propagates instead of being silently dropped.

    if pending:
        bulk.execute()
    # Persist the page cache index even when the page added no new listings.
    cache_commit()

print('\nDone!')

Scraping page 1383: https://www.renthop.com/listings/east-16th-street/6p/8661183



Scraping page 1432: https://www.renthop.com/listings/classon/15l/8740631



Scraping page 1433: https://www.renthop.com/listings/west-65th-street/4a/8348997



Scraping page 1516: https://www.renthop.com/listings/dahill-road/5t/8503853



Scraping page 1556: https://www.renthop.com/listings/31st-street/3b/8695181



Scraping page 1563: https://www.renthop.com/listings/west-26th-street/808/8448910



Scraping page 1794: https://www.renthop.com/listings/hancock-st/e/8642212



Scraping page 1860: https://www.renthop.com/listings/east-71st-street/jp87/8706783



Scraping page 1878: https://www.renthop.com/listings/e-4th-st/1/8633639



Scraping page 1892: https://www.renthop.com/listings/staniford-st/3404/8106016



Scraping page 1895: https://www.renthop.com/listings/63rd-st-3rd-ave/425/8546362



Scraping page 1904: https://www.renthop.com/listings/hancock-street/1j/8602189



Scraping page 1909: https://www.renthop.com/listings/east-32nd-street/6-a/8448596



Scraping page 1912: https://www.renthop.com/listings/west-43rd-street/na/8592951



Scraping page 1925: https://www.renthop.com/listings/mott-street/4n/8341585



Scraping page 1957: https://www.renthop.com/listings/e-37th-st/52/7401877



Scraping page 1958: https://www.renthop.com/listings/kneeland-st/1907/8220130



Scraping page 1969: https://www.renthop.com/listings/30th-street/4-a/8618726



Scraping page 1973: https://www.renthop.com/listings/east-37th-street/1aw/8723937



Scraping page 1988: https://www.renthop.com/listings/eighth-avenue/4a/8688425



Scraping page 1989: https://www.renthop.com/listings/washington-st/2009/8253649



Scraping page 1991: https://www.renthop.com/listings/807-halsey-street/3/7991740



Scraping page 1993: https://www.renthop.com/listings/nashua-st/2902/8401702



Scraping page 1995: https://www.renthop.com/listings/sheridan-square/2-a/8715309



Scraping page 1996: https://www.renthop.com/listings/east-13th-street/3e/8198929



Scraping page 2004: https://www.renthop.com/listings/astor-pl/3b/8347069



Scraping page 2013: https://www.renthop.com/listings/west-end-avenue/11-c/8556355



Scraping page 2022: https://www.renthop.com/listings/west-77th-street/03g/8561846



Scraping page 2027: https://www.renthop.com/listings/west-31st-street/na/8695762



Scraping page 2040: https://www.renthop.com/listings/west-11th-street/6-a/8453703



Scraping page 2070: https://www.renthop.com/listings/e-46th-st/11/7511504



Scraping page 2071: https://www.renthop.com/listings/east-115th-street/4/8332077



Scraping page 2089: https://www.renthop.com/listings/seaport-blvd/ph1614/8406353



Scraping page 2113: https://www.renthop.com/listings/east-116th-street/2-a/8188808



Scraping page 2126: https://www.renthop.com/listings/38th-st/3r/8657629



Scraping page 2126: https://www.renthop.com/listings/w-47-st/8t/8732359



Scraping page 2132: https://www.renthop.com/listings/west-39th-street/ml069/8704985



Scraping page 2159: https://www.renthop.com/listings/w-21st-st/303/8735122



Scraping page 2160: https://www.renthop.com/listings/10th-ave/39f/8696612



Scraping page 2236: https://www.renthop.com/listings/wall-street/1508/8620478



Scraping page 2292: https://www.renthop.com/listings/e-77-st/na/8708730



Scraping page 2310: https://www.renthop.com/listings/fifth-avenue/6d/8523849



Scraping page 2443: https://www.renthop.com/listings/e-54-street/4a/8705808



Scraping page 2443: https://www.renthop.com/listings/maiden-lane/506/8570902



Scraping page 2463: https://www.renthop.com/listings/220-w-107th/4d/8721591



Scraping page https://www.renthop.com/search/nyc?min_price=0&max_price=50000&sort=hopscore&page=3174 
Done!


error: incorrect page_input_box value: {} 3173


In [434]:
import pandas as pd
import numpy as np
# Fields fetched from MongoDB and kept as DataFrame columns.
columns = ['listing_id', 'bedrooms', 'bathrooms', 'features', 'price']
# Pull every listing, projected onto those fields, into memory.
listing_docs = list(collection.find({}, columns))
df = pd.DataFrame(listing_docs, columns=columns)
df.info()

217
