In [1]:
# Import Libraries
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import  pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import time
import re
import os
from functools import reduce
# Matches any run of one or more characters other than digits, '.' and ','.
# NOTE(review): `trim` is not referenced in the cells visible here — confirm it is still needed.
trim = re.compile(r'[^\d.,]+')

# Zip code listings for New York and New Jersey
# (presumably candidate sources for additional zip codes — TODO confirm):
# http://www.zipcodestogo.com/New%20York/
# http://www.zipcodestogo.com/New%20Jersey/

### Read in NYC Zip Codes
Source: https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm

In [2]:
# Load the zip code lookup table (Borough, Neighborhood, ZipCode) from the
# current working directory.
cwd = os.getcwd()
# os.path.join chooses the separator for the host OS instead of a hard-coded "/".
zipcodes = pd.read_csv(os.path.join(cwd, "nyc-zip-codes.csv"))
zipcodes.head()

Unnamed: 0,Borough,Neighborhood,ZipCode
0,Bronx,Central Bronx,10453
1,Bronx,Central Bronx,10457
2,Bronx,Central Bronx,10460
3,Bronx,Bronx Park and Fordham,10458
4,Bronx,Bronx Park and Fordham,10467


### Generate Craigslist Links

In [3]:
# Build one Craigslist apartment-search URL per row of `zipcodes`,
# filling in the zip code stored in the third column.
search_url = "https://newyork.craigslist.org/search/aap?postal={}"
base_links = [
    search_url.format(zipcodes.iloc[row, 2])
    for row in range(len(zipcodes))
]

In [4]:
# Show the first five generated search URLs as a sanity check.
base_links[:5]

['https://newyork.craigslist.org/search/aap?postal=10453',
 'https://newyork.craigslist.org/search/aap?postal=10457',
 'https://newyork.craigslist.org/search/aap?postal=10460',
 'https://newyork.craigslist.org/search/aap?postal=10458',
 'https://newyork.craigslist.org/search/aap?postal=10467']

### Collect Listings Data

In [5]:
# Scrape the first page of apartment search results for every zip code and
# collect one row per listing into `housing`.
start_time = time.time()
sleep_time = 3  # seconds to pause before each request

length = len(base_links)
#length = 5

# "%s seconds": the previous "%  seconds" was parsed as the conversion "%  s",
# which swallowed the "s" and printed e.g. "534econds".
print("--- Expected Run Time: %s seconds ---" % (length*sleep_time))


def _node_text(row, tag, css_class):
    """Return the text of the first `tag` with class `css_class` in `row`, or None if absent."""
    node = row.find(tag, {'class': css_class})
    return None if node is None else node.text


def _node_attr(node, attr):
    """Return `node[attr]`, or None when the node or the attribute is missing."""
    return None if node is None else node.get(attr)


# One session for the whole run so connections are reused. The retry adapter is
# mounted for https:// as well: every search URL is https, so mounting it on
# http:// alone meant the retry policy was never applied.
s = requests.Session()
retries = Retry(total=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', adapter)
s.mount('https://', adapter)

# Per-zip-code frames are gathered here and concatenated once at the end,
# instead of re-copying the growing frame on every iteration.
frames = []

for i in range(0, length):

    time.sleep(sleep_time)
    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        r = s.get(base_links[i], timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Keep what has been collected so far; report the failure and move on.
        print("--- Skipping %s: %s ---" % (base_links[i], err))
        continue
    page = BeautifulSoup(r.content, "lxml")

    borough = zipcodes.iloc[i,0]
    neighborhood = zipcodes.iloc[i,1]
    zipcode = zipcodes.iloc[i,2]

    records = []
    for row in page.find_all('li', {'class': 'result-row'}):
        records.append((
            borough,
            neighborhood,
            zipcode,
            _node_attr(row.find('time', {'class': 'result-date'}), 'datetime'),  # Date
            _node_text(row, 'span', 'result-price'),                             # Price
            _node_text(row, 'span', 'housing'),                                  # Bedroom
            _node_text(row, 'a', 'result-title hdrlnk'),                         # Title
            _node_text(row, 'span', 'result-hood'),                              # Location
            _node_attr(row.find('a', href = True), 'href'),                      # Link
        ))

    # Pages without listings contribute nothing, so they are not appended.
    if records:
        frames.append(pd.DataFrame(records))

# Each frame keeps its own 0..n-1 index, as before. If nothing was scraped,
# fall back to an empty frame that still has the nine positional columns the
# cleaning cell renames.
housing = pd.concat(frames) if frames else pd.DataFrame(columns=list(range(9)))

print("--- %s seconds ---" % (time.time() - start_time))
print("--- %s seconds NOT sleeping ---" % (time.time() - start_time - (sleep_time * length)))

--- Expected Run Time: 534econds ---
--- 623.296999931 seconds ---
--- 89.2969999313 seconds NOT sleeping ---


In [6]:
# Clean the data
housing.columns = ["Borough", "Neighborhood", "Zipcode", "Date", "Price", "Bedrooms", "Title", "Location", "Link"]

# Price: drop the currency symbol ("$2000" -> "2000").
# regex=False because "$" would otherwise be read as an end-of-string anchor.
housing["Price"] = housing["Price"].str.replace('$', '', regex=False)

# Bedrooms: keep only the "<digits>br" token and discard the rest (square
# footage, dashes, newlines). Rows without such a token become missing.
# The previous check `text.find('br') == True` was true only when "br" sat at
# index 1, so values such as "10br" were wrongly discarded.
housing["Bedrooms"] = housing["Bedrooms"].str.extract(r'(\d+br)', expand=False)

# Location: remove the surrounding parentheses.
housing["Location"] = (
    housing["Location"]
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Remove duplicates
housing = housing.drop_duplicates(subset = ['Borough', 'Neighborhood', 'Zipcode', 'Price', 'Bedrooms', 'Location', 'Title'], keep = 'first')

In [11]:
# Inspect the last five cleaned listings.
housing.tail(5)

Unnamed: 0,Borough,Neighborhood,Zipcode,Date,Price,Bedrooms,Title,Location,Link
37,Staten Island,Mid-Island,10314,2018-03-06 13:10,2000,2br,STUNNING...2BR...DUPLEX APARTMENT,Travis,https://newyork.craigslist.org/stn/fee/d/stunn...
38,Staten Island,Mid-Island,10314,2018-03-02 17:45,2050,3br,Beautiful 3 BR Condo with Garage for Rent near...,Staten Island,https://newyork.craigslist.org/stn/fee/d/beaut...
39,Staten Island,Mid-Island,10314,2018-02-23 15:11,2400,3br,Great 3Bed/2 bath HOUSE in Manor Heights Hurry!@,29 Sunset Ave,https://newyork.craigslist.org/stn/fee/d/great...
41,Staten Island,Mid-Island,10314,2018-02-16 09:29,1750,2br,MINT NEWLY RENOVATED 2BR W/YARD - W/D,HEARTLAND VILLAGE,https://newyork.craigslist.org/stn/fee/d/mint-...
42,Staten Island,Mid-Island,10314,2018-02-16 09:14,1850,2br,"MINT NEWLY RENOVATED 2BR W/YARD, PARKING - W/D",HEARTLAND VILLAGE,https://newyork.craigslist.org/stn/fee/d/mint-...


### Export the Data

In [14]:
# Alternative plain-text export:
#housing.to_csv("housing.csv", index = False)

# Write the cleaned listings to an Excel workbook. The context manager saves
# and closes the file even if to_excel raises; it replaces writer.save(),
# which was removed in pandas 2.0. sheet_name is passed by keyword because
# the positional form is deprecated.
with pd.ExcelWriter('housing.xlsx') as writer:
    housing.to_excel(writer, sheet_name='Sheet1')

### To Do
1. Clean the 'bedrooms' variable properly
2. Figure out how to remove duplicates
3. Add NJ zipcodes
4. Add other important factors (crime, commute to JPM [Maps API])

### Other Ideas

Links:
- http://www.areavibes.com/
- http://www.city-data.com/zips/10024.html
- https://www.melissadata.com/lookups/index.htm
- http://www.relocationessentials.com/aff/www/tools/community/index.aspx

Variables:
- Estimated zip code population