In [1]:
# Import Libraries
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import  pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import time
import re
import os
from functools import reduce
trim = re.compile(r'[^\d.,]+')

# http://www.zipcodestogo.com/New%20York/
# http://www.zipcodestogo.com/New%20Jersey/

### Read in NJ Zip Codes
Source: http://www.zipcodestogo.com/New%20Jersey/

In [2]:
zipcodes = pd.read_csv("/Users/erikgregorywebb/Documents/Python/nyc-housing/Data/nj-zip-codes.csv")
zipcodes.tail()

Unnamed: 0,County,City,ZipCode
166,Union,Linden,7036
167,Union,Rahway,7065
168,Union,Roselle,7203
169,Union,Roselle Park,7204
170,Union,Union,7083


### Generate Craigslist Links

In [3]:
base_links = []
for i in range(0, len(zipcodes)):
    link = "https://newjersey.craigslist.org/search/apa?postal={}".format(zipcodes.iloc[i,2])
    base_links.append(link)

In [4]:
base_links[0:5]

['https://newjersey.craigslist.org/search/apa?postal=7620',
 'https://newjersey.craigslist.org/search/apa?postal=7621',
 'https://newjersey.craigslist.org/search/apa?postal=7603',
 'https://newjersey.craigslist.org/search/apa?postal=7072',
 'https://newjersey.craigslist.org/search/apa?postal=7010']

In [5]:
len(base_links)

171

### Generate Craigslist Links [2nd Method]

In [6]:
links = []
for i in range(0, 3000, 120):
    link = "https://newjersey.craigslist.org/search/apa?s={}".format(i)
    links.append(link)

In [7]:
links[0:5]

['https://newjersey.craigslist.org/search/apa?s=0',
 'https://newjersey.craigslist.org/search/apa?s=120',
 'https://newjersey.craigslist.org/search/apa?s=240',
 'https://newjersey.craigslist.org/search/apa?s=360',
 'https://newjersey.craigslist.org/search/apa?s=480']

In [8]:
base_links = links

### Collect Listings Data

In [9]:
start_time = time.time()
sleep_time = 1

housing = pd.DataFrame()
length = len(base_links)
#length = 1

print("--- Expected Run Time: % seconds ---" % (length*sleep_time))

for i in range(0, length):
    
    time.sleep(sleep_time)
    s = requests.Session()
    retries = Retry(total=3, backoff_factor=0.5)
    s.mount('http://', HTTPAdapter(max_retries=retries))
    r = s.get(base_links[i])
    page = BeautifulSoup(r.content, "lxml")
    
    dates = []
    prices = []
    bedrooms = []
    titles = []
    locations = []
    links = []

    rows = page.findAll('li', {'class': 'result-row'})

    for row in rows:
        # Date
        try: date = row.find('time', {'class': 'result-date'})['datetime']
        except: date = None
        
        # Price
        try: price = row.find('span', {'class': 'result-price'}).text
        except: price = None
            
        # Bedroom
        try: bedroom = row.find('span', {'class': 'housing'}).text
        except: bedroom = None
        
        # Title
        try: title = row.find('a', {'class': 'result-title hdrlnk'}).text
        except: title = None
        
        # Location
        try: location = row.find('span', {'class': 'result-hood'}).text
        except: location = None
        
        # Link
        try: link = row.find('a', href = True)['href']
        except: link = None
        
        dates.append(date)
        prices.append(price)
        bedrooms.append(bedroom)
        titles.append(title)
        locations.append(location)
        links.append(link)
        
    county = [zipcodes.iloc[i,0]] * len(dates)
    city = [zipcodes.iloc[i,1]] * len(dates)
    zipcode = [zipcodes.iloc[i,2]] * len(dates)
        
    temp = pd.DataFrame(list(zip(county, city, zipcode, dates, prices, bedrooms, titles, locations, links)))
    housing = pd.concat([housing, temp])

print("--- %s seconds ---" % (time.time() - start_time))
print("--- %s seconds NOT sleeping ---" % (time.time() - start_time - (sleep_time * length)))

--- Expected Run Time: 25econds ---
--- 66.30304193496704 seconds ---
--- 41.303170919418335 seconds NOT sleeping ---


In [10]:
# Clean the data
housing.columns = ["County", "City", "Zipcode", "Date", "Price", "Bedrooms", "Title", "Location", "Link"]

for i in range(0, len(housing)):
    try: housing.iloc[i,4] = housing.iloc[i,4].replace('$', '')
    except: housing.iloc[i,4] = housing.iloc[i,4]
    
    try: housing.iloc[i,5] = housing.iloc[i,5].replace('\n', '')
    except: housing.iloc[i,5] = housing.iloc[i,5]
    
    try: housing.iloc[i,5] = housing.iloc[i,5].replace('-', '')
    except: housing.iloc[i,5] = housing.iloc[i,5]
        
    try: housing.iloc[i,5] = housing.iloc[i,5].strip()
    except: housing.iloc[i,5] = housing.iloc[i,5]
    
    try:
        if housing.iloc[i,5].find('br') == True:
            housing.iloc[i,5] = housing.iloc[i,5][0:3]
        else:
            housing.iloc[i,5] = None
    except: None
    
    try: housing.iloc[i,7] = housing.iloc[i,7].replace('(', '')
    except: housing.iloc[i,7] = housing.iloc[i,7]
        
    try: housing.iloc[i,7] = housing.iloc[i,7].replace(')', '')
    except: housing.iloc[i,7] = housing.iloc[i,7]
        
# Remove Duplictates
housing = housing.drop_duplicates(subset = ['County', 'City', 'Zipcode', 'Price', 'Bedrooms', 'Location', 'Title'], keep = 'first')

In [11]:
housing.tail()

Unnamed: 0,County,City,Zipcode,Date,Price,Bedrooms,Title,Location,Link
115,Bergen,Hasbrouck Heights,7604,2018-04-13 22:02,1845,1br,~No Broker Fee! Washer/Dryer in-unit~,Bloomfield,https://newjersey.craigslist.org/apa/d/no-brok...
116,Bergen,Hasbrouck Heights,7604,2018-04-13 22:02,1865,1br,~No Broker Fee! Steps to Train Station~,Bloomfield,https://newjersey.craigslist.org/apa/d/no-brok...
117,Bergen,Hasbrouck Heights,7604,2018-04-13 22:02,1865,1br,No Broker Fee! Bloomfield Luxury!,Bloomfield,https://newjersey.craigslist.org/apa/d/no-brok...
118,Bergen,Hasbrouck Heights,7604,2018-04-13 22:01,1860,1br,No Broker Fee! Stone Counters!,East Rutherford,https://newjersey.craigslist.org/apa/d/no-brok...
119,Bergen,Hasbrouck Heights,7604,2018-04-13 22:01,2355,2br,No Broker Fee! Minutes to NYC!,East Rutherford,https://newjersey.craigslist.org/apa/d/no-brok...


### Export the Data

In [12]:
os.chdir("/Users/erikgregorywebb/Documents/Python/nyc-housing/Data")
housing.to_csv("nj-housing-2.csv")

### Other Ideas

Links:
- http://www.areavibes.com/
- http://www.city-data.com/zips/10024.html
- https://www.melissadata.com/lookups/index.htm
- http://www.relocationessentials.com/aff/www/tools/community/index.aspx

Variables:
- Estimated zip code population