In [1]:
'''
Copyright (c) 2019, Crystal Xue
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
3. All advertising materials mentioning features or use of this software
   must display the following acknowledgement:
   This product includes software developed by the <organization>.
4. Neither the name of the <organization> nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY CRYSTAL XUE ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL CRYSTAL XUE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''

"\nCopyright (c) 2019, Crystal Xue\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n1. Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n2. Redistributions in binary form must reproduce the above copyright\n   notice, this list of conditions and the following disclaimer in the\n   documentation and/or other materials provided with the distribution.\n3. All advertising materials mentioning features or use of this software\n   must display the following acknowledgement:\n   This product includes software developed by the <organization>.\n4. Neither the name of the <organization> nor the\n   names of its contributors may be used to endorse or promote products\n   derived from this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY CRYSTAL XUE ''AS IS'' AND AN

In [2]:
# This cell downloads necessary packages, which contain functions that the program will call later to scrape necessary information.
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
import urllib
from bs4 import BeautifulSoup
import numpy as np
import csv, os, sys

In [3]:
def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, headers=get_headers(), stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [4]:
def get_address_from_detail_page(detailPageUrl, isDebug=True):
    # returns address from second level page (i.e. not the landing page, but the listing page)
    response = simple_get(detailPageUrl)
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        for li in html.select('p[class="mapaddress"]'):
            googlemap = li.select('a')[0].get('href').strip(' ,\t\n\r')
            googlemap = urllib.parse.unquote(googlemap)
            
            return googlemap
    
    raise Exception('Error retrieving contents at {}'.format(detailPageUrl))

In [13]:
def get_address_price(html, isDebug=True):
    # Scrapes desired information from html code. Can be modified to include more/less information as long as content is in html file.
    addressPriceList = list()
    na = 'n/a'
    i = 0
    for li in html.select('li[class="result-row"]'):
        # selects each property
        i = i+1
            
        if li.select('a[class="result-image gallery"]'):
            detailPageUrl = li.select('a[class="result-image gallery"]')[0].get('href').strip(' ,\t\n\r')
            if isDebug:
                print(detailPageUrl)
            
            googlemap = get_address_from_detail_page(detailPageUrl, isDebug)
            if googlemap:
                if isDebug:    
                    print(googlemap)

                if li.select('span[class="result-price"]'):
                    # returns price (if listed)
                    resultPrice = li.select('span[class="result-price"]')[0].text.strip('$ ,\t\n\r')
                else: 
                    resultPrice = na
                if isDebug:
                    print(resultPrice)

                if li.select('span[class="housing"]'):
                    # returns type of housing (generally given in the number of bedrooms and square footage)
                    housing = li.select('span[class="housing"]')[0].text.replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "").replace("-", " ").strip()
                else:
                    housing = na    
                if isDebug:
                    print(housing)
                
                bed = ""
                sqft = ""
                
                if housing != na:
                    if (housing.find("br")>= 0):
                        bed = housing[:housing.index("b")]
                        #returns number for bedrooms from general housing data (if possible)
                    if (housing.find("ft2")>= 0):
                        #returns square footage from general housing data (if possible)
                        if (housing.find(" ")>= 0):
                            sqft = housing[housing.index(" ")+1:housing.index("f")]
                        else:
                            sqft = housing[:housing.index("f")]
                
                # Add listing to database
                    addressPriceList.append([detailPageUrl, resultPrice, housing, bed, sqft, googlemap])
        
    # Print number of properties on page. Should be used as a check to make sure scraper is passing through all properties.
    print('# cards in page: ', i)
    
    # Print number of properties listed in Dayton, OH. These properties will be written to csv file.
    print('# cards in city: ', len(addressPriceList))
    
    return addressPriceList
    
    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(i))

In [14]:
#Isolates and call information for each individual property
def get_property_card(city, state, page=1, isDebug=True):
  
    url = 'https://dayton.craigslist.org/search/hhh?availabilityMode=1&excats=2-17-21-1-17-7-34-22&postal=45402&search_distance=7&sort=date&s=' + str((page-1)*120)
    
    print(url)
    
    '''
    Use this code for debugging. Code returns html file of page and stops. Use it isolate pieces of information and ensure return is html.
    
    response = simple_get(url)
    html = BeautifulSoup(response, 'html.parser')
    return html
    '''

    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        return get_address_price(html, isDebug)
    
    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))

In [19]:
# Test cell. Ensure scraper is running properly by testing with test page.
property_cards = get_property_card('Dayton', 'OH', 3, isDebug=True)

https://dayton.craigslist.org/search/hhh?availabilityMode=1&excats=2-17-21-1-17-7-34-22&postal=45402&search_distance=7&sort=date&s=240
https://dayton.craigslist.org/apa/d/dayton-looking-for-change-have-just/6935222010.html
https://www.google.com/maps/preview/@39.769157,-84.206123,16z
925
3br 1368ft2
https://dayton.craigslist.org/apa/d/dayton-rare-3-br-great-location-first/6936761976.html
https://www.google.com/maps/preview/@39.665100,-84.240100,16z
899
3br 1100ft2
https://dayton.craigslist.org/apa/d/dayton-3-story-townhome-3-bedrooms/6936742089.html
https://www.google.com/maps/preview/@39.762433,-84.205153,16z
799
3br 1500ft2
https://dayton.craigslist.org/apa/d/dayton-fully-furnished-studio-for-rent/6936703580.html
https://www.google.com/maps/preview/@39.775978,-84.066377,16z
1740
1br 350ft2
https://dayton.craigslist.org/apa/d/dayton-large-3-bdrm-half-double/6936736992.html
https://www.google.com/maps/preview/@39.746640,-84.155847,16z
825
3br 1200ft2
https://dayton.craigslist.org/apa/d

https://www.google.com/maps/preview/@39.789900,-84.213500,16z
835
1br 855ft2
https://dayton.craigslist.org/apa/d/dayton-looking-for-great-place-to-live/6925313542.html
https://www.google.com/maps/preview/@39.769157,-84.206123,16z
899
3br 1368ft2
https://dayton.craigslist.org/apa/d/dayton-looking-for-great-place-to-live/6925307928.html
https://www.google.com/maps/preview/@39.765817,-84.204470,16z
650
1br 660ft2
https://dayton.craigslist.org/apa/d/dayton-like-to-cook-have-large-kitchens/6925317266.html
https://www.google.com/maps/preview/@39.769157,-84.206123,16z
899
3br 1368ft2
https://dayton.craigslist.org/apa/d/dayton-your-new-home-is-here-all-that/6925358910.html
https://www.google.com/maps/preview/@39.789900,-84.213500,16z
995
2br 1237ft2
https://dayton.craigslist.org/apa/d/dayton-your-new-home-is-here-all-that/6925355670.html
https://www.google.com/maps/preview/@39.789900,-84.213500,16z
1225
2br 1237ft2
https://dayton.craigslist.org/apa/d/dayton-looking-for-great-place-to-live/6925

https://www.google.com/maps/preview/@39.665100,-84.240100,16z
729
2br 985ft2
https://dayton.craigslist.org/apa/d/dayton-september-availability-for-your/6934716622.html
https://www.google.com/maps/preview/@39.748834,-84.121166,16z
629
2br 790ft2
https://dayton.craigslist.org/apa/d/dayton-dishwasher-clubhouse-breakfast/6934714912.html
https://www.google.com/maps/preview/@39.680431,-84.122519,16z
669
1br 650ft2
https://dayton.craigslist.org/reo/d/dayton-we-buy-houses-we-use-private/6925696125.html
https://www.google.com/maps/preview/@39.684100,-84.163300,16z
39900
n/a
https://dayton.craigslist.org/apa/d/dayton-great-deals-great-location-this/6934679501.html
https://www.google.com/maps/preview/@39.665100,-84.240100,16z
799
2br 922ft2
https://dayton.craigslist.org/apa/d/dayton-august-is-almost-here-reserve/6934671357.html
https://www.google.com/maps/preview/@39.748642,-84.121284,16z
614
2br 790ft2
https://dayton.craigslist.org/apa/d/dayton-house-for-rent/6934663540.html
https://www.google.c

In [16]:
# Test cell. Ensure scraper is only returning properties listed in Dayton, OH.
print(property_cards)

NameError: name 'property_cards' is not defined

In [None]:
# Test cell. Count output of previous cell and check for match.
len(property_cards)

In [18]:
# Run scraper for all addresses in Dayton
totalAddressList = list()
i = 1
while True:
    addresses = get_property_card('Dayton', 'OH', i, isDebug=False)
    print(addresses)
    l = len(addresses)
    print(l)
    print(i)
    if l > 0 :
        totalAddressList = totalAddressList + addresses
        print(len(totalAddressList))
        i = i + 1
    else:
        break
        
print(totalAddressList)
print(len(totalAddressList))

https://dayton.craigslist.org/search/hhh?availabilityMode=1&excats=2-17-21-1-17-7-34-22&postal=45402&search_distance=7&sort=date&s=0
# cards in page:  120
# cards in city:  112
[['https://dayton.craigslist.org/apa/d/dayton-clubhouse-pet-friendly-swimming/6939640797.html', '800', '2br 900ft2', '2', '900', 'https://www.google.com/maps/preview/@39.674900,-84.113600,16z'], ['https://dayton.craigslist.org/apa/d/dayton-your-new-home-awaits-all-that-is/6921526933.html', '650', '1br 660ft2', '1', '660', 'https://www.google.com/maps/preview/@39.765817,-84.204470,16z'], ['https://dayton.craigslist.org/apa/d/dayton-amazing-1-bedroom-apartment-just/6932319227.html', '865', '1br 855ft2', '1', '855', 'https://www.google.com/maps/preview/@39.789900,-84.213500,16z'], ['https://dayton.craigslist.org/apa/d/dayton-looking-for-great-place-to-live/6921513134.html', '899', '3br 1368ft2', '3', '1368', 'https://www.google.com/maps/preview/@39.769157,-84.206123,16z'], ['https://dayton.craigslist.org/apa/d/dayt

# cards in page:  120
# cards in city:  117
[['https://dayton.craigslist.org/apa/d/dayton-your-new-home-awaits-all-that-is/6925288345.html', '650', '1br 660ft2', '1', '660', 'https://www.google.com/maps/preview/@39.765817,-84.204470,16z'], ['https://dayton.craigslist.org/apa/d/dayton-amazing-1-bedroom-apartment-just/6932328275.html', '865', '1br 855ft2', '1', '855', 'https://www.google.com/maps/preview/@39.789900,-84.213500,16z'], ['https://dayton.craigslist.org/apa/d/dayton-your-new-home-awaits-all-that-is/6930891274.html', '650', '1br 660ft2', '1', '660', 'https://www.google.com/maps/preview/@39.765817,-84.204470,16z'], ['https://dayton.craigslist.org/apa/d/dayton-amazing-1-bedroom-apartment-just/6932281105.html', '865', '1br 855ft2', '1', '855', 'https://www.google.com/maps/preview/@39.789900,-84.213500,16z'], ['https://dayton.craigslist.org/apa/d/dayton-amazing-2-bedroom-apartment-home/6932283744.html', '1225', '2br 1237ft2', '2', '1237', 'https://www.google.com/maps/preview/@39.78

# cards in page:  120
# cards in city:  117
[['https://dayton.craigslist.org/apa/d/dayton-3-story-townhome-3-bedrooms/6936742089.html', '799', '3br 1500ft2', '3', '1500', 'https://www.google.com/maps/preview/@39.762433,-84.205153,16z'], ['https://dayton.craigslist.org/apa/d/dayton-fully-furnished-studio-for-rent/6936703580.html', '1740', '1br 350ft2', '1', '350', 'https://www.google.com/maps/preview/@39.775978,-84.066377,16z'], ['https://dayton.craigslist.org/apa/d/dayton-large-3-bdrm-half-double/6936736992.html', '825', '3br 1200ft2', '3', '1200', 'https://www.google.com/maps/preview/@39.746640,-84.155847,16z'], ['https://dayton.craigslist.org/apa/d/dayton-last-1-br-apply-today/6936717791.html', '549', '1br 550ft2', '1', '550', 'https://www.google.com/maps/preview/@39.719360,-84.142223,16z'], ['https://dayton.craigslist.org/apa/d/dayton-huge-3-br-townhome-rare-apply/6936701378.html', '929', '3br 1166ft2', '3', '1166', 'https://www.google.com/maps/preview/@39.665100,-84.240100,16z'], [

# cards in page:  120
# cards in city:  115
[['https://dayton.craigslist.org/apa/d/dayton-chich-2-br-2-bath-first-floor/6934539405.html', '879', '2br 1030ft2', '2', '1030', 'https://www.google.com/maps/preview/@39.665100,-84.240100,16z'], ['https://dayton.craigslist.org/apa/d/completely-renovated-with-newer/6914331385.html', '435', '1br', '1', '', 'https://www.google.com/maps/preview/@39.786023,-84.215153,16z'], ['https://dayton.craigslist.org/reo/d/dayton-2305-keenan-ave/6934486647.html', '35000', '2br 900ft2', '2', '900', 'https://www.google.com/maps/preview/@39.806262,-84.190510,16z'], ['https://dayton.craigslist.org/apa/d/dayton-keep-calm-and-love-your-new-home/6934484670.html', '629', '790ft2', '', '790', 'https://www.google.com/maps/preview/@39.748312,-84.121284,16z'], ['https://dayton.craigslist.org/apa/d/dayton-last-chance-incredibly-spacious/6934468306.html', '614', '2br 790ft2', '2', '790', 'https://www.google.com/maps/preview/@39.748312,-84.121370,16z'], ['https://dayton.cra

Error during requests to https://dayton.craigslist.org/apa/d/dayton-stonebridge-is-just-stones-throw/6928035705.html : HTTPSConnectionPool(host='dayton.craigslist.org', port=443): Max retries exceeded with url: /apa/d/dayton-stonebridge-is-just-stones-throw/6928035705.html (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response',)))


Exception: Error retrieving contents at https://dayton.craigslist.org/apa/d/dayton-stonebridge-is-just-stones-throw/6928035705.html

In [None]:
# Test cell. Double-check that totalAddressList is in form that can be written to csv file. Should see "list" in output.
type(totalAddressList)

In [9]:
# Create csv file. Headers can be modified to refer to actual values in list.
import csv

header = ['detailPageUrl', 'resultPrice', 'housing', 'bed', 'sqft', 'googlemap',]
type(header)

list

In [92]:
# Write totalAddressList to csv file and download csv file.
with open('finalTotalAddressCraigList.csv', 'w') as f:
    csv_writer = csv.writer(f)
 
    csv_writer.writerow(header) # write header
 
    for row in totalAddressList:
        csv_writer.writerow(row)
        
print("Done")

Done
