# Web Scraping Craiglist Housing Prices
- This notebook is a tutorial on how to scrape Craigslist housing data with Python. 

In [2]:
# import all modules that will be used in our scraper

from requests import get 
from bs4 import BeautifulSoup
from time import sleep
import re
from random import randint 
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
# request the contents of the page we're scraping

results = get('https://www.drudgereport.com/')

In [4]:
# make the content we grabbed easy to read

html_soup = BeautifulSoup(results.text, 'html.parser')

In [5]:
html_soup

 <html>
<head>
<meta content="upgrade-insecure-requests" http-equiv="Content-Security-Policy"/>
<title>DRUDGE REPORT 2022®</title>
<!-- eProof, DR Headline Updater -->
<script src="//cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js" type="text/javascript"></script>
<script src="//eproof.drudgereport.com/dr.js" type="text/javascript"></script>
<!-- eProof.com end -->
<script src="/js/cookie_filter_1.0.1.js" type="text/javascript"></script>
<script type="text/javascript">
<!-- 
var img = new Image(), url = "/204.png", container = document.getElementById("div-204");
img.onload = function () { container.appendChild(img); };
img.src = url;
var timer = setInterval("__drudge__321__autoRefresh()", 1000 * 35 * 3);
function __drudge__321__autoRefresh(){self.location.reload(true);}
(function () {
    var __oldClearInterval = clearInterval;

    clearInterval = function (arg) {
        if(arg == timer) {
                console.log("clearInterval of TIMER intercepted! (" + arg + ")");
   

In [6]:
# get the macro-container containing the posts we want

post = html_soup.find_all('li', class_= 'result-row')

print(type(post)) # check I got a ResultSet
print(len(post)) # check I got 120 

<class 'bs4.element.ResultSet'>
0


In [41]:
# grab the first post

post_one = post[0]

In [42]:
print(post_one)

<li class="result-row" data-pid="7268318498">
<a class="result-image gallery" data-ids="3:00G0G_2sMb5yO0bTaz_1440I3,3:01616_3LiWmir2BBuz_1440I3,3:00K0K_k0FqTVASag0z_1440I3,3:00w0w_lyFcG9KF20Kz_1440I3,3:00P0P_fNTvbr6Ahefz_1440I3,3:00n0n_ec7vT7ZfjS9z_1440I3,3:00d0d_YzyVMrH5IRz_1440I3,3:00404_c9swjlbrWrbz_1440I3,3:00101_c70HmWhgqxWz_04q05L" href="https://phoenix.craigslist.org/cph/apa/d/tempe-covered-parking-available/7268318498.html">
<span class="result-price">$1,015</span>
</a>
<div class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2021-01-26 19:50" title="Tue 26 Jan 07:50:49 PM">Jan 26</time>
<h3 class="result-heading">
<a class="result-title hdrlnk" data-id="7268318498" href="https://phoenix.craigslist.org/cph/apa/d/tempe-covered-parking-available/7268318498.html" id="postid_7268318498">Covered Parking Available, Clubhouse with Free Wi-Fi, Refrigerator</a>
</h3>
<span

In [43]:
#grab the price of the first post

post_one_price = post_one.a.text

In [44]:
post_one_price

'\n$1,015\n'

In [45]:
post_one_price.strip()

'$1,015'

In [46]:
# grab the time of the post 

post_one_time = post_one.find('time', class_= 'result-date')['datetime']
post_one_time

'2021-01-26 19:50'

In [47]:
# grab the title

post_one_title = post_one.find('a', class_='result-title hdrlnk')

post_one_title

<a class="result-title hdrlnk" data-id="7268318498" href="https://phoenix.craigslist.org/cph/apa/d/tempe-covered-parking-available/7268318498.html" id="postid_7268318498">Covered Parking Available, Clubhouse with Free Wi-Fi, Refrigerator</a>

In [48]:
# take only the text element of the post_one_title variable

post_one_title_text = post_one_title.text
post_one_title_text

'Covered Parking Available, Clubhouse with Free Wi-Fi, Refrigerator'

In [49]:
# grab the link

post_one_link = post_one_title['href']
post_one_link

'https://phoenix.craigslist.org/cph/apa/d/tempe-covered-parking-available/7268318498.html'

In [50]:
# grab the number of bedrooms

post_one_bedrooms = post_one.find('span', class_ = 'housing').text.split()[0]
post_one_bedrooms

'1br'

In [51]:
# grab the square footage

post_one_sqft = post_one.find('span', class_ = 'housing').text.split()[2][:-3] #cleans the ft2 at the end
post_one_sqft 

'732'

In [52]:
# grab the neighborhood
post_one_neighborhood = post_one.find('span', class_='result-hood').text # [2:-1]


# post_one_neighborhood = post[0].find('span', class_='result-hood').text # [2:-1]

print(post_one_neighborhood)

AttributeError: 'NoneType' object has no attribute 'text'

# Build the Loop

In [53]:
# find the total number of posts to find the limit of the pagination
find_total = html_soup.find('div', class_= 'search-legend')



# grab the total count of posts 
total_posts = int(find_total.find('span', class_='totalcount').text) 



# vary the value of the page parameters
pages = np.arange(0, total_posts+1, 120)



# vary the value of the page parameters
pages = np.arange(0, total_posts+1, 120)



# count tracker for number of iterations
iterations = 0



# initialize empty lists where we'll store our date 
post_times = []
post_neighborhoods = []
post_titles = []
post_bedrooms = []
post_sqft = []
post_links = []
post_prices = []



# create for loop
for page in pages:
    
    # get request
    response = get("https://boston.craigslist.org/search/apa?" 
                   + "s=" # parameter for defining the page number 
                   + str(page) # page number in the pages array 
                   + "&hasPic=1"
                   + "&availabilityMode=0")
    
    # control the crawl rate 
    sleep(randint(1,10))
    
    # throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    # define the html text
    html = BeautifulSoup(response.text, 'html.parser')
    
    # define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
    
    
    # extract data item-wise
    for post in posts:

        # if we aren't missing the neighborhood information
        if post.find('span', class_ = 'result-hood') is not None:

            # date
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_times.append(post_datetime)

            # neighborhoods
            post_hoods = post.find('span', class_= 'result-hood').text
            post_neighborhoods.append(post_hoods)

            # title 
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_titles.append(post_title_text)

            # link
            post_link = post_title['href']
            post_links.append(post_link)
            
            # removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = int(float(post.a.text.strip().replace("$", "").replace(",","")))
            post_prices.append(post_price)
            
            
            
            
            
            # if the number of bedrooms OR sqft aren't missing 
            if post.find('span', class_ = 'housing') is not None:
                
                # if the first element is accidentally square footage
                if 'ft' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    # make bedroom NaN
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                    
                    # make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    post_sqft.append(sqft)
                    
                # if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    post_sqft.append(sqft)
                    
                # if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    post_sqft.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                
                    sqft = np.nan
                    post_sqft.append(sqft)
                
            # if none of those conditions catch, make bedroom NaN 
            else:
                bedroom_count = np.nan
                post_bedrooms.append(bedroom_count)
                
                sqft = np.nan
                post_sqft.append(sqft)
                
    iterations += 1
      
    print("Page " + str(iterations) + " scraped successfully!")

    
print("\n")
print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Page 21 scraped successfully!
Page 22 scraped successfully!
Page 23 scraped successfully!
Page 24 scraped successfully!
Page 25 scraped successfully!
Page 26 scraped successfully!


Scrape complete!


In [55]:
phx_apts = pd.DataFrame({'posted': post_times,
                       'neighborhood': post_neighborhoods,
                       'post title': post_titles,
                       'number bedrooms': post_bedrooms,
                        'sqft': post_sqft,
                        'URL': post_links,
                       'price': post_prices})

print(phx_apts.info())
phx_apts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664 entries, 0 to 1663
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   posted           1664 non-null   object 
 1   neighborhood     1664 non-null   object 
 2   post title       1664 non-null   object 
 3   number bedrooms  1560 non-null   object 
 4   sqft             1352 non-null   float64
 5   URL              1664 non-null   object 
 6   price            1664 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 91.1+ KB
None


Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2021-01-26 19:44,(PHOENIX),"Dog Park, 24 hour Fitness Center, Garages Avai...",2,865.0,https://phoenix.craigslist.org/cph/apa/d/phoen...,1251
1,2021-01-26 19:42,"(5740 North 59th Ave, Glendale, AZ, US)",Make Ranchwood your home TODAY! Apply online f...,1,540.0,https://phoenix.craigslist.org/wvl/apa/d/glend...,888
2,2021-01-26 19:41,"(5740 North 59th Ave, Glendale, AZ, US)",High speed internet included! Make Ranchwood y...,1,676.0,https://phoenix.craigslist.org/wvl/apa/d/glend...,881
3,2021-01-26 19:27,(near Scottsdale Rd / McDowell),Home in Scottsdale,3,,https://phoenix.craigslist.org/evl/apa/d/scott...,1895
4,2021-01-26 19:13,(Phoenix),Luxury studio residence atop Palomar Hotel in ...,1,840.0,https://phoenix.craigslist.org/cph/apa/d/phoen...,800
...,...,...,...,...,...,...,...
1659,2021-01-26 17:55,"(2625 E. Camelback Rd., Phoenix, AZ)","Sit Back, Relax. Your New View Awaits.",2,1074.0,https://phoenix.craigslist.org/evl/apa/d/phoen...,2408
1660,2021-01-26 17:53,"(2300 W Pecos Rd Chandler, AZ)",Come see our beautiful community!!!,1,756.0,https://phoenix.craigslist.org/evl/apa/d/chand...,1352
1661,2021-01-26 17:53,(Phoenix),Two bedroom in Beautiful Ahwatukee,2,1205.0,https://phoenix.craigslist.org/evl/apa/d/phoen...,1590
1662,2021-01-26 17:52,(Mesa),Stay & Play! Fun Amenities! Call Now!,2,828.0,https://phoenix.craigslist.org/evl/apa/d/mesa-...,1185


In [61]:
# to move all the scraped data to a CSV file

phx_apts.to_csv('phx_apts.csv', index=False)