# Web Scraping Craiglist Housing Prices
- This notebook is a tutorial on how to scrape Craigslist housing data with Python. 

In [2]:
# import all modules that will be used in our scraper

from requests import get 
from bs4 import BeautifulSoup
from time import sleep
import re
from random import randint 
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
# request the contents of the page we're scraping

results = get('https://boston.craigslist.org/d/apartments-housing-for-rent/search/apa?availabilityMode=0&hasPic=1')

In [4]:
# make the content we grabbed easy to read

html_soup = BeautifulSoup(results.text, 'html.parser')

In [5]:
html_soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="craigslist" property="og:site_name"/>
<meta content="preview" name="twitter:card"/>
<meta content="boston apartments / housing for rent - craigslist" property="og:title"/>
<meta content="boston apartments / housing for rent - craigslist" name="description"/>
<meta content="boston apartments / housing for rent - craigslist" property="og:description"/>
<meta content="https://boston.craigslist.org/d/apartments-housing-for-rent/search/apa" property="og:url"/>
<title>boston apartments / housing for rent - craigslist</title>
<link href="https://boston.craigslist.org/d/apartments-housing-for-rent/search/apa" rel="canonical"/>
<link href="https://boston.craigslist.org/d/apartments-housing-for-rent/search/apa?s=120&amp;availabilityMode=0&amp;hasPic=1" rel="next"/>
<script id="ld_breadcrumb_data" type="ap

In [6]:
# get the macro-container containing the posts we want

post = html_soup.find_all('li', class_= 'result-row')

print(type(post)) # check I got a ResultSet
print(len(post)) # check I got 120 

<class 'bs4.element.ResultSet'>
120


In [7]:
# grab the first post

post_one = post[0]

In [8]:
print(post_one)

<li class="result-row" data-pid="7430816647">
<a class="result-image gallery" data-ids="3:00909_2PYipng36Huz_0gj0t2,3:01515_1soxcEiYPizz_0gh0t2,3:00z0z_42PwO5KTjuTz_0js0t2,3:00e0e_iYhtAUdOOywz_0fX0t2,3:00101_8NLEfKZqy1sz_0dq0t2,3:00A0A_a9zVy67EInoz_0iq0t2,3:00c0c_cShghPZoQp6z_0hV0t2,3:00S0S_6GWxLfWWQLgz_0fP0t2" href="https://boston.craigslist.org/nos/apa/d/lynn-for-rent/7430816647.html">
<span class="result-price">$2,500</span>
</a>
<div class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2022-01-09 19:52" title="Sun 09 Jan 07:52:02 PM">Jan  9</time>
<h3 class="result-heading">
<a class="result-title hdrlnk" data-id="7430816647" href="https://boston.craigslist.org/nos/apa/d/lynn-for-rent/7430816647.html" id="postid_7430816647">For rent</a>
</h3>
<span class="result-meta">
<span class="result-price">$2,500</span>
<span class="housing">
                    3br -
           

In [9]:
#grab the price of the first post

post_one_price = post_one.a.text

In [10]:
post_one_price

'\n$2,500\n'

In [11]:
post_one_price.strip()

'$2,500'

In [12]:
# grab the time of the post 

post_one_time = post_one.find('time', class_= 'result-date')['datetime']
post_one_time

'2022-01-09 19:52'

In [13]:
# grab the title

post_one_title = post_one.find('a', class_='result-title hdrlnk')

post_one_title

<a class="result-title hdrlnk" data-id="7430816647" href="https://boston.craigslist.org/nos/apa/d/lynn-for-rent/7430816647.html" id="postid_7430816647">For rent</a>

In [14]:
# take only the text element of the post_one_title variable

post_one_title_text = post_one_title.text
post_one_title_text

'For rent'

In [15]:
# grab the link

post_one_link = post_one_title['href']
post_one_link

'https://boston.craigslist.org/nos/apa/d/lynn-for-rent/7430816647.html'

In [17]:
# grab the number of bedrooms

post_one_bedrooms = post_one.find('span', class_ = 'housing').text.split()[0]
post_one_bedrooms

'3br'

In [18]:
# grab the square footage

post_one_sqft = post_one.find('span', class_ = 'housing').text.split()[2][:-3] #cleans the ft2 at the end
post_one_sqft 

IndexError: list index out of range

In [20]:
# grab the neighborhood
post_one_neighborhood = post_one.find('span', class_='result-hood').text # [2:-1]


# post_one_neighborhood = post[0].find('span', class_='result-hood').text # [2:-1]

print(post_one_neighborhood)

 ( north shore )


# Build the Loop

In [21]:
# find the total number of posts to find the limit of the pagination
find_total = html_soup.find('div', class_= 'search-legend')



# grab the total count of posts 
total_posts = int(find_total.find('span', class_='totalcount').text) 



# vary the value of the page parameters
pages = np.arange(0, total_posts+1, 120)



# vary the value of the page parameters
pages = np.arange(0, total_posts+1, 120)



# count tracker for number of iterations
iterations = 0



# initialize empty lists where we'll store our date 
post_times = []
post_neighborhoods = []
post_titles = []
post_bedrooms = []
post_sqft = []
post_links = []
post_prices = []



# create for loop
for page in pages:
    
    # get request
    response = get("https://boston.craigslist.org/search/apa?" 
                   + "s=" # parameter for defining the page number 
                   + str(page) # page number in the pages array 
                   + "&hasPic=1"
                   + "&availabilityMode=0")
    
    # control the crawl rate 
    sleep(randint(1,10))
    
    # throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    # define the html text
    html = BeautifulSoup(response.text, 'html.parser')
    
    # define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
    
    
    # extract data item-wise
    for post in posts:

        # if we aren't missing the neighborhood information
        if post.find('span', class_ = 'result-hood') is not None:

            # date
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_times.append(post_datetime)

            # neighborhoods
            post_hoods = post.find('span', class_= 'result-hood').text
            post_neighborhoods.append(post_hoods)

            # title 
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_titles.append(post_title_text)

            # link
            post_link = post_title['href']
            post_links.append(post_link)
            
            # removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = int(float(post.a.text.strip().replace("$", "").replace(",","")))
            post_prices.append(post_price)
            
            
            
            
            
            # if the number of bedrooms OR sqft aren't missing 
            if post.find('span', class_ = 'housing') is not None:
                
                # if the first element is accidentally square footage
                if 'ft' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    # make bedroom NaN
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                    
                    # make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    post_sqft.append(sqft)
                    
                # if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    post_sqft.append(sqft)
                    
                # if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    # therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    post_bedrooms.append(bedroom_count)
                    
                    # and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    post_sqft.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    post_bedrooms.append(bedroom_count)
                
                    sqft = np.nan
                    post_sqft.append(sqft)
                
            # if none of those conditions catch, make bedroom NaN 
            else:
                bedroom_count = np.nan
                post_bedrooms.append(bedroom_count)
                
                sqft = np.nan
                post_sqft.append(sqft)
                
    iterations += 1
      
    print("Page " + str(iterations) + " scraped successfully!")

    
print("\n")
print("Scrape complete!")

Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!
Page 21 scraped successfully!
Page 22 scraped successfully!


Scrape complete!


In [22]:
phx_apts = pd.DataFrame({'posted': post_times,
                       'neighborhood': post_neighborhoods,
                       'post title': post_titles,
                       'number bedrooms': post_bedrooms,
                        'sqft': post_sqft,
                        'URL': post_links,
                       'price': post_prices})

print(phx_apts.info())
phx_apts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640 entries, 0 to 2639
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   posted           2640 non-null   object 
 1   neighborhood     2640 non-null   object 
 2   post title       2640 non-null   object 
 3   number bedrooms  2464 non-null   object 
 4   sqft             1342 non-null   float64
 5   URL              2640 non-null   object 
 6   price            2640 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 144.5+ KB
None


Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2022-01-09 19:52,( north shore ),For rent,3,,https://boston.craigslist.org/nos/apa/d/lynn-f...,2500
1,2022-01-09 19:49,(Lynn north shore ),$1550/1 Bdrm - 900 sq. ft.- Ocean View Luxury ...,1,900.0,https://boston.craigslist.org/nos/apa/d/lynn-b...,1550
2,2022-01-09 19:41,(Brighton boston/cambridge/brookline ),Modern 2-bedroom first floor apartment at Harr...,2,1078.0,https://boston.craigslist.org/gbs/apa/d/bright...,2450
3,2022-01-09 18:37,(Brighton boston/cambridge/brookline ),Spacious apartment in Brighton/ $1150 (utiliti...,2,,https://boston.craigslist.org/gbs/apa/d/bright...,1050
4,2022-01-09 15:43,"(Salem, MA north shore )",Sunny 2 bed 2 Full Bath + Office,3,1410.0,https://boston.craigslist.org/nos/apa/d/salem-...,2650
...,...,...,...,...,...,...,...
2635,2022-01-08 06:01,(Woburn north shore ),"Linen Closets, Dens, Resident social activitie...",2,1403.0,https://boston.craigslist.org/nos/apa/d/woburn...,2815
2636,2022-01-08 05:59,(Boston boston/cambridge/brookline ),"Hardwood floors, Balcony, Stainless steel appl...",2,900.0,https://boston.craigslist.org/gbs/apa/d/boston...,5286
2637,2022-01-08 05:57,(Boston boston/cambridge/brookline ),"Wood burning fireplace, Granite countertops, C...",1,724.0,https://boston.craigslist.org/gbs/apa/d/boston...,4134
2638,2022-01-08 05:55,(Boston boston/cambridge/brookline ),"Furnished apartments available, Washer and dry...",2,1033.0,https://boston.craigslist.org/gbs/apa/d/boston...,5499


In [24]:
# to move all the scraped data to a CSV file

phx_apts.to_csv('boston_apts.csv', index=False)

In [26]:
import pymongo

In [27]:
client = pymongo.MongoClient("mongodb+srv://userid:password@cluster0.zadqe.mongodb.net/BostonHousing?retryWrites=true&w=majority")
db = client.test

In [28]:
# importing pymongo
from pymongo import MongoClient

In [29]:
# establing connection
try:
    connect = MongoClient()
    print("Connected successfully!!!")
except:
    print("Could not connect to MongoDB")

Connected successfully!!!


In [32]:
# creating or switching to demoCollection
collection = db.demoCollection

In [34]:
# first document
document1 = {
        "name":"John",
        "age":24,
        "location":"New York"
        }
#second document
document2 = {
        "name":"Sam",
        "age":21,
        "location":"Chicago"
        }

In [35]:
# Inserting both document one by one
collection.insert_one(document1)
collection.insert_one(document2)

<pymongo.results.InsertOneResult at 0x7f615047a200>

In [36]:
# Printing the data inserted
cursor = collection.find()
for record in cursor:
    print(record)

{'_id': ObjectId('61dbad44764e3f90ccba1687'), 'name': 'John', 'age': 24, 'location': 'New York'}
{'_id': ObjectId('61dbad44764e3f90ccba1688'), 'name': 'Sam', 'age': 21, 'location': 'Chicago'}


In [37]:
# connecting or switching to the database
db = connect.BostonHousing

In [40]:
collection.insert_many(phx_apts.to_dict())

TypeError: documents must be a non-empty list

In [None]:
mongoimport --db=importdata --collection=data --type=csv --headerline --file=boston_apts.csv

In [None]:
mongoimport --uri mongodb+srv://userid:password@cluster0.zadqe.mongodb.net/BostonHousing --collection CraigsList --type csv --headerline --file boston_apts.csv