# Capturing SF rental information from Craigslist

#### Using beautifulsoup to parse html. 

In [1]:
#importing get to send get requests to websites using python
from requests import get
from bs4 import BeautifulSoup

## Understanding the HTML structure of Apartment/housing listings
#### The structure for Apartment/housing filter and rooms/shared are different and hence need to parse them separately

In [2]:
#get the craigslist page for apartments and housing in SF
#used filter for postings only with images and to bundle duplicates
response = get('https://sfbay.craigslist.org/search/sfc/apa?hasPic=1&bundleDuplicates=1&availabilityMode=0&sale_date=all+dates')
html_soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
# looking at the html structure to get an idea of where the required items have been stored
html_soup.find_all

<bound method Tag.find_all of ﻿<!DOCTYPE html>

<html class="no-js"><head>
<title>SF bay area apts/housing for rent  - craigslist</title>
<meta content="SF bay area apts/housing for rent  - craigslist" name="description"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible">
<link href="https://sfbay.craigslist.org/search/sfc/apa" rel="canonical"/>
<link href="https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&amp;bundleDuplicates=1&amp;format=rss&amp;hasPic=1" rel="alternate" title="RSS feed for craigslist | SF bay area apts/housing for rent  - craigslist" type="application/rss+xml"/>
<link href="https://sfbay.craigslist.org/search/sfc/apa?s=120&amp;availabilityMode=0&amp;bundleDuplicates=1&amp;hasPic=1" rel="next"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="//www.craigslist.org/styles/cl.css?v=281764e2707bd58e05233e4b7df36df8" media="all" rel="stylesheet" type="text/css"/>
<link href="//www.craigslist.org/styles/search.css?v=2d97269

In [4]:
# the urls related to the listings is in class result 
listings = html_soup.find_all('li', class_='result-row')
listings[5]

<li class="result-row" data-pid="6851306867">
<a class="result-image gallery" data-ids="1:00f0f_5TEsI85FMlR,1:01010_eeaRDEokwLu,1:00o0o_fX3CCJYf5S1,1:00808_1bkHa8byvTj,1:00000_e8qvm6MvDeb,1:00l0l_7lvU2yfKqWo,1:00n0n_7REXcZa3vVw,1:00909_29R5pureE9V,1:00t0t_c8FtJFEDNfU,1:00H0H_iLy3DgQuiDC,1:00C0C_cvW6jbj5DmJ,1:00w0w_adzIfCHLMFI,1:00i0i_l0ZJbc4tMMe,1:00l0l_ggO0Rm8ytLE,1:00000_6PAnLcIQ2Jo,1:00G0G_8OjTIgAz9XU,1:01515_a2niZ0yNG9L,1:00A0A_8aPxQDG8Vec,1:00909_lhFZI1FkIUE,1:00g0g_lKzfSbsey4i,1:00o0o_jPOtxLfzrel,1:00t0t_c8FtJFEDNfU,1:00m0m_ce9h3QU8gU5,1:00V0V_iZxXPTFHQXj" href="https://sfbay.craigslist.org/sfc/apa/d/san-francisco-full-of-lights-large-3br/6851306867.html">
<span class="result-price">$3300</span>
</a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2019-03-28 22:03" title="Thu 28 Mar 10:03:19 PM">Mar 28</time>
<a class="result-title hdrlnk" data-id="6851306867

In [5]:
# We will first try to extract data out of the first link and see if we could extract everything
# Based on that we can later set a loop

#Listing title
firstlisting_title = listings[0].find('a', class_='result-title hdrlnk')['href']
print(firstlisting_title)

#Number of bedrooms
firstlisting_br = listings[0].find('span', class_='housing').text.split()[0]
print(firstlisting_br)

#Sqft
firstlisting_sqft = listings[2].find('span', class_='housing').text.split()[2]
print(firstlisting_sqft)
# need to write an exception here since no all the listings have sqft infomation listed

#Neighborhood
firstlisting_neighborhood = listings[0].find('span', class_='result-hood').text.strip()
print(firstlisting_neighborhood)

#rental price
firstlisting_price = listings[0].find('span', class_='result-price').text
print(firstlisting_price)

#Listing date
firstlisting_date = listings[0].find('time', class_='result-date')['datetime']
print(firstlisting_date)

https://sfbay.craigslist.org/sfc/apa/d/san-luis-obispo-three-bath-room-four-rm/6852987212.html
3br
989ft2
(inner sunset / UCSF)
$1066
2019-03-28 22:40


## Building the Apartments/housing dataset

In [6]:
# Its time to loop it through!
import numpy as np
import pandas as pd

from random import randint
from time import sleep 
import warnings
warnings.filterwarnings('ignore') 

In [7]:
# Find the total number of pages
listings_per_page = int(html_soup.find('span', class_='rangeTo').text)
total_listings = int(html_soup.find('span', class_='totalcount').text)
number_of_pages = total_listings/listings_per_page

In [38]:
# create empty lists for the info we want
listings_title = []
listings_br = []
listings_sqft = []
listings_neighborhood = []
listings_price = []
listings_date = []
listings_type = []

#### Had a look on how the page 1 and the following pages are formatted in craigslist
#### 1st Page: https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1
#### 2nd page: https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1&s=120
#### 3rd page: https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1&s=240

In [39]:
# to loop through all the pages
for page in range(0,total_listings+1,listings_per_page):
    # requesting the web pages
    request = get('https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1'
                 +'&s=' + str(page))
    html_soup = BeautifulSoup(request.text, 'html.parser')
    
    # to avoid sending too many requests, set a sleep to delay a few secs
    sleep(randint(1,3))
    
    #listings in each page
    listings = html_soup.find_all('li', class_= 'result-row')
    
    #extracting each listing in the page
    for listing in listings:
        if listing.find('span', class_='result-hood' ) is not None:
            
            #Title
            title = listing.find('a', class_='result-title hdrlnk')['href']
            listings_title.append(title)
            
            #some postings do not have br info and some do not have sqft info
            #need to carefully extract them
            if listing.find('span', class_='housing') is not None:
                
                if 'ft2' in listing.find('span', class_='housing').text.split()[0]:
                   
                    # br is nan
                    br = np.nan
                    listings_br.append(br)
                    
                    #1st element is sqft
                    sqft = listing.find('span', class_='housing').text.split()[0]
                    listings_sqft.append(sqft)
                    
                elif len(listing.find('span', class_='housing').text.split()) > 2:
                   
                    # 0th element is br
                    br = listing.find('span', class_='housing').text.split()[0]
                    listings_br.append(br)
                    
                    #3rd element is sqft
                    sqft = listing.find('span', class_='housing').text.split()[2]
                    listings_sqft.append(sqft)
                    
                #if the number of elements in housing is only 2
                elif len(listing.find('span', class_='housing').text.split()) == 2:
                    
                    # 0th element is br
                    br = listing.find('span', class_='housing').text.split()[0]
                    listings_br.append(br)
                    
                    #3rd element doesnt exist
                    sqft = np.nan
                    listings_sqft.append(sqft)
                    
                #if there is no information in housing
                else:
                    
                    # set br to nan
                    br = np.nan
                    listings_br.append(br)
                    
                    #set sqft to nan
                    sqft = np.nan
                    listings_sqft.append(sqft) 
            
            #if the listing doesnt have the class housing
            else:
                # set br to nan
                    br = np.nan
                    listings_br.append(br)
                    
                    #set sqft to nan
                    sqft = np.nan
                    listings_sqft.append(sqft)  
            
            #neighborhood
            neighborhood = listing.find('span', class_='result-hood').text.strip()
            listings_neighborhood.append(neighborhood)  
            
            #price
            price = listing.find('span', class_='result-price').text
            listings_price.append(price)
            
            #date
            date = listing.find('time', class_='result-date')['datetime']
            listings_date.append(date)
        
print('Sucessfully completed data extraction from craigslist!')

Sucessfully completed data extraction from craigslist!


In [41]:
len(listings_price)

3559

In [42]:
Apartment_listings = pd.DataFrame({'title':listings_title, 
                                   'bedrooms': listings_br,
                                   'sqft': listings_sqft,
                                   'neigborhood': listings_neighborhood,
                                   'price': listings_price,
                                   'date': listings_date,
                                   'listings_type': 'Apartment/housing'})

In [47]:
Apartments = Apartment_listings.drop_duplicates().reset_index(drop=True)
Apartments.tail()

Unnamed: 0,title,bedrooms,sqft,neigborhood,price,date,listings_type
2769,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,2br,,(mission district),$4999,2019-03-26 08:50,Apartment/housing
2770,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,,,(nob hill),$1899,2019-03-26 08:50,Apartment/housing
2771,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,,,(marina / cow hollow),$3300,2019-03-26 08:45,Apartment/housing
2772,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3br,2225ft2,(laurel hts / presidio),$7450,2019-03-26 08:39,Apartment/housing
2773,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,,,(marina / cow hollow),$3100,2019-03-26 08:39,Apartment/housing


## Understanding the structure of HTML for rooms/shared

In [27]:
#get the craigslist page for rooms/shared in SF
#used filter for postings only with images and to bundle duplicates
request_rooms = get('https://sfbay.craigslist.org/search/sfc/roo?bundleDuplicates=1&hasPic=1')
html_soup = BeautifulSoup(request_rooms.text, 'html.parser')

In [33]:
# the urls related to the listings is in class result 
listings_rooms = html_soup.find_all('li', class_='result-row')
listings_rooms[2]

<li class="result-row" data-pid="6852966327">
<a class="result-image gallery" data-ids="1:00H0H_5WYXY6ZXGKI,1:01717_akWC9aOryvc,1:00V0V_fixwqbJcnWI,1:00X0X_eSDtAUrugyG,1:00o0o_gpaViZcbZmJ" href="https://sfbay.craigslist.org/sfc/roo/d/san-francisco-seeking-social-roommates/6852966327.html">
<span class="result-price">$1650</span>
</a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2019-03-28 22:04" title="Thu 28 Mar 10:04:05 PM">Mar 28</time>
<a class="result-title hdrlnk" data-id="6852966327" href="https://sfbay.craigslist.org/sfc/roo/d/san-francisco-seeking-social-roommates/6852966327.html">Seeking social roommates! Room to rent in SoMa!</a>
<span class="result-meta">
<span class="result-price">$1650</span>
<span class="result-hood"> (SOMA / south beach)</span>
<span class="result-tags">
                    pic
                    <span class="maptag" data-pid="6

## Building the rooms/shared listings dataset

In [29]:
# Find the total number of pages
listings_rooms_per_page = int(html_soup.find('span', class_='rangeTo').text)
total_listings_rooms = int(html_soup.find('span', class_='totalcount').text)

In [34]:
listings_rooms_title = []
listings_rooms_neighborhood = []
listings_rooms_price = []
listings_rooms_date = []
listings_rooms_type = []

In [60]:
title = listings_rooms[0].find('a', class_='result-title hdrlnk')['href'].text
title

AttributeError: 'str' object has no attribute 'text'

In [46]:
# to loop through all the pages
for page in range(0,total_listings_rooms+1,listings_rooms_per_page):
    # requesting the web pages
    request = get('https://sfbay.craigslist.org/search/sfc/roo?bundleDuplicates=1&hasPic=1'
                 +'&s=' + str(page))
    html_soup = BeautifulSoup(request.text, 'html.parser')
    # to avoid sending too many requests, set a sleep to delay a few secs
    sleep(randint(1,3))
    
    #listings in each page
    listings = html_soup.find_all('li', class_= 'result-row')
    
    #extracting each listing in the page
    for listing in listings:
        if listing.find('span', class_='result-hood' ) is not None:
            
            #Title
            title = listing.find('a', class_='result-title hdrlnk')['href']
            listings_rooms_title.append(title)
            
            #neighborhood
            neighborhood = listing.find('span', class_='result-hood').text.strip()
            listings_rooms_neighborhood.append(neighborhood)  
            
            #price
            price = listing.find('span', class_='result-price').text
            listings_rooms_price.append(price)
            
            #date
            date = listing.find('time', class_='result-date')['datetime']
            listings_rooms_date.append(date)
        
print('Sucessfully completed data extraction from craigslist!')

Sucessfully completed data extraction from craigslist!


In [50]:
Rooms_listings = pd.DataFrame({'title':listings_rooms_title, 
                               'neigborhood': listings_rooms_neighborhood,
                               'price': listings_rooms_price,
                               'date': listings_rooms_date,
                               'listings_rooms_type': 'Rooms/shared'})

In [56]:
Rooms = Rooms_listings.drop_duplicates().reset_index(drop=True)
Rooms.head()

Unnamed: 0,title,neigborhood,price,date,listings_rooms_type
0,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,(nob hill),$1900,2019-03-29 00:01,Rooms/shared
1,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,(twin peaks / diamond hts),$1200,2019-03-28 23:58,Rooms/shared
2,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,(Windsor at Dogpatch),$1089,2019-03-28 23:42,Rooms/shared
3,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,(Financial District),$1253,2019-03-28 23:42,Rooms/shared
4,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,(Financial District),$1257,2019-03-28 23:46,Rooms/shared


## Data Cleaning

In [69]:
# All the columns in Rooms/shared and apartments/houses are objects. 

#Need to convert price to numbers
Rooms.price = Rooms.price.apply(lambda x: x.replace('$','')).astype('int')
Apartments.price = Apartments.price.apply(lambda x: x.replace('$','')).astype('int')

In [128]:
#Need to strip () and take care of '/',',' in the column

Rooms.neigborhood = Rooms.neigborhood.apply(lambda x: x.replace('(','')).apply(lambda x: x.replace(')','')).\
    apply(lambda x: x.split('/')[0]).apply(lambda x: x.split(',')[0]).apply(lambda x: x.split(',')[0]).\
    apply(lambda x: x.strip()) + ', San Francisco, CA'
Apartments.neigborhood = Apartments.neigborhood.apply(lambda x: x.replace('(','')).apply(lambda x: x.replace(')','')).\
    apply(lambda x: x.split('/')[0]).apply(lambda x: x.split(',')[0]).apply(lambda x: x.split(',')[0]).\
    apply(lambda x: x.strip()) + ', San Francisco, CA'

In [138]:
# For extracting latitude and longitude of the neighborhood
from geopy.geocoders import Nominatim
from geopy.distance import geodesic


geolocator = Nominatim(timeout=5)
latitude = []
longitude = []
for i in range (Rooms.shape[0]):
    location = geolocator.geocode(Rooms.neigborhood[i])
    if location == None:
        latitude.append(np.nan)
        longitude.append(np.nan)
    else:
        latitude.append(location.latitude)
        longitude.append(location.longitude)     

Rooms['latitude'] = latitude
Rooms['longitude'] = longitude

In [159]:
from geopy.exc import GeocoderTimedOut
geolocator = Nominatim()
latitude1 = []
longitude1 = []
for i in range (Apartments.shape[0]):
    try:
        location = geolocator.geocode(Apartments.neigborhood[i], timeout=5)
    except GeocoderTimedOut:
        location = geolocator.geocode(Apartments.neigborhood[i], timeout=5)
       
    if location == None:
        latitude1.append(np.nan)
        longitude1.append(np.nan)
    else:
        latitude1.append(location.latitude)
        longitude1.append(location.longitude)

Apartments['latitude'] = latitude1
Apartments['longitude'] = longitude1

In [160]:
len(longitude1)

2774

In [161]:
Rooms.info()
Apartments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361 entries, 0 to 1360
Data columns (total 7 columns):
title                  1361 non-null object
neigborhood            1361 non-null object
price                  1361 non-null int32
date                   1361 non-null object
listings_rooms_type    1361 non-null object
latitude               1262 non-null float64
longitude              1262 non-null float64
dtypes: float64(2), int32(1), object(4)
memory usage: 69.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Data columns (total 9 columns):
title            2774 non-null object
bedrooms         2191 non-null object
sqft             1333 non-null object
neigborhood      2774 non-null object
price            2774 non-null int32
date             2774 non-null object
listings_type    2774 non-null object
latitude         2442 non-null float64
longitude        2442 non-null float64
dtypes: float64(2), int32(1), object(6)
memory usage: 184.3+ KB
