In [1]:
# web scraping libraries
import requests as rq
from bs4 import BeautifulSoup as bs

# dataframe libraries
import pandas as pd
import numpy as np

# base url
url = 'https://newyork.craigslist.org/d/apts-housing-for-rent/search/apa'

In [2]:
# convert to soup object
page = rq.get(url)
soup = bs(page.content, 'html.parser')

# take a look
soup

<!DOCTYPE html>

<html class="no-js">
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="craigslist" property="og:site_name"/>
<meta content="preview" name="twitter:card"/>
<meta content="new york apartments / housing for rent - craigslist" property="og:title"/>
<meta content="new york apartments / housing for rent - craigslist" name="description"/>
<meta content="new york apartments / housing for rent - craigslist" property="og:description"/>
<meta content="https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa" property="og:url"/>
<title>new york apartments / housing for rent - craigslist</title>
<link href="https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa" rel="canonical"/>
<link href="https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa?s=120" rel="next"/>
<script id="ld_breadcrumb_data" type="application/l

In [3]:
# after inspecting the page, each listing url lives inside an
# <h3 class="result-heading"> element
h3s = soup.find_all('h3', class_='result-heading')
h3s

[<h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7342241942" href="https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html" id="postid_7342241942">1 BDR Apt Completely Renovated</a>
 </h3>,
 <h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7342238086" href="https://newyork.craigslist.org/mnh/apa/d/new-york-no-fee-month-free-newly/7342238086.html" id="postid_7342238086">No Fee, 1 Month Free Newly Renovated 1 Bed 1 Bath, Hudson Yards</a>
 </h3>,
 <h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7342237406" href="https://newyork.craigslist.org/mnh/apa/d/new-york-the-best-water-views-around/7342237406.html" id="postid_7342237406">The Best Water Views Around.</a>
 </h3>,
 <h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7342236992" href="https://newyork.craigslist.org/que/apa/d/sunnyside-fully-furnished-bdrm-apt/7342236992.html" id="postid_7342236992">Fully furnished 1 b

In [4]:
len(h3s)

120

In [5]:
# after inspecting the page, the urls exist within h3 tags
h3s = soup.find_all('h3', attrs={'class': 'result-heading'})

# look at the first instance
h3s[0]

<h3 class="result-heading">
<a class="result-title hdrlnk" data-id="7342241942" href="https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html" id="postid_7342241942">1 BDR Apt Completely Renovated</a>
</h3>

In [6]:
type(h3s[0])

bs4.element.Tag

In [7]:
h3s[0].find('a')

<a class="result-title hdrlnk" data-id="7342241942" href="https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html" id="postid_7342241942">1 BDR Apt Completely Renovated</a>

In [8]:
h3s[0].text

'\n1 BDR Apt Completely Renovated\n'

In [9]:
# links are in 'href', within 'a' object
h3s[0].find('a')['href']

'https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html'

In [10]:
# find links in each item in the list
links = [post.find('a')['href'] for post in h3s]
links

['https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-no-fee-month-free-newly/7342238086.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-the-best-water-views-around/7342237406.html',
 'https://newyork.craigslist.org/que/apa/d/sunnyside-fully-furnished-bdrm-apt/7342236992.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-remarkable-value-unbeatable/7342234322.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-no-feehuge-studio-stunning/7342229775.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-harlem-central-park-north-park/7342228234.html',
 'https://newyork.craigslist.org/mnh/apa/d/new-york-washington-heights-yeshiva/7342227911.html',
 'https://newyork.craigslist.org/stn/apa/d/staten-island-bedroom-apartmenr/7342223176.html',
 'https://newyork.craigslist.org/stn/apa/d/staten-island-bedroom-staten-island-gem/7342222299.html',
 'https://newyork.craigs

In [11]:
# confirm there are 120 links
len(links)

120

In [12]:
# grab first link to test and build a function
test_url = links[0]

In [13]:
test_url

'https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html'

In [14]:
# convert to soup object
page = rq.get(test_url)
soup = bs(page.content, 'html.parser')

# take a look
soup

<!DOCTYPE html>

<html class="no-js">
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="craigslist" property="og:site_name"/>
<meta content="preview" name="twitter:card"/>
<meta content="1 BDR Apt Completely Renovated - apts/housing for rent - apartment..." property="og:title"/>
<meta content="Completly Renovated 1 Bdr Apt. W/ new flooring throughout, kitchen and appliances w/ off St. Parking. Electricity included and close to I95, Rt 8 and Rt 15." name="description"/>
<meta content="Completly Renovated 1 Bdr Apt. W/ new flooring throughout, kitchen and appliances w/ off St. Parking. Electricity included and close to I95, Rt 8 and Rt 15." property="og:description"/>
<meta content="https://images.craigslist.org/00808_id7fSO69TkYz_0CI0lM_600x450.jpg" property="og:image"/>
<meta content="https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/734

In [15]:
# look at url, and open in separate tab to inspect
test_url

'https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html'

In [16]:
soup.find('span', attrs={'class':'price'})

<span class="price">$925</span>

In [17]:
soup.find('span', attrs={'class':'price'}).contents

['$925']

In [18]:
# price
int(soup.find('span', attrs={'class':'price'}).contents[0].replace('$', '').replace(',', ''))

925

In [19]:
float(soup.find('span', attrs={'class':'price'}).contents[0]\
                          .replace('$', '').replace(',', ''))

925.0

In [20]:
# location
soup.find('small').contents[0].strip()\
    .replace('(', '')\
    .replace(')', '')\
    .replace(' ', '_')\
    .lower()

'bridgeport'

In [21]:
soup.find('span', attrs={'class':'shared-line-bubble'}).text  #text is similar to contents

'1BR / 1Ba'

In [22]:
soup.find('span', attrs={'class':'shared-line-bubble'}).text.split('/')[0][:-3]

'1'

In [23]:
# bedroom
int(soup.find('span', attrs={'class':'shared-line-bubble'}).text.split('/')[0][:-3])

1

In [24]:
# bath
int(soup.find('span', attrs={'class':'shared-line-bubble'}).text.split('/')[1][:-2])

1

In [30]:
def info_grabber(url, timeout=30):
    """Scrape price, location, bedroom and bathroom counts from one listing page.

    The page at ``url`` is downloaded and parsed, and each field is extracted
    independently. A field that is missing or cannot be parsed is reported
    with a printed ``<Field> Error: <url>`` line and stored as ``np.nan``, so
    one malformed listing does not abort a scrape over many links.

    Parameters
    ----------
    url : str
        Address of the listing page to scrape.
    timeout : float, optional
        Seconds passed to ``rq.get`` as its ``timeout`` argument, so a stalled
        connection cannot hang the caller forever. Defaults to 30.

    Returns
    -------
    dict
        Keys ``'price'`` (float), ``'location'`` (str), ``'num_beds'`` (int),
        ``'num_baths'`` (int) and ``'url'`` (the input url). Any of the first
        four is ``np.nan`` when it could not be extracted.

    Notes
    -----
    Exceptions raised by ``rq.get`` itself (for example on a timeout or a
    connection failure) are not caught here and propagate to the caller.
    """
    # convert page to soup object
    page = rq.get(url, timeout=timeout)
    soup = bs(page.content, 'html.parser')

    # instantiate empty dictionary
    info = {}

    print(url)

    # scrape price
    try:
        price_text = soup.find('span', attrs={'class': 'price'}).contents[0]
        info['price'] = float(price_text.replace('$', '').replace(',', ''))
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: no price span (find returned None)
        # IndexError: price span is empty
        # TypeError: first child is a nested tag rather than a string
        # ValueError: text is not a number
        print(f'Price Error: {url}')   # track any posts without a price
        info['price'] = np.nan   # put NaN in its position

    # scrape location
    try:
        info['location'] = soup.find('small').contents[0].strip()\
                        .replace('(', '')\
                        .replace(')', '')\
                        .replace(' ', '_')\
                        .lower()
    except (AttributeError, IndexError, TypeError):
        # same failure modes as the price lookup; deliberately not a bare
        # except, so KeyboardInterrupt still stops a long scrape
        print(f'Location Error: {url}')   # track any posts without a location
        info['location'] = np.nan   # put NaN in its position

    # bedrooms and bathrooms share one bubble, e.g. '1BR / 1Ba'; look it up once
    bubble = soup.find('span', attrs={'class': 'shared-line-bubble'})
    bubble_parts = bubble.text.split('/') if bubble is not None else []

    # scrape bedrooms: '1BR ' -> '1'
    try:
        info['num_beds'] = int(bubble_parts[0][:-3])
    except (IndexError, ValueError):
        print(f'Bedroom Error: {url}')   # track any posts without a bedroom count
        info['num_beds'] = np.nan   # put NaN in its position

    # scrape bathrooms: ' 1Ba' -> ' 1'
    try:
        info['num_baths'] = int(bubble_parts[1][:-2])
    except (IndexError, ValueError):
        print(f'Bathroom Error: {url}')   # track any posts without a bathroom, generally has a value of "shared"
        info['num_baths'] = np.nan   # put NaN in its position

    # include Craigslist url, for reference
    info['url'] = url

    return info

In [31]:
%%time

# get info for each link/apartment
apts_info = [info_grabber(link) for link in links]

apts_info[0]   # look at first example

https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html
https://newyork.craigslist.org/mnh/apa/d/new-york-no-fee-month-free-newly/7342238086.html
https://newyork.craigslist.org/mnh/apa/d/new-york-the-best-water-views-around/7342237406.html
https://newyork.craigslist.org/que/apa/d/sunnyside-fully-furnished-bdrm-apt/7342236992.html
https://newyork.craigslist.org/mnh/apa/d/new-york-remarkable-value-unbeatable/7342234322.html
https://newyork.craigslist.org/mnh/apa/d/new-york-no-feehuge-studio-stunning/7342229775.html
https://newyork.craigslist.org/mnh/apa/d/new-york-harlem-central-park-north-park/7342228234.html
https://newyork.craigslist.org/mnh/apa/d/new-york-washington-heights-yeshiva/7342227911.html
https://newyork.craigslist.org/stn/apa/d/staten-island-bedroom-apartmenr/7342223176.html
https://newyork.craigslist.org/stn/apa/d/staten-island-bedroom-staten-island-gem/7342222299.html
https://newyork.craigslist.org/brk/apa/d/brooklyn-cheap-bedroom-

https://newyork.craigslist.org/que/apa/d/bronx-comuters-paradise-outside-patio/7342104397.html
https://newyork.craigslist.org/brx/apa/d/bronx-se-renta-apartamento-de/7342101926.html
Location Error: https://newyork.craigslist.org/brx/apa/d/bronx-se-renta-apartamento-de/7342101926.html
https://newyork.craigslist.org/que/apa/d/astoria-no-fee-2mo-free-brand-new-bed/7342094752.html
https://newyork.craigslist.org/mnh/apa/d/new-york-sweet-1br-duplex-with-lots-of/7342094665.html
https://newyork.craigslist.org/mnh/apa/d/new-york-alphabet-city-3brs-on-lovely/7342094243.html
https://newyork.craigslist.org/mnh/apa/d/new-york-large-confortable-studio-no/7342093982.html
https://newyork.craigslist.org/que/apa/d/astoria-spacious-bed-utilities-included/7342093995.html
https://newyork.craigslist.org/mnh/apa/d/new-york-sweet-2br-with-private-patio/7342093838.html
https://newyork.craigslist.org/mnh/apa/d/new-york-sunny-2br-on-lovely-11th-st/7342093458.html
https://newyork.craigslist.org/mnh/apa/d/new-york

{'price': 925.0,
 'location': 'bridgeport',
 'num_beds': 1,
 'num_baths': 1,
 'url': 'https://newyork.craigslist.org/fct/apa/d/bridgeport-bdr-apt-completely-renovated/7342241942.html'}

In [32]:
# convert to dataframe
df = pd.DataFrame(apts_info)

df.head()   # take a look

Unnamed: 0,price,location,num_beds,num_baths,url
0,925.0,bridgeport,1,1.0,https://newyork.craigslist.org/fct/apa/d/bridg...
1,2990.0,midtown,1,1.0,https://newyork.craigslist.org/mnh/apa/d/new-y...
2,3200.0,financial_district,2,1.0,https://newyork.craigslist.org/mnh/apa/d/new-y...
3,1980.0,sunnyside,1,1.0,https://newyork.craigslist.org/que/apa/d/sunny...
4,2599.0,financial_district,1,1.0,https://newyork.craigslist.org/mnh/apa/d/new-y...


In [33]:
# if you need to further inspect a listing
df.loc[2, 'url']

'https://newyork.craigslist.org/mnh/apa/d/new-york-the-best-water-views-around/7342237406.html'

In [34]:
# begin data inspection
df.isna().sum()

price        0
location     3
num_beds     0
num_baths    2
url          0
dtype: int64