In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    """
    result = requests.get(url)
    content = result.content
    return BeautifulSoup(content, features='html.parser')


def get_room_classes(soup_page):
    """
    returns all the listings that can be found on the page (soup object) in a list
    """
    rooms = soup_page.findAll('div', {'class':'_8ssblpx'})
    result = []
    for room in rooms:
        result.append(room)
    return result


In [3]:
las_vegas_link = 'https://www.airbnb.com/s/all?refinement_paths%5B%5D=%2Fhomes&query=las+vegas'

In [4]:
# get_page(las_vegas_link)
rc = get_room_classes(get_page(las_vegas_link))
len(rc), type(rc)#, rc[0]

(20, list)

In [5]:
def get_listing_link(listing):
    """
    returns the URL link of given listing
    """
    return 'https://airbnb.com' + listing.find('a')['href']

get_listing_link(rc[0])

'https://airbnb.com/rooms/47197033?previous_page_section_name=1000&federated_search_id=2404aeba-cfe4-4f44-ad64-0857449875d3'

In [6]:
def get_listing_title(listing):
    """
    returns the title of given listing
    """
    return listing.find('meta')['content']

get_listing_title(rc[0])

'Condo Next to Desert Shores - null - Las Vegas'

In [7]:
get_listing_title(rc[1])

'Great Room in a House close to the strip - null - Las Vegas'

In [8]:
def get_top_row(listing):
    """
    returns the top row of given listing's info
    """
    return listing.find('div', {'class':'_1tanv1h'}).text  # _167gordg

get_top_row(rc[0])

'Entire condominium in Las Vegas'

In [9]:
get_top_row(rc[1])

'Private room in Las Vegas'

In [10]:
def get_room_info(listing):
    """
    returns room info of listing 
    """
    return listing.find('div', {'class', '_kqh46o'}).text

get_room_info(rc[0])

'2 guests · 1 bedroom · 1 bed · 1 bath'

In [11]:
get_room_info(rc[1])

'2 guests · 1 bedroom · 1 bed · 1 shared bath'

In [12]:
def get_room_price(listing):
    """
    returns the nightly rate (price) of given listing
    """
    price_text = listing.find('div', {'class':'_ls0e43'}).text
    price = price_text.split('Price:')
    return price[1]

get_room_price(rc[0])

'$70 / night'

In [13]:
get_room_price(rc[1])

'$21 / night'

In [14]:
def get_basic_facilities(listing):
    ''' Returns the basic facilities'''
    try:
        output = listing.findAll("div", {"class":"_kqh46o"})[1].text.replace(" ","") #Speeds up cleaning
    except:
        output = []
    return output

get_basic_facilities(rc[0])

'Wifi·Airconditioning·Pool·Kitchen'

In [15]:
get_basic_facilities(rc[1])

'Wifi·Selfcheck-in·Airconditioning·Petsallowed'

In [16]:
def get_room_rating(listing):
    """
    returns star rating of given listing
    """
    try:
        return listing.find('div', {'class':'_vaj62s'}).text
    except:
        return listing.find('div', {'class':'_vaj62s'})

get_room_rating(rc[0])

In [17]:
print(get_room_rating(rc[0]))

None


In [18]:
get_room_rating(rc[1])

'Rating 4.80 out of 5;4.805 reviews\xa0(5)'

In [31]:
get_room_rating(rc[16])

'Rating 4.56 out of 5;4.5652 reviews\xa0(52)'

In [30]:
rc[6]

<div class="_8ssblpx"><div class="_gig1e7"><div itemprop="itemListElement" itemscope="" itemtype="http://schema.org/ListItem"><meta content="Cozy 3rd floor space. Slightly roomy. - null - Las Vegas" itemprop="name"/><meta content="7" itemprop="position"/><meta content="undefined/rooms/47221579?previous_page_section_name=1000" itemprop="url"/><div><div><div class="_8s3ctt"><a aria-label="Cozy 3rd floor space. Slightly roomy." class="_gjfol0" data-check-info-section="true" href="/rooms/47221579?previous_page_section_name=1000&amp;federated_search_id=2404aeba-cfe4-4f44-ad64-0857449875d3" rel="noopener noreferrer" target="listing_47221579"></a><div class="_1nz9l7j"><div class="_2n7voam"><div class="_gjw2an" style="padding-top:66.6667%;background:#484848"><div class="_1szwzht"><div aria-busy="false" aria-label="Cozy 3rd floor space. Slightly roomy." class="_hxt6u1e" role="img" style="padding-top:66.6667%"><div class="_4626ulj"><img alt="Cozy 3rd floor space. Slightly roomy." aria-hidden="tr

In [19]:
print(get_room_rating(rc[1]))

Rating 4.80 out of 5;4.805 reviews (5)


In [20]:
def get_n_reviews(listing):
    '''
    Returns the number of reviews
    '''
    try:  # Not all listings have reviews // extraction failed
        output = listing.findAll("span", {"class":"_krjbj"})[1].text
    except:
        output = None   # Indicate that the extraction failed -> can indicate no reviews or a mistake in scraping
    return output


get_n_reviews(rc[0])

In [21]:
get_n_reviews(rc[1])

'5 reviews'

In [22]:
import pandas as pd

def record_dataset(listings, file_path='output.parquet'):
    """
    take scraped room classes and record their information to parquet
    """
    data = []
    for l in listings:
        a = get_listing_link(l)
        b = get_listing_title(l)
        c = get_top_row(l)
        d = get_room_info(l)
        e = get_room_price(l)
        f = get_basic_facilities(l)
        g = get_room_rating(l)
        h = get_n_reviews(l)
        out = [a, b, c, d, e, f, g, h]
        data.append(out)
    names = [l for l in 'abcdefgh']
    df = pd.DataFrame(data, columns=names)
    df.to_parquet(file_path, index=False)
    return df


record_dataset(rc)

Unnamed: 0,a,b,c,d,e,f,g,h
0,https://airbnb.com/rooms/47197033?previous_pag...,Condo Next to Desert Shores - null - Las Vegas,Entire condominium in Las Vegas,2 guests · 1 bedroom · 1 bed · 1 bath,$70 / night,Wifi·Airconditioning·Pool·Kitchen,,
1,https://airbnb.com/rooms/45827691?previous_pag...,Great Room in a House close to the strip - nul...,Private room in Las Vegas,2 guests · 1 bedroom · 1 bed · 1 shared bath,$21 / night,Wifi·Selfcheck-in·Airconditioning·Petsallowed,Rating 4.80 out of 5;4.805 reviews (5),5 reviews
2,https://airbnb.com/rooms/44416011?previous_pag...,Eggeman 8 - null - Las Vegas,Entire condominium in Las Vegas,4 guests · 2 bedrooms · 3 beds · 1.5 baths,$52 / night,Wifi·Airconditioning·Petsallowed·Pool,Rating 4.92 out of 5;4.9237 reviews (37),37 reviews
3,https://airbnb.com/rooms/21055421?previous_pag...,Private suite with private bathroom. - null - ...,Private room in Las Vegas,1 guest · 1 bedroom · 1 bed · 1 private bath,$23 / night,Wifi·Pool·Kitchen·Freeparking,,
4,https://airbnb.com/rooms/47237465?previous_pag...,9min from the strip. 1min from the shopping ce...,Private room in Las Vegas,1 guest · 1 bedroom · 1 bed · 1.5 shared baths,$24 / night,Gym·Wifi·Airconditioning·Kitchen,,
5,https://airbnb.com/rooms/46263193?previous_pag...,"Master bedroom in a new, CLEAN, safe home! - n...",Private room in Las Vegas,1 guest · 1 bedroom · 1 bed · 1.5 baths,$29 / night,Wifi·Selfcheck-in·Airconditioning·Kitchen,,
6,https://airbnb.com/rooms/47221579?previous_pag...,Cozy 3rd floor space. Slightly roomy. - null -...,Private room in Las Vegas,2 guests · 1 bedroom · 0 beds · 1.5 shared baths,$21 / night,Wifi·Airconditioning·Petsallowed·Kitchen,,
7,https://airbnb.com/rooms/37289480?previous_pag...,Private king master bedroom near center of Veg...,Private room in Las Vegas,1 guest · 1 bedroom · 1 bed · 2 shared baths,$16 / night,Wifi·Airconditioning·Kitchen·Freeparking,,
8,https://airbnb.com/rooms/41602679?previous_pag...,Hospitality House 3 - null - Las Vegas,Private room in Las Vegas,1 guest · 1 bedroom · 1 bed · 1 shared bath,$24 / night,Wifi·Airconditioning·Indoorfireplace·Kitchen,Rating 4.54 out of 5;4.5413 reviews (13),13 reviews
9,https://airbnb.com/rooms/45439619?previous_pag...,"Tahiti Village Bora, Bora in Las Vegas 1 BR Su...",Entire condominium in Las Vegas,4 guests · 1 bedroom · 2 beds · 1 bath,$60 / night,Wifi·Selfcheck-in·Airconditioning·Pool,Rating 5.0 out of 5;5.04 reviews (4),4 reviews


In [23]:
link2 = 'https://www.airbnb.com/s/all?refinement_paths%5B%5D=%2Fhomes&query=las+vegas' 

record_dataset(get_room_classes(get_page(link2)), file_path='output2.csv')

ArrowTypeError: ("Expected bytes, got a 'list' object", 'Conversion failed for column f with type object')

In [None]:
pd.read_parquet('output.parquet') == pd.read_parquet('output2.parquet')

In [None]:
pd.read_parquet('output.parquet')['b'] == pd.read_parquet('output2.parquet')['b']

# Notes
### get_listing_link()
#### Link formatting
- Base: https://www.airbnb.com/rooms/45827691 (unlikely in wild)
- Scraped: https://www.airbnb.com/rooms/45827691?source_impression_id=p3_1608949724_fGn6tZpchR09VS7I
- Clicked: https://www.airbnb.com/rooms/45827691?source_impression_id=p3_1608949764_SigIFnGwmorU%2Bo%2BI

### get_listing_title()
#### Output
- Where does null come from?
- Does city (location) come from page title?
- `'Great Room in a House close to the strip - null - Las Vegas'`

### get_basic_facilities()
#### Formatting
- `·` separator
- `'Wifi·Airconditioning·Pool·Kitchen'`

### Search / Base Link
- First few listings of these links are the same
- 2nd link returns out of order (based on GUI) 
- Overlapping results returned in different order
- 2nd link doesn't return all displayed results (maybe what it's returning are results from a few days ago? no ref.?)
  - https://www.airbnb.com/s/all?refinement_paths%5B%5D=%2Fhomes&query=las+vegas
  - https://www.airbnb.com/s/las-vegas--nv/homes?tab_id=home_tab&refinement_paths[]=%2Fhomes&source=structured_search_input_header&search_type=search_query