# Airbnb Web Scraping

## Imports

In [5]:
import time

import bs4
import pandas as pd
import requests

## Get HTML Code

In [6]:
# Search-results URL used as the scraping entry point; its query string names
# Bali, Indonesia with check-in 2021-12-29 and check-out 2022-01-03.
url = "https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Bali%2C%20Indonesia&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&checkin=2021-12-29&checkout=2022-01-03&source=structured_search_input_header&search_type=autocomplete_click"
# Prefix that get_next_page puts in front of the href of the next-page link.
base_url = "https://airbnb.com"

In [7]:
def get_page(url, max_attempts=10, retry_delay=1.0, timeout=30):
    """
    Download a page and parse it, retrying until listing cards are present.

    A response is only accepted when the parsed HTML contains at least one
    element with class '_12oal24' (the class this notebook uses to select
    listing cards). Otherwise the request is repeated, up to `max_attempts`
    times, instead of looping forever.

    Parameters
    ----------
    url : str
        Address of the page to download.
    max_attempts : int, optional
        Maximum number of requests before giving up.
    retry_delay : float, optional
        Seconds to wait between two attempts.
    timeout : float, optional
        Timeout in seconds handed to requests.get, so a stalled connection
        cannot block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed HTML of the first response that contains listing cards.

    Raises
    ------
    RuntimeError
        If none of the attempts returned a page containing listing cards.
    """
    for attempt in range(1, max_attempts + 1):
        response = requests.get(url, timeout=timeout)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if soup.find_all(class_='_12oal24'):
            return soup
        # Pause before asking again, but not after the final attempt.
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        "No listing elements found at {!r} after {} attempts".format(url, max_attempts)
    )

In [8]:
# Fetch the search-results page once and display the parsed HTML as the cell output.
get_page(url)

KeyboardInterrupt: 

## Retrieve Data

### Preparation

In [7]:
# Parsed first results page; reused below by get_data and get_data_append_2.
soup = get_page(url)

In [8]:
# Every element carrying the listing-card class on the first page.
all_codetags = soup.find_all(class_='_12oal24')
# Sample card used to try out the extraction helpers below.
first_listing = all_codetags[0]

### Data Extraction

In [14]:
def get_listing_title(tag_object):
    """
    Return the title text of a listing card.

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find() method.

    Returns
    -------
    str or None
        Text of the first element with class '_1whrsux9', or None when the
        card has no such element.
    """
    try:
        return tag_object.find(class_='_1whrsux9').text
    except AttributeError:
        # find() gave None (no title element), so there is no .text to read.
        # Only this case is handled; interrupts and other errors propagate.
        return None

In [15]:
# Spot-check the title extractor on the sample card.
get_listing_title(first_listing)

'❣️Romantic Staycation-PrivateSunset Pool@megananda'

In [16]:
def get_listing_subtitle(tag_object):
    """
    Return the subtitle text of a listing card.

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find() method.

    Returns
    -------
    str or None
        Text of the first element with class '_1tanv1h', or None when the
        card has no such element.
    """
    try:
        return tag_object.find(class_='_1tanv1h').text
    except AttributeError:
        # find() gave None (no subtitle element). Anything else, including
        # KeyboardInterrupt, is deliberately not swallowed.
        return None

In [17]:
# Spot-check the subtitle extractor on the sample card.
get_listing_subtitle(first_listing)

'Entire villa in Ubud'

In [18]:
def get_listing_info(tag_object):
    """
    Return the text of the first '_3c0zz1' div of a listing card.

    On the sample card this is the capacity line
    (e.g. '2 guests · 1 bedroom · 1 bed · 1 bath').

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find_all() method.

    Returns
    -------
    str or None
        Text of the first matching div, or None when there is none.
    """
    try:
        detail_divs = tag_object.find_all("div", {"class": '_3c0zz1'})
        return detail_divs[0].text
    except (AttributeError, IndexError):
        # IndexError: no matching div; AttributeError: nothing to search in.
        # Other exceptions (including KeyboardInterrupt) propagate.
        return None

In [19]:
# Spot-check the info extractor on the sample card.
get_listing_info(first_listing)

'2 guests · 1 bedroom · 1 bed · 1 bath'

In [20]:
def get_listing_ammenities(tag_object):
    """
    Return the text of the second '_3c0zz1' div of a listing card.

    On the sample card this is the amenities line
    (e.g. 'Pool · Wifi · Air conditioning').

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find_all() method.

    Returns
    -------
    str or None
        Text of the second matching div, or None when the card has fewer
        than two such divs.
    """
    try:
        detail_divs = tag_object.find_all("div", {"class": '_3c0zz1'})
        return detail_divs[1].text
    except (AttributeError, IndexError):
        # IndexError: fewer than two matching divs; AttributeError: nothing to
        # search in. Other exceptions (including KeyboardInterrupt) propagate.
        return None

In [21]:
# Spot-check the amenities extractor on the sample card.
get_listing_ammenities(first_listing)

'Pool · Wifi · Air conditioning'

In [22]:
def get_listing_rating(tag_object):
    """
    Return the rating of a listing card as a float.

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find() method.

    Returns
    -------
    float or None
        Value parsed from the first element with class '_10fy1f8', or None
        when the element is missing or its text is not a number.
    """
    try:
        rating_text = tag_object.find(class_='_10fy1f8').text
        return float(rating_text)
    except (AttributeError, ValueError):
        # AttributeError: no rating element; ValueError: text is not numeric.
        # Other exceptions (including KeyboardInterrupt) propagate.
        return None

In [23]:
# Spot-check the rating extractor on the sample card.
get_listing_rating(first_listing)

4.94

In [24]:
def get_listing_reviews(tag_object):
    """
    Return the number of reviews of a listing card as an int.

    The count is read from the first element with class '_a7a5sx': the first
    whitespace-separated token is taken and whatever follows its opening
    parenthesis is parsed as an integer.

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find() method.

    Returns
    -------
    int or None
        Parsed review count, or None when the element is missing or its text
        does not have the expected shape.
    """
    try:
        reviews_text = tag_object.find(class_='_a7a5sx').text
        first_token = reviews_text.split()[0]
        count_text = first_token.split('(')[1]
        return int(count_text)
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no reviews element; IndexError: empty text or no
        # '(' in the first token; ValueError: the extracted part is not an
        # integer. Other exceptions (including KeyboardInterrupt) propagate.
        return None

In [25]:
# Spot-check the review-count extractor on the sample card.
get_listing_reviews(first_listing)

243

In [26]:
def get_listing_price_per_night(tag_object):
    """
    Return the nightly price of a listing card as a float.

    All digit characters of the first <span class="_tyxjp1"> are concatenated
    and converted, so currency symbols and thousands separators are dropped.
    Note that a decimal point would be dropped as well ('92.50' -> 9250.0).

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find() method.

    Returns
    -------
    float or None
        Parsed price, or None when the span is missing or holds no digits.
    """
    price = tag_object.find("span", {"class": "_tyxjp1"})
    if price is None:
        return None

    digits = "".join(filter(str.isdigit, price.text))
    # float("") would raise ValueError; report "no price" like the other extractors.
    return float(digits) if digits else None

In [27]:
# Spot-check the nightly-price extractor on the sample card.
get_listing_price_per_night(first_listing)

92.0

In [28]:
def get_listing_total_price(tag_object):
    """
    Return the total stay price of a listing card as a float.

    Scans every <span class="a8jt5op"> of the card and uses the first one
    whose text contains the word "total" and at least one digit. All digit
    characters of that text are concatenated and converted.

    Parameters
    ----------
    tag_object : object
        Listing card exposing a BeautifulSoup-style find_all() method.

    Returns
    -------
    float or None
        Parsed total price, or None when no span provides one.
    """
    for span in tag_object.find_all("span", {"class": "a8jt5op"}):
        if "total" not in span.text:
            continue
        digits = "".join(filter(str.isdigit, span.text))
        # A "total" span without digits cannot be parsed; keep looking
        # instead of failing on float("").
        if digits:
            return float(digits)
    return None

In [29]:
# Spot-check the total-price extractor; note this uses the second card
# (index 1), not first_listing like the checks above.
get_listing_total_price(all_codetags[1])

355.0

### Looking for additional data

In [31]:
def get_next_page(bs_object):
    """
    Return the absolute URL of the next results page.

    Looks up the first <a class="_za9j7e"> element and prefixes its href with
    the notebook-level base_url.

    Parameters
    ----------
    bs_object : object
        Parsed results page exposing a BeautifulSoup-style find() method.

    Returns
    -------
    str or None
        URL of the next page, or None when there is no such link or the link
        carries no href.
    """
    next_link = bs_object.find("a", {"class": "_za9j7e"})
    if next_link is None:
        return None

    href = next_link.get("href")
    # An anchor without href cannot be followed; concatenating None to
    # base_url would raise TypeError, so treat it as "no next page".
    if not href:
        return None
    return base_url + href

In [32]:
# Fetch the first results page again and show the next-page URL derived from it.
html_data = get_page(url)
get_next_page(html_data)

'https://airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Bali%2C%20Indonesia&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&checkin=2021-12-29&checkout=2022-01-03&source=structured_search_input_header&search_type=autocomplete_click&federated_search_session_id=afe3cda7-f582-4ac5-9733-f318fc969a91&pagination_search=true&items_offset=20&section_offset=2'

## Saving the Data

In [33]:
def get_data(soup_obj, max_pages=None):
    """
    Scrape all listings from a results page and from every page that follows it.

    get_data implementation using next-button links. This method suffers from
    the non-deterministic results of the Airbnb page.

    Parameters
    ----------
    soup_obj : object or None
        Parsed first results page (as returned by get_page). None yields an
        empty frame.
    max_pages : int or None, optional
        Stop after this many pages have been scraped. None (the default)
        follows next-page links until none is left.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns title, subtitle, info,
        ammenities, rating, reviews, price_per_night and total_price.
    """
    # Column name -> extractor; insertion order defines the column order.
    extractors = {
        "title": get_listing_title,
        "subtitle": get_listing_subtitle,
        "info": get_listing_info,
        "ammenities": get_listing_ammenities,
        "rating": get_listing_rating,
        "reviews": get_listing_reviews,
        "price_per_night": get_listing_price_per_night,
        "total_price": get_listing_total_price,
    }
    columns = {name: [] for name in extractors}

    current_page = soup_obj
    pages_scraped = 0
    while current_page is not None:
        for listing in current_page.find_all(class_='_12oal24'):
            for name, extract in extractors.items():
                columns[name].append(extract(listing))
        pages_scraped += 1
        if max_pages is not None and pages_scraped >= max_pages:
            break

        next_url = get_next_page(current_page)
        current_page = get_page(next_url) if next_url else None

    return pd.DataFrame(columns)

In [34]:
# Scrape starting from the first page and preview the first ten rows.
airbnb = get_data(soup)
airbnb.head(10)

Unnamed: 0,title,subtitle,info,ammenities,rating,reviews,price_per_night,total_price
0,❣️Romantic Staycation-PrivateSunset Pool@megan...,Entire villa in Ubud,2 guests · 1 bedroom · 1 bed · 1 bath,Pool · Wifi · Air conditioning,4.94,243.0,92.0,458.0
1,"NEW Private Luxe Villa, Jungle View-Pool-Koi Pond",Entire villa in Kecamatan Ubud,2 guests · 1 bedroom · 1 bed · 1 bath,Pool · Wifi · Air conditioning · Kitchen,5.0,16.0,71.0,355.0
2,Cozy 2BR Villa with Panoramic View of Rice Fields,Entire villa in Kecamatan Ubud,6 guests · 2 bedrooms · 2 beds · 2 baths,Pool · Wifi · Air conditioning · Kitchen,4.93,41.0,348.0,1740.0
3,Sparkling Gem - private pool - Mins to La Brisa,Entire villa in Mengwi,3 guests · 1 bedroom · 2 beds · 1 bath,Pool · Wifi · Air conditioning,4.85,98.0,69.0,342.0
4,Newly Built Villa Made East Near Seminyak Beach,Entire villa in Kuta,4 guests · 2 bedrooms · 2 beds · 2 baths,Pool · Wifi · Air conditioning · Kitchen,4.89,89.0,70.0,350.0
5,River Studio,Entire residential home in Perenenan,2 guests · 1 bedroom · 1 bed · 1 bath,Air conditioning · Kitchen,4.98,47.0,434.0,2168.0
6,"King Bed Bungalows w/ Gardens, Pools and Break...",Hotel room in Kuta Utara,2 guests · 1 bedroom · 1 bed · 1 private bath,Pool · Wifi · Air conditioning,4.86,21.0,33.0,162.0
7,baahi ( free scooter),Entire residential home in Kuta,2 guests · 1 bedroom · 1 bed · 1 bath,Pool · Wifi · Air conditioning · Kitchen,4.82,119.0,45.0,222.0
8,Private 2BR Luxury Villa with Fiber Optic Inte...,Entire villa in Seminyak,6 guests · 2 bedrooms · 2 beds · 2 baths,Pool · Wifi · Air conditioning · Kitchen,4.83,31.0,188.0,936.0
9,3 bedroom 3 bathroom villa in seminyak,Entire villa in Kuta Utara,6 guests · 3 bedrooms · 3 beds · 3.5 baths,Pool · Wifi · Air conditioning · Kitchen,4.74,50.0,703.0,3512.0


In [35]:
#airbnb.to_csv('Dataframe_final.csv')
#airbnb.to_excel("airbnb.xls")

## Additional Changes

In [36]:
import re
def get_listing_info_2(indiv_listing):
    """
    Split the info line of a listing card into numeric fields.

    The text of the first '_3c0zz1' div is split on '·' and each segment is
    matched against the keywords 'guest', 'bedroom', 'bed' and 'bath'.

    Parameters
    ----------
    indiv_listing : object
        Listing card exposing a BeautifulSoup-style find_all() method.

    Returns
    -------
    tuple or None
        (guests, bedroom, bed, bath) as floats; a field stays None when the
        line has no usable segment for it. A 'half-bath' segment counts as
        0.5 and decimal bath counts such as '3.5 baths' are kept. None is
        returned when the card has no info div at all.
    """
    result = {'guests': None, 'bedroom': None, 'bed': None, 'bath': None}
    try:
        info_text = indiv_listing.find_all("div", {"class": '_3c0zz1'})[0].text
    except (AttributeError, IndexError):
        # No info div on this card (or nothing to search in).
        return None

    for el in info_text.split('·'):
        number_matched = re.findall(r"\d+", el)
        # Recomputed for every segment so that a segment without a number
        # never inherits the number of the previous segment.
        val = float(number_matched[0]) if number_matched else None

        if val is not None:
            if 'guest' in el:
                result['guests'] = val
            if 'bedroom' in el:
                result['bedroom'] = val
            # 'bed' is a substring of 'bedroom'; exclude that case.
            if 'bed' in el and 'bedroom' not in el:
                result['bed'] = val

        if 'bath' in el:
            if 'HALF-BATH' in el.upper():
                result['bath'] = 0.5
            else:
                float_matched = re.findall(r"\d+\.\d+", el)
                if float_matched:
                    result['bath'] = float(float_matched[0])
                elif val is not None:
                    result['bath'] = val

    return tuple(result.values())

In [37]:
def get_listing_ammenities_2(indiv_listing):
    """
    Flag which of four amenities are mentioned on a listing card.

    Parameters
    ----------
    indiv_listing : object
        Listing card passed on to get_listing_ammenities.

    Returns
    -------
    tuple of str, or None
        Flags for (Pool, Wifi, Air conditioning, Kitchen). Each flag is the
        string 'True' or 'False' (not a bool). None is returned when the card
        has no amenities text.
    """
    amenities_text = get_listing_ammenities(indiv_listing)
    # Explicit check instead of a catch-all except: a missing amenities line
    # is the only expected failure here.
    if amenities_text is None:
        return None

    amenity_names = ('Pool', 'Wifi', 'Air conditioning', 'Kitchen')
    return tuple('True' if name in amenities_text else 'False' for name in amenity_names)

In [38]:
def get_data_append_2(soup, max_pages=None):
    """
    Scrape all listings like get_data, plus the parsed info and amenity tuples.

    get_data implementation using next-button links. This method suffers from
    the non-deterministic results of the Airbnb page.

    Every page is scraped, including the last one (the page that has no
    next-page link) and a result set that consists of a single page.

    Parameters
    ----------
    soup : object or None
        Parsed first results page (as returned by get_page). None yields an
        empty frame.
    max_pages : int or None, optional
        Stop after this many pages have been scraped. None (the default)
        follows next-page links until none is left.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns title, subtitle, info,
        ammenities, rating, reviews, price_per_night, total_price, info_2
        and ammenities_2.
    """
    # Column name -> extractor; insertion order defines the column order.
    extractors = {
        "title": get_listing_title,
        "subtitle": get_listing_subtitle,
        "info": get_listing_info,
        "ammenities": get_listing_ammenities,
        "rating": get_listing_rating,
        "reviews": get_listing_reviews,
        "price_per_night": get_listing_price_per_night,
        "total_price": get_listing_total_price,
        "info_2": get_listing_info_2,
        "ammenities_2": get_listing_ammenities_2,
    }
    columns = {name: [] for name in extractors}

    current_page = soup
    pages_scraped = 0
    # Loop on the page itself, not on the existence of a next link, so the
    # final page is scraped before the loop ends.
    while current_page is not None:
        for listing in current_page.find_all(class_='_12oal24'):
            for name, extract in extractors.items():
                columns[name].append(extract(listing))
        pages_scraped += 1
        if max_pages is not None and pages_scraped >= max_pages:
            break

        next_url = get_next_page(current_page)
        current_page = get_page(next_url) if next_url else None

    return pd.DataFrame(columns)

In [39]:
# Scrape again with the extended extractor set and preview the first rows.
airbnb_extended = get_data_append_2(soup)
airbnb_extended.head()

Unnamed: 0,title,subtitle,info,ammenities,rating,reviews,price_per_night,total_price,info_2,ammenities_2
0,❣️Romantic Staycation-PrivateSunset Pool@megan...,Entire villa in Ubud,2 guests · 1 bedroom · 1 bed · 1 bath,Pool · Wifi · Air conditioning,4.94,243.0,92.0,458.0,"(2.0, 1.0, 1.0, 1.0)","(True, True, True, False)"
1,"NEW Private Luxe Villa, Jungle View-Pool-Koi Pond",Entire villa in Kecamatan Ubud,2 guests · 1 bedroom · 1 bed · 1 bath,Pool · Wifi · Air conditioning · Kitchen,5.0,16.0,71.0,355.0,"(2.0, 1.0, 1.0, 1.0)","(True, True, True, True)"
2,Cozy 2BR Villa with Panoramic View of Rice Fields,Entire villa in Kecamatan Ubud,6 guests · 2 bedrooms · 2 beds · 2 baths,Pool · Wifi · Air conditioning · Kitchen,4.93,41.0,348.0,1740.0,"(6.0, 2.0, 2.0, 2.0)","(True, True, True, True)"
3,Sparkling Gem - private pool - Mins to La Brisa,Entire villa in Mengwi,3 guests · 1 bedroom · 2 beds · 1 bath,Pool · Wifi · Air conditioning,4.85,98.0,69.0,342.0,"(3.0, 1.0, 2.0, 1.0)","(True, True, True, False)"
4,Newly Built Villa Made East Near Seminyak Beach,Entire villa in Kuta,4 guests · 2 bedrooms · 2 beds · 2 baths,Pool · Wifi · Air conditioning · Kitchen,4.89,89.0,70.0,350.0,"(4.0, 2.0, 2.0, 2.0)","(True, True, True, True)"


In [40]:
#airbnb_extended.to_csv('airbnb_extended.csv')
#airbnb_extended.to_excel('airbnb_extended.xls')