# Fixing Scrape.py

In [6]:
import math
import time
import requests
import pandas as pd
import dask.delayed
from time import sleep
from dask import compute
from bs4 import BeautifulSoup
from datetime import date, datetime

We are going to start by running each function.

In [71]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    
    response type: tuple
    
    example response: (bs4.element.ResultSet, '2023-10-08 22:35:37.830176')
    """
    tos = str(datetime.now()) 
    result = requests.get(url)
    content = result.content
    page = BeautifulSoup(content, features='html')
    return page, tos

location = 'Japan'
a_url = f'http://www.airbnb.com/s/{location}/homes'

# get_page(a_url)

In [13]:
result = requests.get(a_url)

type(result.content)

bytes

In [16]:
type(BeautifulSoup(result.content, features='html'))

bs4.BeautifulSoup

In [72]:
a_page = get_page(a_url)

In [73]:
type(a_page)

tuple

In [74]:
type(a_page[0].findAll('div')) # , {'class':'_8ssblpx'}  #  _8ssblpx _uhpzdny _gig1e7 _1wcpzyga

bs4.element.ResultSet

In [75]:
len(a_page[0].findAll('div'))

1397

In [76]:
len(a_page[0])

2

In [77]:
a_page[1]

'2023-10-08 23:29:57.478695'

In [78]:
# a_page

One way it appears we can find all the listings is by finding everywhere where `target="listing_` exists. The part after `listing_` is the listing number, which is also a way to find the URL of the listing. For example, `44182350` can be found `https://www.airbnb.com/rooms/44182350`.

It looks like this `get_room_classes()` function was meant to pull each of the image + text boxes from a page on Airbnb. Do we need to do that? Or do we just need the URLs? A big reason I ask this is the function depends on a `.findAll()` method call from a `soup_page` object which uses parameters `'div', {'class':'_8ssblpx'}`. Past `div` breaking, `_8ssblpx` feels almost certain to break.

The functions below `get_room_classes()` extract specific information. What was I extracting from these listing boxes? It looks like almost everything...
- listing url
- listing title
- top row (what_it_is, where_it_is)
- room info
- room price
- room rating and number of ratings

This is all information which can be collected from the page of the listing as well. Let's migrate it to there. It looks like the `class` below, which calls all these functions, can also be cleaned up so I think let's just rework everything and make sure the data coming out is the same format or we can reformat old data.

To start with the fix, let's make a function, `get_listing_urls()` which collects all the listing URLs from the browse page. It could be interesting to get information like the display image chosen or other things which may only be available on the main browse pages from the browse page as well, but I am not going to worry about that now.

First, I need to make sure the idea of collecting all the listing URLs works and figure out how to do it from the soup object.

In [30]:
# def get_room_classes(soup_page):
#     """
#     returns all the listings that can be found on the page (soup object) in a list
#     """
#     rooms = soup_page.findAll('div', {'class':'_8ssblpx'})  #  _8ssblpx _uhpzdny _gig1e7 _1wcpzyga
#     result = []
#     for room in rooms:
#         result.append(room)
#     return result


# get_room_classes(get_page(a_url))
# # AttributeError: 'tuple' object has no attribute 'findAll'

In [79]:
type(a_page[0])

bs4.BeautifulSoup

In [80]:
len(a_page[0].find_all("div"))

1397

In [45]:
a_page[0].find_all(string="listing")

[]

Trying to figure out how to find all the listings https://stackoverflow.com/questions/33396785/how-to-find-a-particular-word-in-html-page-through-beautiful-soup-in-python

Finding everything that's like `aria-labelledby="tit` would work but finding all the `aria-labelledby` would be better then check which cointain "listing_".

In [49]:
import bs4
import re

data = '''
<html>
<body>
<div>today is a sunny day</div>
<div>I love when it's sunny outside</div>
Call me sunny
<div>sunny is a cool word sunny</div>
</body>
</html>
'''

searched_word = 'sunny'

soup = bs4.BeautifulSoup(data, 'html.parser')
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)

print('Found the word "{0}" {1} times\n'.format(searched_word, len(results)))

for content in results:
    words = content.split()
    for index, word in enumerate(words):
        # If the content contains the search word twice or more this will fire for each occurence
        if word == searched_word:
            print('Whole content: "{0}"'.format(content))
            before = None
            after = None
            # Check if it's a first word
            if index != 0:
                before = words[index-1]
            # Check if it's a last word
            if index != len(words)-1:
                after = words[index+1]
            print('\tWord before: "{0}", word after: "{1}"'.format(before, after))

Found the word "sunny" 4 times

Whole content: "today is a sunny day"
	Word before: "a", word after: "day"
Whole content: "I love when it's sunny outside"
	Word before: "it's", word after: "outside"
Whole content: "
Call me sunny
"
	Word before: "me", word after: "None"
Whole content: "sunny is a cool word sunny"
	Word before: "None", word after: "is"
Whole content: "sunny is a cool word sunny"
	Word before: "word", word after: "None"


**I am concerned this method would be inefficient at scale.. but let's see if it can work on a single page to start.**

In [64]:
# searched_word = 'listing'

# # len(a_page[0].find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)[0])
# type(a_page[0].find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)[0])

# for _ in a_page[0].find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)[0].join(''):
#     print(_)

This method is taking far too long to work at scale, or it broke...

This may be more useful: https://stackoverflow.com/questions/52656353/get-specific-links-with-target-in-python-beautifulsoup

It looks like if I can find all the tags, `a` in this example, then specify what I am looking for within each of those tags, I will be able to extract each bit I am after. What is the tag I am looking for?data-testid="card-container"

In [None]:
data-testid="card-container"

In [88]:
a_page[0].find('div', {'data-testid':'card-container'})['aria-labelledby']

'title_44182350'

In [89]:
for _ in a_page[0].find_all('div', {'data-testid':'card-container'}):
    print(_['aria-labelledby'])

title_44182350
title_43127419
title_15974146
title_28142598
title_24688711
title_36286963
title_5608532
title_1298200
title_875191656813979483
title_52393892
title_8487288
title_745111324566410595
title_39117472
title_11701395
title_41357662
title_12770524
title_9765116
title_44637320


In [90]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    
    response type: tuple
    
    example response: (bs4.element.ResultSet, '2023-10-08 22:35:37.830176')
    """
    tos = str(datetime.now()) 
    result = requests.get(url)
    content = result.content
    page = BeautifulSoup(content, features='html')
    return page, tos

location = 'Japan'
a_url = f'http://www.airbnb.com/s/{location}/homes'

a_page = get_page(a_url)

def get_listing_ids_from_page(page):
    listings = [_['aria-labelledby'].split('_')[1] for _ in page[0].find_all('div', {'data-testid':'card-container'})]
    
    return listings

get_listing_ids_from_page(a_page)


['51113056',
 '48596594',
 '895088127637716421',
 '970046008971749390',
 '46318718',
 '581010866199809991',
 '9036683',
 '875687088662113784',
 '29113700',
 '9280298',
 '990798364955992889',
 '14832676',
 '976582464843609267',
 '30326048',
 '959070829847421670',
 '962616042346760815',
 '9813434',
 '21620323']

In [127]:
type(a_page.find_all('div', {'data-testid':'card-container'}))

bs4.element.ResultSet

In [130]:
for _ in a_page.find_all('div', {'data-testid':'card-container'}):
    print(_['aria-labelledby'])

title_624217548914064158
title_44182350
title_15974146
title_28142598
title_24688711
title_36286963
title_5608532
title_1298200
title_875191656813979483
title_52393892
title_8487288
title_745111324566410595
title_39117472
title_11701395
title_23639823
title_41357662
title_2621702
title_765556804156974539


In [131]:
len(a_page.find_all('div', {'data-testid':'card-container'}))

18

Great - now we have a way to get all the listing IDs from a page. Let's see if it works at scale....

How do page URLs change??

In [91]:
page_3 = "https://www.airbnb.com/s/japan/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Japan&place_id=ChIJLxl_1w9OZzQRRFJmfNR1QvU&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2023-11-01&monthly_length=3&price_filter_input_type=0&price_filter_num_nights=5&channel=EXPLORE&search_type=unknown&federated_search_session_id=6072c860-2732-4c6f-bb29-4699462829a0&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjozNiwidmVyc2lvbiI6MX0%3D"
print(page_3)

https://www.airbnb.com/s/japan/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Japan&place_id=ChIJLxl_1w9OZzQRRFJmfNR1QvU&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2023-11-01&monthly_length=3&price_filter_input_type=0&price_filter_num_nights=5&channel=EXPLORE&search_type=unknown&federated_search_session_id=6072c860-2732-4c6f-bb29-4699462829a0&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjozNiwidmVyc2lvbiI6MX0%3D


In [None]:
# ?tab_id=home_tab&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjozNiwidmVyc2lvbiI6MX0%3D

In [None]:
# https://www.airbnb.com/s/japan/homes?cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjozNiwidmVyc2lvbiI6MX0%3D

#### Breaking down cursor
The below URLs will take you to page 5 and page 6 of a search in Texas.. what is the difference?

In [92]:
url_5 = "https://www.airbnb.com/s/Texas--United-States/homes?cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo3MiwidmVyc2lvbiI6MX0%3D"
url_6 = "https://www.airbnb.com/s/Texas--United-States/homes?cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo5MCwidmVyc2lvbiI6MX0%3D"

len(url_5), len(url_6)

(129, 129)

In [95]:
url_5 == url_6

False

In [98]:
len(url_5) == len(url_6)

True

In [101]:
url_5_diff = ''
url_6_diff = ''

for i in range(len(url_5)):

    if url_5[i] == url_6[i]:
        pass
    else:
        print(i)
        url_5_diff += url_5[i]        
        url_6_diff += url_6[i]     
        
        
url_5_diff, url_6_diff

106
108


('3i', '5C')

In [105]:
x5 = "cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo3MiwidmVyc2lvbiI6MX0%3D"
x6 = "cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo5MCwidmVyc2lvbiI6MX0%3D"

url_5_diff = ''
url_6_diff = ''

for i in range(len("cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo3MiwidmVyc2lvbiI6MX0%3D")):

    if x5[i] == x6[i]:
        pass
    else:
        print(i)
        url_5_diff += x5[i]        
        url_6_diff += x6[i]     
        
        
url_5_diff, url_6_diff

54
56


('3i', '5C')

In [111]:
"cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo5MCwidmVyc2lvbiI6MX0%3D"[:53]
"cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0Ijo5MCwidmVyc2lvbiI6MX0%3D"[53:]
x5[54:57], x6[54:57]

('3Mi', '5MC')

Alright, the difference between URLs for each page can be very minor and can be found in the `courser=` section.

Here's page 15 for that search https://www.airbnb.com/s/Texas--United-States/homes?cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjoyNTIsInZlcnNpb24iOjF9

And here's page 2 https://www.airbnb.com/s/Texas--United-States/homes?cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D

Both with everything but `coursor` removed...

## Take 2
Based on my notes in the `wip_notebooks` section of the repo, for [building the multi page scrape](https://github.com/gumdropsteve/airbnb/blob/feature/scrape/wip_notebooks/01_building_multi_page_base_scrape.ipynb), it looks like I was using `?offset` param in the URL to sift through listings. I just tried this in the browser and it seemed to work, though the page number didn't change from 1. That makes sense though given the search's page 1 would start after the `offset`. There are 6 rows of listings per page, each row has 3 listings. 

Note: It looks like at the time of my wiring that, there was a cap of 300 listings. I just tried a url with offset 360 and it worked. Maybe the limit of 300 results has been lifited.

In [16]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    
    response type: tuple
    
    example response: (bs4.element.ResultSet, '2023-10-08 22:35:37.830176')
    """
    tos = str(datetime.now()) 
    result = requests.get(url)
    content = result.content
    page = BeautifulSoup(content, features='html')
    return page, tos

location = 'Japan'
a_url = f'http://www.airbnb.com/s/{location}/homes'

# a_page = get_page(a_url)

def get_listing_ids(base_url, max_offset=500):
    # start list for listing ids and figure out how many pages we will run
    listing_ids = []
    n_pages = int(max_offset / 18)
    
    # run all the pages and add the listing ids to the listing ids list
    for _ in range(n_pages):
        
        # set the page url and pull it's content
        page = base_url + f"?offset={18 * _}"
#         print(page)
        page = get_page(page)[0]
        
        listing_ids += [_['aria-labelledby'].split('_')[1] for _ in page.find_all('div', {'data-testid':'card-container'})]
        listing_ids = set(listing_ids)
        listing_ids = list(listing_ids)
#         print(len(listing_ids))

    # return the listings
    print(len(listing_ids))
    return listing_ids


japan_ids = get_listing_ids(a_url, max_offset=690)
# japan_ids

57


### Not working
This approach doesn't work in the browser, and it doesn't seem to be working with the function either, though the fuction is getting more listings with each run.. 

How are we going to find the next page?

Can we get it from the bottom of each page?

In [79]:
a_page = get_page(a_url)

a_page = a_page[0]

# # .find_all('div', {'data-testid':'card-container'}
# # aria-label="Search results pagination"
# for _ in a_page.find_all('nav', {'aria-label':'Search results pagination'}):
#     try:
#         print(_['dev'])
#     except:
#         pass
    
# a_page.find_all('nav', {'aria-label':'Search results pagination'})

#  =""

# for _ in a_page.find_all('button', {'aria-current':'page'}):
#     print(_)
#     print(type(_))
#     print(len(_))

# type(a_page.find_all('button', {'aria-current':'page'}))
# bs4.element.ResultSet

str(a_page.find_all('a', {'aria-label':'Next'})).split('href=')[1].split(">")[0].replace('"', "").replace("'", '')

'/s/Japan/homes?tab_id=home_tab&amp;refinement_paths%5B%5D=%2Fhomes&amp;query=Japan&amp;place_id=ChIJLxl_1w9OZzQRRFJmfNR1QvU&amp;flexible_trip_lengths%5B%5D=one_week&amp;monthly_start_date=2023-11-01&amp;monthly_length=3&amp;price_filter_input_type=0&amp;price_filter_num_nights=5&amp;channel=EXPLORE&amp;federated_search_session_id=3f91fc35-0fad-4765-8f97-a49c08df4b8b&amp;search_type=unknown&amp;pagination_search=true&amp;cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D'

In [90]:
type(a_page)

bs4.BeautifulSoup

In [89]:
type(a_page.find_all('nav', {'aria-label':'Search results pagination'}))

bs4.element.ResultSet

In [148]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    
    response type: tuple
    
    example response: (bs4.element.ResultSet, '2023-10-08 22:35:37.830176')
    """
    tos = str(datetime.now()) 
    result = requests.get(url)
    content = result.content
    page = BeautifulSoup(content, features='html')
    return page, tos


base_url = 'http://www.airbnb.com'
location = 'Japan'
search_url = f"/s/{location}/homes"

In [205]:
# create holdings for room id, room link, and page link
room_ids = []
room_links = []
page_links = []

# remember which page links we have pulled
used_page_links = []

# pick a location to scrape
location = "Japan"

# tag the base airbnb url and the basic search addage
base_url = 'http://www.airbnb.com'
search_url = f"/s/{location}/homes"

# set first page
page_url = base_url + search_url
a_page = get_page(page_url)
used_page_links.append(page_url)

# focus on the bs4 results
a_page = a_page[0]

# define a function to get all the links from that page
def get_links_from_page(bs_page
                       ):
    
    room_ids = []
    room_links = []
    page_links = []    
    
    # find all the links in the page
    for _ in a_page.find_all('a'):
        try:
            # focus the link in the a tag
            link = _['href']

            # is this a rooms link?
            if "/rooms/" in link:
                
                # if we don't already have this room link
                if base_url + link not in room_links:
                    # add room link with airbnb starter
                    room_links.append(base_url + link)
                    
                    # extract room id from room
                    room = link.split('rooms/')[1].split('?')[0]
                    try:
                        room_ids.append(int(room))
                    except:
                        print(f'room id error: {room}')
                        room_ids.append(room)
                else:
                    pass

            # is this a results page link?
            elif "/s/" in link:
                # if it is not an experience link
                if '/experiences/' not in link:
                    # if we do not already have this link
                    if base_url + link not in page_links:
                        # add it to the results
                        page_links.append(base_url + link)
                    else:
                        pass
                else:
                    pass
                
            else:
                pass

        except:
            pass
        
    return room_ids, room_links, page_links

        
room_ids, room_links, page_links = get_links_from_page(a_page)

In [206]:
for page_link in page_links:
    if page_link not in used_page_links:
        # pull that link
        a_page = get_page(page_link)
        a_page = a_page[0]
        # pull the page for room ids
        new_room_ids, new_room_links, new_page_links = get_links_from_page(a_page)
        room_ids = room_ids + new_room_ids
        room_links = room_links + new_room_links
        page_links = page_links + new_page_links
        # add it to the used page links list
        used_page_links.append(page_link)
    else:
        pass
    

print(len(page_links))
len(room_ids)

20


90

In [207]:
len(set(room_ids))

85

In [208]:
len(room_ids)

90

In [209]:
len(room_links)

90

In [210]:
len(set(room_links))

90

In [211]:
set(room_links)

{'http://www.airbnb.com/rooms/11220099?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2023-11-20&check_out=2023-11-25&source_impression_id=p3_1696835539_G5jRY1SqZQWt1Ee8&previous_page_section_name=1000&federated_search_id=3ca773d2-d338-4eb2-906e-f7cabde4e0d3',
 'http://www.airbnb.com/rooms/13771795?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2023-11-28&check_out=2023-12-03&source_impression_id=p3_1696835540_YI8FdRQE%2FvKPF0DN&previous_page_section_name=1000&federated_search_id=2b0d7abc-061e-42dc-9245-87e6f2e484d7',
 'http://www.airbnb.com/rooms/14832676?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&check_in=2023-11-06&check_out=2023-11-11&source_impression_id=p3_1696835534_WUnyCGfN8U9A%2BC2V&previous_page_section_name=1000&federated_search_id=bb03383e-6880-4824-a268-4022660e4469',
 'http://www.airbnb.com/rooms/16056480?adults=1&category_tag=Tag%3A8678&children=0&enable_m3_private_room=true&infants=0&pets=0&phot

In [192]:
len(room_links)

2518

In [161]:
room_ids

set()

In [132]:
len(a_page.find_all('nav', {'aria-label':'Search results pagination'}))

1

In [133]:
result = a_page.find_all('a', {'aria-label':'Next'})

# for th in result:
#     result.extend(th.find_all('href'))
    
len(result)

1

In [122]:
# result.pop()

In [123]:
result.remove('href')

ValueError: list.remove(x): x not in list

In [125]:
# result.sort('href')
result

TypeError: list indices must be integers or slices, not str

This is a method for finding the next page from the current page. For some reason, I am not able to run a find_all or extract a ['href'] from this result set, even though I can from a result set before..

In [82]:
def get_page(url):
    """
    returns a soup object that contains all the information of a given webpage
    
    response type: tuple
    
    example response: (bs4.element.ResultSet, '2023-10-08 22:35:37.830176')
    """
    tos = str(datetime.now()) 
    result = requests.get(url)
    content = result.content
    page = BeautifulSoup(content, features='html')
    return page, tos

# location = 'Japan'
# a_url = f'/s/{location}/homes'

# a_page = get_page(a_url)

def get_listing_ids(base_url='http://www.airbnb.com', location='Japan', max_pages=30):
    # start list for listing ids and figure out how many pages we will run
    listing_ids = []
    current_url = base_url + f'/s/{location}/homes'
    
    for _ in range(max_pages):
        
        # set the page url and pull it's content
        page = get_page(current_url)[0]
        
        # pull listing ids
        listing_ids += [_['aria-labelledby'].split('_')[1] for _ in page.find_all('div', {'data-testid':'card-container'})]
        
        listing_ids = set(listing_ids)
        listing_ids = list(listing_ids)
        print(len(listing_ids))
        
        print(str(a_page.find_all('a', {'aria-label':'Next'})).split('href=')[1].split(">")[0].replace('"', "").replace("'", ''))
        
#         current_url = str(a_page.find_all('a', {'aria-label':'Next'})).split('href=')[1].split(">")[0].replace('"', "").replace("'", '')
#         current_url = base_url + current_url
#         print(current_url)


japan_ids = get_listing_ids()
# japan_ids

18
/s/Japan/homes?tab_id=home_tab&amp;refinement_paths%5B%5D=%2Fhomes&amp;query=Japan&amp;place_id=ChIJLxl_1w9OZzQRRFJmfNR1QvU&amp;flexible_trip_lengths%5B%5D=one_week&amp;monthly_start_date=2023-11-01&amp;monthly_length=3&amp;price_filter_input_type=0&amp;price_filter_num_nights=5&amp;channel=EXPLORE&amp;federated_search_session_id=3f91fc35-0fad-4765-8f97-a49c08df4b8b&amp;search_type=unknown&amp;pagination_search=true&amp;cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D
36
/s/Japan/homes?tab_id=home_tab&amp;refinement_paths%5B%5D=%2Fhomes&amp;query=Japan&amp;place_id=ChIJLxl_1w9OZzQRRFJmfNR1QvU&amp;flexible_trip_lengths%5B%5D=one_week&amp;monthly_start_date=2023-11-01&amp;monthly_length=3&amp;price_filter_input_type=0&amp;price_filter_num_nights=5&amp;channel=EXPLORE&amp;federated_search_session_id=3f91fc35-0fad-4765-8f97-a49c08df4b8b&amp;search_type=unknown&amp;pagination_search=true&amp;cursor=eyJzZWN0aW9uX29mZnNldCI6MSwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvb

KeyboardInterrupt: 

In [None]:
class AirBnbScrape:
    
    def __init__(self, location, location_alias):
        """
        set location, base (url) link, and blank record books
        """
        self.base_link = f'http://www.airbnb.com/s/{location}/homes'
        self.location = location
        self.location_alias = location_alias        

In [None]:



def get_listing_link(listing):
    """
    returns the URL link of given listing
    """
    listing_link = 'http://airbnb.com' + listing.find('a')['href']
    listing_link = listing_link.split('?')[0]
    return listing_link


def get_listing_title(listing):
    """
    returns the title of given listing
    """
    title = listing.find('meta')['content']
    title = title.split(' - null - ')
    return title[0]


def get_top_row(listing):
    """
    returns the top row of given listing's info
    """
    top_row = listing.find('div', {'class':'_1tanv1h'}).text  # _167gordg
    top_row = top_row.split(' in ')
    # what are we looking at?
    what_it_is = top_row[0]
    # where is it?
    where_it_is = top_row[1]
    return what_it_is, where_it_is


def get_room_info(listing):
    """
    returns room info of listing 
    """
    room_info = listing.find('div', {'class', '_kqh46o'}).text
    split_info = [i.split() for i in room_info.split(' · ')]
    room_dict = {}
    for i in split_info:
        if i not in [['Studio'], ['Half-bath']]:
            if len(i) == 2:
                room_dict[i[1]] = i[0]
            # shared-baths
            elif len(i) == 3:
                i = [i[0], '-'.join([i[1], i[2]])]
                room_dict[i[1]] = i[0]
            else:
                if i[1] == 'total':
                    room_dict['bedrooms'] = [i[0]]
                else:
                    print(f'unexpected room_info | unexpected split_info len(i)=={len(i)}!=2!=3\n{i}')
                    room_dict[' '.join(i)] = i[0]
        else:
            # Half-baths and Studios
            if i[0] == 'Studio':
                room_dict['is_studio'] = True
            room_dict[i[0]] = 0
    
    # need better solution for bedrooms
    weird_bedrooms = 0 
    try:
        b = room_dict['bedrooms']
        del b
    except:
        try:
            room_dict['bedrooms'] = room_dict['bedroom']
        except:
            try:
                room_dict['bedrooms'] = room_dict['Studio']
            except:
                weird_bedrooms += 1
                print(f'weird bedrooms {weird_bedrooms}')
                room_dict['bedrooms'] = room_dict.get('bedrooms')
    
    try:
        room_dict['baths']
    except:
        try:
            room_dict['baths'] = room_dict['bath']
        except:
            room_dict['baths'] = None
    
    room_dict['half_baths'] = room_dict.get('Half-bath')
    room_dict['shared_baths'] = room_dict.get('shared-baths')
    room_dict['is_studio'] = room_dict.get('is_studio', False)
    room_dict['beds'] = room_dict.get('beds')
    room_dict['guests'] = room_dict.get('beds')

    # check for bedrooms list
    if type(room_dict['bedrooms']) == list:
        if len(room_dict['bedrooms']) == 1:
            room_dict['bedrooms'] = float(room_dict['bedrooms'][0])
        else:
            raise Exception(f'unexpected bedrooms list | {room_dict["bedrooms"]}')
            
    room_dict = {key:value for key,value in room_dict.items() if key in ['guests', 'bedrooms', 'beds', 'is_studio', 'baths', 'half_baths', 'shared_baths']}
            
    return room_dict


def get_room_price(listing):
    """
    returns the nightly rate (price) of given listing
    """
    price_text = listing.find('div', {'class':'_ls0e43'}).text
    price = price_text.split('$')
    price = price[1]
    # extract float value
    price = price.split(" ")[0]  # skip the $
    # remove possible / at end of string
    if '/' in price:
        price = price[:len(price) - 1]
    # adjust for places with > 999 reviews
    if ',' in price:
        price = ''.join(price.split(','))
    return float(price)


def get_room_rating_and_reviews(listing):
    """
    returns star rating and number of reviews of given listing
    """
    try:
        output = listing.find('span', {'class':'_18khxk1'}).text
        output = output.split('\xa0')
        
        avg_rating = float(output[0])
        n_reviews = float(output[1][:-1].split('(')[1])

        return avg_rating, n_reviews
    except:
        try:
            return listing.find('span', {'class':'_18khxk1'}), listing.find('span', {'class':'_18khxk1'})
        except:
            raise Exception(f'get_room_rating_and_reviews | listing == {type(listing), len(listing)}')


class airbnb_scrape():
    
    def __init__(self, location, location_alias):
        """
        set location, base (url) link, and blank record books
        """
        self.base_link = f'http://www.airbnb.com/s/{location}/homes'
        self.location = location
        self.location_alias = location_alias
        
        self.n_pages = None
        self.n_results = None
        self.page_urls = []
        self.data_dir = 'data/'
        
        # set known basic amenities
        self.possible = ['Gym', 'Wifi', 'Self check-in', 'Air conditioning', 'Pets allowed', 'Indoor fireplace', 'Hot tub', 'Free parking', 'Pool', 'Kitchen', 'Breakfast', 'Elevator', 'Washer', 'Dryer', 
                         'Heating', 'Waterfront', 'Dishwasher', 'Beachfront', 'Ski-in/Ski-out', 'Terrace', 'Sonos sound system', 'BBQ grill', 'Hair dryer', "Chef's kitchen", 'Wet bar', 'Sun loungers', 
                         'Home theater', 'Housekeeping', 'Gated property', 'Gas fireplace', 'Plunge pool', 'Infinity pool', 'Sun deck', 'Game room', 'Surround sound system', 'Resort access']

        # set current schema column names
        self.names = ['ds', 'search_filter', 'url', 'title', 'type', 'location', 'guests', 'bedrooms', 'beds', 'is_studio', 'baths', 'half_baths', 'shared_baths', 'price', 'avg_rating', 'n_reviews', 'gym_bool', 
                      'wifi_bool', 'self_check_in_bool', 'air_conditioning_bool', 'pets_allowed_bool', 'indoor_fireplace_bool', 'hot_tub_bool', 'free_parking_bool', 'pool_bool', 'kitchen_bool', 'breakfast_bool', 
                      'elevator_bool', 'washer_bool', 'dryer_bool', 'heating_bool', 'waterfront_bool', 'dishwasher_bool', 'beachfront_bool', 'ski_in_ski_out_bool', 'terrace_bool', 'sonos_sound_system_bool', 
                      'bbq_grill_bool', 'hair_dryer_bool', 'chefs_kitchen_bool', 'wet_bar_bool', 'sun_loungers_bool', 'home_theater_bool', 'housekeeping_bool', 'gated_property_bool', 'gas_fireplace_bool', 
                      'plunge_pool_bool', 'infinity_pool_bool', 'sun_deck_bool', 'game_room_bool', 'surround_sound_system_bool', 'resort_access_bool']
        
        self.dtypes = {'ds': 'object', 'search_filter': 'object', 'url': 'object', 'title': 'object', 'type': 'object', 'location': 'object', 'guests': 'float64', 'bedrooms': 'float64', 'beds': 'float64', 
                       'is_studio': 'bool', 'baths': 'float64', 'half_baths': 'float64', 'shared_baths': 'float64', 'price': 'float64', 'avg_rating': 'float64', 'n_reviews': 'float64', 'gym_bool': 'bool', 
                       'wifi_bool': 'bool', 'self_check_in_bool': 'bool', 'air_conditioning_bool': 'bool', 'pets_allowed_bool': 'bool', 'indoor_fireplace_bool': 'bool', 'hot_tub_bool': 'bool', 'free_parking_bool': 
                       'bool', 'pool_bool': 'bool', 'kitchen_bool': 'bool', 'breakfast_bool': 'bool', 'elevator_bool': 'bool', 'washer_bool': 'bool', 'dryer_bool': 'bool', 'heating_bool': 'bool', 
                       'waterfront_bool': 'bool', 'dishwasher_bool': 'bool', 'beachfront_bool': 'bool', 'ski_in_ski_out_bool': 'bool', 'terrace_bool': 'bool', 'sonos_sound_system_bool': 'bool', 
                       'bbq_grill_bool': 'bool', 'hair_dryer_bool': 'bool', 'chefs_kitchen_bool': 'bool', 'wet_bar_bool': 'bool', 'sun_loungers_bool': 'bool', 'home_theater_bool': 'bool', 'housekeeping_bool': 'bool', 
                       'gated_property_bool': 'bool', 'gas_fireplace_bool': 'bool', 'plunge_pool_bool': 'bool', 'infinity_pool_bool': 'bool', 'sun_deck_bool': 'bool', 'game_room_bool': 'bool', 
                       'surround_sound_system_bool': 'bool', 'resort_access_bool': 'bool'}

    def get_basic_facilities(self, listing):
        '''
        returns a dictionary of the given listing's basic facilities with True / None values based on known possible basic facilites
        '''
        # make list of this listing's basic facilites
        try:
            basic_facilities = listing.findAll("div", {"class":"_kqh46o"})[1].text
            basic_facilities = basic_facilities.split(' · ')
        except:
            basic_facilities = []

        # open a record for this listing
        room_dict = {}
        
        # add each basic facility to this room's record 
        for f in basic_facilities:
            if f in self.possible:
                room_dict[f] = True
            else:
                # looks liek we have a new basic facility
                i = input(f'unexpected basic_facilites | {f} | is new? (y/n) ')
                if i == 'y':
                    i = input(f'ok, new basic facility\nwhat should the column name be?\ne.g. Hot tub is hot_tub_bool\n"exit" to quit\n column name == ')
                    if i != 'exit':
                        # set new amenity
                        room_dict[f] = True
                        # update possible amenities and column names
                        self.possible.append(f)
                        self.names.append(i)
                        print(f'\nnew self.possible ==\n{self.possible}\n\nnew self.names ==\n{self.names}\n\nplease update now (sleeping 60 seconds)\n')
                        sleep(60)
                    else:
                        raise Exception(f"not sure what's going on.. | unexpected basic_facilites | {f} | user exit")
                else:
                    raise Exception(f"not sure what's going on.. | unexpected basic_facilites | {f}")
        
        # add None for any basic facilities this listing doesn't offer
        for f in self.possible:
            room_dict[f] = room_dict.get(f, None)
        
        return room_dict
    
    def find_n_results(self, soup_page):
        """
        finds total number of search results from page 1 (of search results)
        """
        try:
            # keep track of how many results we have
            self.n_results = soup_page.find('div', {'class':'_1h559tl'}).text
        except:
            raise Exception('n results not found on 1st page')
    
    def find_n_pages(self, soup_page, listings_per_page=20):
        """
        finds number of existing pages from 1st page of search results
        """
        try:
            n_results_string = soup_page.find('div', {'class':'_1h559tl'}).text 
            # check if 300+ club
            if '300+' in n_results_string:
                self.n_pages = 15
            else:
                split_results_string = n_results_string.split(' of ')
                n_total_results_string = split_results_string[1]
                # check for unknown + edge case
                if '+' in n_total_results_string:
                    raise Exception(f'+ in n_total_results_string but 300+ is not\nn_total_results_string == {n_total_results_string}')
                else:
                    # find number of results
                    split_total_results_string = n_total_results_string.split(' ')
                    n_total_results = int(split_total_results_string[0])
                    n_pages = n_total_results / listings_per_page 
                    n_pages = math.ceil(n_pages)
                    self.n_pages = n_pages
        except:
            print(f'find_n_pages error | {self.location}')
            self.n_pages = 1
        # tell me how many pages there are
        print(self.n_pages)
    
    def make_page_urls(self, base_page, n_pages='auto', listings_per_page=20):
        """
        makes pages for search results (sets of 20)
        """
        # reset page urls
        self.page_urls = []
        # if n_pages wasn't set
        if n_pages == 'auto':
            # find out how many pages there are
            self.find_n_pages(base_page, listings_per_page=listings_per_page)
        # items_offset is 1st filter (?) or after 1st filter (&)
        if '?' not in base_page:
            c = '?'
        else:
            c = '&'
        # create page urls
        for i in range(self.n_pages):
            # 1st page alread done earlier
            if i != 0:
                url = f'{base_page}{c}items_offset={i * listings_per_page}'
                self.page_urls.append(url)
            else:
                pass
    
    def record_dataset(self, listings, tos, _filter):
        """
        take scraped room classes and record their information to csv

        tos: time of scrape
            > str datetime.datetime.now()

        _filter: filter applied to scrape
            > str, None if no filter
        """
        data = []
        for l in listings:
            # listing link
            a = get_listing_link(l)
            # listing title
            b = get_listing_title(l)
            # top row info
            c, d = get_top_row(l)
            # room info (beds, baths, etc..)
            _ = get_room_info(l)
            e, f, g, h, i, j, k = _['guests'], _['bedrooms'], _['beds'], _['is_studio'], _['baths'], _['half_baths'], _['shared_baths']
            del _
            # room nightly rate
            m = get_room_price(l)
            # room rating and n reviews
            n, o = get_room_rating_and_reviews(l)
            # basic facilites
            _ = self.get_basic_facilities(l)
            p = [_[bf] for bf in self.possible]
            # list of all listing info
            out = [_filter] + [a, b, c, d, e, f, g, h, i, j, k, m, n, o] + p
            # add time of scrape to data as 1st datapoint (jan 15 2021)
            out = [tos] + out
            # add it to the data collection 
            data.append(out)
        
        # add this scrape to the location's existing dataset
        try:
            pd.concat([pd.read_parquet(f'{self.data_dir}{self.location_alias}.parquet'), 
                       pd.DataFrame(data, columns=self.names)], axis=0).to_parquet(f'{self.data_dir}{self.location_alias}.parquet', index=False)
        # first time we've scraped this location, make a new dataset
        except:
            # check this is actually new so we don't accidenly overwrite existing data (delete 'y'# from the below line if you want to perform manual check, outherwise defaults to make new file)
            i = 'y'#input(f'recording new location: {self.location_alias}? (y/n) ')
            if i == 'y':
                # make dataframe from scraped data, column names from __init__()
                df = pd.DataFrame(data, columns=self.names)
                # go through each column
                for column in self.dtypes:
                    # our bool data is scraped as True/None, we need True/False
                    if 'bool' in column:
                        # fill None values in bool column with False
                        df[column] = df[column].fillna(False)
                    # convert column to expected dtype for parquet
                    df[column] = df[column].astype(self.dtypes[column])
                # write new parquet file
                df.to_parquet(f'{self.data_dir}{self.location_alias}.parquet', index=False)
                del df  # free up space
            else:
                raise Exception("not recording a new location, what's going on?")
    
    def scrape_search(self, base_link, search_alias, _filter, n_pages='auto', printout=False):
        """
        record results of a given search link
        """        
        # get 1st page
        base_link_page_1, t = get_page(base_link)
        
        # record the 1st page
        if printout:
            print(self.record_dataset(get_room_classes(base_link_page_1), tos=t, _filter=_filter))
        else:
            self.record_dataset(get_room_classes(base_link_page_1), tos=t, _filter=_filter)
        
        # get urls for other pages 
        if n_pages=='auto':
            self.make_page_urls(self.base_link, self.find_n_pages(base_link_page_1))
        else:
            self.make_page_urls(self.base_link, n_pages)        
        
        for url in self.page_urls:
            if printout:
                page, t = get_page(url)
                print(self.record_dataset(get_room_classes(page), tos=t, _filter=_filter))
            else:
                page, t = get_page(url)
                self.record_dataset(get_room_classes(page), tos=t, _filter=_filter)
                
        # output where we can find the file (relative path)
        return f'{self.data_dir}{self.location_alias}.parquet'
    
    @dask.delayed
    def scrape_types(self, printout=False):
        """
        record data from a loacations results for each of the big 4 room type filters and for each of those with superhosts only filter applied (8 total)
        """
        print(f'starting {self.location.split("--")[0]} @ {self.base_link}')  # scrape all 4 room types (default and with superhost filter)
        
        today = str(date.today())
        try:
            last_date_recorded = pd.read_parquet(f'{self.data_dir}{self.location_alias}.parquet').ds.str.split()[-1:].values[0][0]
        except:
            last_date_recorded = None
            
        # check to make sure we haven't already recorded this place today
        if last_date_recorded != today:
            # default search
            self.scrape_search(self.base_link, f'{self.location_alias}', _filter='', printout=printout)
            self.scrape_search(f'{self.base_link}?superhost=true', f'{self.location_alias}_super_hosts', _filter='super_hosts', printout=printout)

            # entire homes only
            self.scrape_search(f'{self.base_link}?room_types[]=Entire home', f'{self.location_alias}_entire_homes', _filter='entire_homes', printout=printout) 
            self.scrape_search(f'{self.base_link}?room_types[]=Entire home&superhost=true', f'{self.location_alias}_entire_home_super_hosts', _filter='entire_home_super_hosts', printout=printout)

            # hotes rooms only
            self.scrape_search(f'{self.base_link}?room_types[]=Hotel room', f'{self.location_alias}_hotel_rooms', _filter='hotel_rooms', printout=printout)
            self.scrape_search(f'{self.base_link}?room_types[]=Hotel room&superhost=true', f'{self.location_alias}_hotel_room_super_hosts', _filter='hotel_room_super_hosts', printout=printout)

            # private rooms only
            self.scrape_search(f'{self.base_link}?room_types[]=Private room', f'{self.location_alias}_private_rooms', _filter='private_rooms', printout=printout)
            self.scrape_search(f'{self.base_link}?room_types[]=Shared room&superhost=true', f'{self.location_alias}_private_room_super_hosts', _filter='private_room_super_hosts', printout=printout)

            # shared rooms only
            self.scrape_search(f'{self.base_link}?room_types[]=Private room', f'{self.location_alias}_shared_rooms', _filter='shared_rooms', printout=printout)
            self.scrape_search(f'{self.base_link}?room_types[]=Shared room&superhost=true', f'{self.location_alias}_shared_room_super_hosts', _filter='shared_room_super_hosts', printout=printout)
        # we already recorded today
        else:
            print(f'{self.location.split("--")[0]} already recorded today')


if __name__=='__main__':
    from where_are_you_going import locations, location_aliases
    
    # start timer
    start = time.time()

    # add each delayed location to a collection for delayed (parallel) scrape
    collection = []
    for _ in range(len(locations)):
        # make airbnb scrape class instance for this location
        l = airbnb_scrape(location=locations[_], location_alias=location_aliases[_])
        
        # make delayed scrape_types() method for this location
        delayed_scrape = dask.delayed(l.scrape_types)(l, printout=False)

        collection.append(delayed_scrape)

    # execute delayed scrapes
    compute(*collection)

    print(f'runtime: {time.time() - start}')
