In [None]:
# Importing necessary packages
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from multiprocessing import Pool

In [None]:
# Landing (search results) page: for every scraped key, the CSS class to
# locate and the element attribute whose value is collected.
landing_page_config = {'url': {'class': '_mm360j', 'attribute': 'href'}}

# Individual listing page: for every scraped key, the CSS class whose
# matching elements' text is collected. Built from (key, class) pairs so each
# field reads as a single line; insertion order is the order listed here.
internal_page_config = {
    field: {'class': css_class}
    for field, css_class in (
        ('listing_name', '_fecoyn4'),
        ('listing_type', '_tqmy57'),
        ('star_rating', '_1ne5r4rt'),
        ('price', '_1jo4hgw'),
        ('review', '_162hp8xh'),
        ('amenities', '_19xnuo97'),
        ('num_reviews', '_1qf7wt4w'),
        ('location_name', '_pbq7fmm'),
        ('owner_info', 'tehcqxo.dir.dir-ltr'),
        ('owner_details', '_88xxct'),
        ('house_timings', 'c1lue5su.dir.dir-ltr'),
        ('listing_highlights', '_1vjikx5'),
        ('response_times', 'fhhmddr.dir.dir-ltr'),
    )
}

# Pieces of the search URL, concatenated as:
#   base_link + <city> + 'homes' + extension + <offset>
base_link = 'https://www.airbnb.com/s/'
extension = '?items_offset='

# City path segments to scrape; each one ends with '/' because it is joined
# directly in front of 'homes'.
city_names = ['New-York--NY--United-States/']

In [None]:
# Scraping the website
class AirbnbParser:
    """
    Scrapes Airbnb search (landing) pages for listing URLs and then scrapes
    every collected listing page.

    Which elements are read is driven by the module-level
    ``landing_page_config`` and ``internal_page_config`` dicts; the search
    URLs are built from ``base_link``, ``city_names`` and ``extension``.
    """

    def __init__(self):
        # Maps each landing_page_config key (e.g. 'url') to the list of
        # attribute values collected so far across all landing pages.
        self.final_data = {}

    def get_driver(self):
        """
        Returns a new instance of selenium webdriver

        The caller owns the returned driver and must call ``quit()`` on it.
        """
        # NOTE(review): recent Selenium 4 releases expect the chromedriver
        # path to be supplied through a Service object instead of a
        # positional argument -- confirm against the installed version.
        return webdriver.Chrome('/usr/local/bin/chromedriver')

    def get_next_link(self, curr_offset, city, offset=20):
        """
        Generates next link from the current parsed link

        The result is ``base_link + city + 'homes' + extension`` followed by
        ``curr_offset + offset``.
        """
        return ''.join([base_link, city, 'homes', extension]) + str(curr_offset + offset)

    def get_url_list_from_landing_page(self, start_index, city):
        """
        Scrapes the landing pages to get list of listing URLs

        Loads the search page for ``city`` at item offset ``start_index`` and
        appends, for every key in ``landing_page_config``, the configured
        attribute of each matching element to ``self.final_data[key]``.

        Returns False as soon as a configured class matches no element
        (the caller uses this as the signal to stop paging), True otherwise.
        """
        driver = self.get_driver()
        try:
            driver.get(self.get_next_link(0, city, start_index))
            # Fixed wait so the page can finish rendering before it is queried
            time.sleep(7)
            for key, value in landing_page_config.items():
                elements = driver.find_elements(By.CLASS_NAME, value['class'])
                attribute = value['attribute']
                # setdefault creates the (possibly empty) list on first use,
                # so the key exists even when nothing matched.
                self.final_data.setdefault(key, []).extend(
                    element.get_attribute(attribute) for element in elements
                )
                if not elements:
                    return False
            return True
        finally:
            # Always shut the browser down; otherwise one Chrome process
            # leaks per scraped landing page.
            driver.quit()

    def get_data_from_listing_page(self, link):
        """
        Scrapes each internal page to get data of each listing

        Returns a dict mapping every key of ``internal_page_config`` to the
        list of text values of the matching elements, plus ``'url'`` mapped
        to ``link``.
        """
        driver = self.get_driver()
        try:
            driver.get(link)
            # Fixed wait so the page can finish rendering before it is queried
            time.sleep(9)
            result = {
                key: [
                    element.text
                    for element in driver.find_elements(By.CLASS_NAME, value['class'])
                ]
                for key, value in internal_page_config.items()
            }
            result['url'] = link
            return result
        finally:
            # Always shut the browser down; otherwise one Chrome process
            # leaks per scraped listing.
            driver.quit()

    def scrape_pages(self):
        """
        Collects listing URLs for every city, then scrapes each listing.

        For each entry of ``city_names`` the landing pages are walked in
        steps of 20 items until a page yields no URLs or the offset reaches
        8000. The collected URLs are then scraped by a pool of 2 worker
        processes.

        Returns a list with one result dict per collected URL, or an empty
        list when no URL was collected.
        """
        for city in city_names:
            index = 0
            continue_scraping = True
            while continue_scraping and index < 8000:
                continue_scraping = self.get_url_list_from_landing_page(index, city)
                index += 20

        # .get() covers the case where no landing page was ever scraped
        # (empty city_names), in which case the 'url' key does not exist.
        urls = self.final_data.get('url', [])
        if not urls:
            return []

        # The context manager tears the pool down on exit, so no explicit
        # close()/join() is needed afterwards.
        with Pool(2) as pool:
            return pool.map(self.get_data_from_listing_page, urls)

    def start_scraping(self):
        """
        Entry point: runs the full scrape and returns its result.
        """
        return self.scrape_pages()
            
# Run the scrape only when this file is the program entry point. Worker
# processes created by multiprocessing may re-import this module, and must
# not start another scrape when they do.
if __name__ == '__main__':
    scraper = AirbnbParser()
    final_data = scraper.start_scraping()
    # final_data is a list of per-listing dicts, so this yields one row per
    # listing.
    listings_df = pd.DataFrame(final_data)