In [1]:
# Importing necessary packages
import os
import time
from multiprocessing import Pool

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Local system config
# Path to the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override it per machine (e.g. '/usr/local/bin/chromedriver')
# instead of editing this cell; the default keeps the previous value.
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', 'C:/webdrivers/chromedriver')

# Config for scraping information from the website landing page.
# Each entry maps an output field to the element class to look up and the
# element attribute whose value is collected.
landing_page_config = {
    'url': {
        'class': '_mm360j',
        'attribute': 'href'
    }
}

# Config for scraping information from each listing page.
# Each entry maps an output field to the element class whose text is collected.
# Dotted values (e.g. 'tehcqxo.dir.dir-ltr') name several classes at once.
internal_page_config = {
    'listing_name': {
        'class': '_fecoyn4'
    },
    'listing_type': {
        'class': '_tqmy57'
    },
    'star_rating': {
        'class': '_1ne5r4rt'
    },
    'price': {
        'class': '_1jo4hgw'
    },
    'review': {
        'class': '_162hp8xh'
    },
    'amenities': {
        'class': '_19xnuo97'
    },
    'num_reviews': {
        'class': '_1qf7wt4w'
    },
    'location_name': {
        'class': '_pbq7fmm'
    },
    'owner_info': {
        'class': 'tehcqxo.dir.dir-ltr'
    },
    'owner_details': {
        'class': '_88xxct'
    },
    'house_timings': {
        'class': 'c1lue5su.dir.dir-ltr'
    },
    'listing_highlights': {
        'class': '_1vjikx5'
    },
    'response_times': {
        'class': 'fhhmddr.dir.dir-ltr'
    }
}

# Link config: a search URL is base_link + city + 'homes' + extension + offset
base_link = 'https://www.airbnb.com/s/'

# Each city slug keeps its trailing '/' because it is concatenated as-is
city_names = ['New-York--NY--United-States/']

extension = '?items_offset='

In [3]:
# Scraping the website
class AirbnbParser:
    """
    Selenium-based scraper: collects listing URLs from the search (landing)
    pages of every configured city, then scrapes each listing page found.

    Reads the notebook-level config: chromedriver_path, landing_page_config,
    internal_page_config, base_link, city_names and extension.
    """

    # Offset step between two consecutive landing pages
    PAGE_SIZE = 20

    def __init__(self, page_load_wait=4, max_offset=20):
        """
        page_load_wait: seconds to sleep after every page load before the
            page elements are queried
        max_offset: after the first landing page of a city (always fetched),
            further pages are fetched only while the items offset stays
            below this value; the default of 20 fetches one page per city
        """
        # Maps each landing_page_config key to the list of values collected so far
        self.final_data = {}
        self.page_load_wait = page_load_wait
        self.max_offset = max_offset

    def get_driver(self):
        """
        Returns a new instance of selenium webdriver
        """
        # Imported locally so this class needs no extra notebook-level import
        from selenium.webdriver.chrome.service import Service

        # Selenium 4 deprecates passing the driver path positionally;
        # the path is wrapped in a Service object instead.
        return webdriver.Chrome(service=Service(chromedriver_path))

    def get_next_link(self, curr_offset, city, offset=20):
        """
        Generates next link from the current parsed link

        The result is base_link + city + 'homes' + extension followed by
        curr_offset + offset.
        """
        return ''.join([base_link, city, 'homes', extension]) + str(curr_offset + offset)

    def get_url_list_from_landing_page(self, driver, start_index, city):
        """
        Scrapes the landing pages to get list of listing URLs

        Loads the landing page of `city` at items offset `start_index` and
        appends the configured attribute of every matching element to
        self.final_data. Returns False as soon as one configured class
        matches no element on the page, True otherwise.
        """
        link = self.get_next_link(0, city, start_index)
        driver.get(link)
        time.sleep(self.page_load_wait)
        for key, value in landing_page_config.items():
            elements = driver.find_elements(By.CLASS_NAME, value['class'])
            attribute = value['attribute']
            scraped = (element.get_attribute(attribute) for element in elements)
            # get_attribute gives None when the attribute is missing; such
            # values cannot be visited later, so they are dropped here.
            self.final_data.setdefault(key, []).extend(
                scraped_val for scraped_val in scraped if scraped_val is not None
            )
            if not elements:
                return False
        return True

    def get_data_from_listing_page(self, driver, link):
        """
        Scrapes each internal page to get data of each listing

        Returns a dict with one list of element texts per
        internal_page_config key, plus the visited link under 'url'.
        """
        driver.get(link)
        time.sleep(self.page_load_wait)
        result = {}
        for key, value in internal_page_config.items():
            scraped_data = driver.find_elements(By.CLASS_NAME, value['class'])
            result[key] = [data_val.text for data_val in scraped_data]
        result['url'] = link
        return result

    def scrape_pages(self):
        """
        Collects the listing URLs of every city in city_names, scrapes each
        of them and returns the list of per-listing result dicts.

        The webdriver is always quit before returning, also on errors.
        """
        driver = self.get_driver()
        try:
            for city in city_names:
                index = 0
                while True:
                    continue_scraping = self.get_url_list_from_landing_page(driver, index, city)
                    index += self.PAGE_SIZE
                    if not continue_scraping or index >= self.max_offset:
                        break
            # .get() keeps an empty city list from raising a KeyError
            listing_urls = self.final_data.get('url', [])
            return [self.get_data_from_listing_page(driver, url) for url in listing_urls]
        finally:
            # Release the browser process even when scraping fails midway
            driver.quit()

    def start_scraping(self):
        """
        Entry point: runs the whole scrape and returns its results
        """
        return self.scrape_pages()
            
# Run the full scrape, then tabulate the results (one dict per scraped listing)
scraper = AirbnbParser()
final_data = scraper.start_scraping()
listings_df = pd.DataFrame(data=final_data)

  return webdriver.Chrome(chromedriver_path)


In [13]:
# Persist the scraped listings as pipe-delimited UTF-8 text
listings_df.to_csv(path_or_buf='listings.csv', sep='|', encoding='utf-8')

In [17]:
# Reload the saved listings; the first CSV column is the saved index.
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0; malformed rows are still dropped.
listings_df_csv = pd.read_csv('listings.csv', sep='|', on_bad_lines='skip', index_col=0)

In [15]:
# Preview the first five scraped listings
listings_df.head(n=5)

Unnamed: 0,listing_name,listing_type,star_rating,price,review,amenities,num_reviews,location_name,owner_info,owner_details,house_timings,listing_highlights,response_times,url
0,[Met Gallery stunner w/ maid & laundry],[Entire rental unit hosted by Joshua\n2 guests...,[5.0],"[, $29\n / night]",[Cheryl\nNovember 2021\nJoshua and his family ...,"[Kitchen, Fast wifi – 686 Mbps, 50"" HDTV with ...","[(5 reviews), New York, United States]","[New York, United States]",[Hosted by Joshua\nJoined in December 2014],[Met Gallery stunner w/ maid & laundry\n5.0\n(...,[House rules\nCheck-in: After 3:00 PM\nCheckou...,[Entire home\nYou’ll have the apartment to you...,[Response rate: 100%\nResponse time: within an...,https://www.airbnb.com/rooms/48369592?previous...
1,[DOUBLE SHARED BATHROOM],[Room in boutique hotel hosted by Reservation ...,[5.0],"[, $55\n / night]",[Mike\nOctober 2021\nYou can get a mini fridge...,"[Wifi, Free street parking, TV, Elevator, Air ...","[(3 reviews), New York, United States]","[New York, United States]",[Hosted by Reservation Desk\nJoined in Decembe...,[DOUBLE SHARED BATHROOM\n5.0\n(3 reviews)\n·\n...,[House rules\nCheck-in: After 3:00 PM\nCheckou...,[Enhanced Clean\nThis Host committed to Airbnb...,[Response rate: 95%\nResponse time: within an ...,https://www.airbnb.com/rooms/25944708?previous...
2,[SINGLE SHARED BATHROOM],[Room in boutique hotel hosted by Reservation ...,[4.33],"[, $55\n / night]",[Cullen\nOctober 2020\nFun location. Room is n...,"[Wifi, Free street parking, TV, Elevator, Air ...","[(3 reviews), New York, United States]","[New York, United States]",[Hosted by Reservation Desk\nJoined in Decembe...,[SINGLE SHARED BATHROOM\n4.33\n(3 reviews)\n·\...,[House rules\nCheck-in: After 3:00 PM\nCheckou...,[Enhanced Clean\nThis Host committed to Airbnb...,[Response rate: 95%\nResponse time: within an ...,https://www.airbnb.com/rooms/23061804?previous...
3,[Charming Astoria studio on quiet tree-lined s...,[Entire rental unit hosted by Anastasia\n2 gue...,[5.0],"[, $50\n / night]",[Molly\nOctober 2021\nEverything about this pl...,"[Kitchen, Wifi, Free street parking, Pets allo...","[(4 reviews), Queens, New York, United States]","[Queens, New York, United States]",[Hosted by Anastasia\nJoined in January 2014],[Charming Astoria studio on quiet tree-lined s...,[House rules\nCheck-in: After 3:00 PM\nNo smok...,[Entire home\nYou’ll have the apartment to you...,[Language: English\nResponse rate: 100%\nRespo...,https://www.airbnb.com/rooms/48253767?previous...
4,[Midtown West Hotel ( Economy Single)],[Room in hotel hosted by Joe\n1 guest · 1 bedr...,[4.89],"[, $75\n / night]",[Shawn\nOctober 2021\nDo yourself a favor and ...,"[Wifi, TV, Air conditioning, Hair dryer, Refri...","[(9 reviews), New York, United States]","[New York, United States]",[Hosted by Joe\nJoined in August 2021],[Midtown West Hotel ( Economy Single)\n4.89\n(...,[House rules\nCheck-in: 3:00 PM - 9:00 PM\nChe...,[Enhanced Clean\nThis Host committed to Airbnb...,[Response rate: 100%\nResponse time: within an...,https://www.airbnb.com/rooms/52554296?previous...


In [18]:
# Preview the first five listings as reloaded from the CSV file
listings_df_csv.head(n=5)

Unnamed: 0,listing_name,listing_type,star_rating,price,review,amenities,num_reviews,location_name,owner_info,owner_details,house_timings,listing_highlights,response_times,url
0,['Met Gallery stunner w/ maid & laundry'],['Entire rental unit hosted by Joshua\n2 guest...,['5.0'],"['', '$29\n / night']",['Cheryl\nNovember 2021\nJoshua and his family...,"['Kitchen', 'Fast wifi – 686 Mbps', '50"" HDTV ...","['(5 reviews)', 'New York, United States']","['New York, United States']",['Hosted by Joshua\nJoined in December 2014'],['Met Gallery stunner w/ maid & laundry\n5.0\n...,['House rules\nCheck-in: After 3:00 PM\nChecko...,['Entire home\nYou’ll have the apartment to yo...,['Response rate: 100%\nResponse time: within a...,https://www.airbnb.com/rooms/48369592?previous...
1,['DOUBLE SHARED BATHROOM'],['Room in boutique hotel hosted by Reservation...,['5.0'],"['', '$55\n / night']","[""Mike\nOctober 2021\nYou can get a mini fridg...","['Wifi', 'Free street parking', 'TV', 'Elevato...","['(3 reviews)', 'New York, United States']","['New York, United States']",['Hosted by Reservation Desk\nJoined in Decemb...,"[""DOUBLE SHARED BATHROOM\n5.0\n(3 reviews)\n·\...",['House rules\nCheck-in: After 3:00 PM\nChecko...,"[""Enhanced Clean\nThis Host committed to Airbn...",['Response rate: 95%\nResponse time: within an...,https://www.airbnb.com/rooms/25944708?previous...
2,['SINGLE SHARED BATHROOM'],['Room in boutique hotel hosted by Reservation...,['4.33'],"['', '$55\n / night']",['Cullen\nOctober 2020\nFun location. Room is ...,"['Wifi', 'Free street parking', 'TV', 'Elevato...","['(3 reviews)', 'New York, United States']","['New York, United States']",['Hosted by Reservation Desk\nJoined in Decemb...,"[""SINGLE SHARED BATHROOM\n4.33\n(3 reviews)\n·...",['House rules\nCheck-in: After 3:00 PM\nChecko...,"[""Enhanced Clean\nThis Host committed to Airbn...",['Response rate: 95%\nResponse time: within an...,https://www.airbnb.com/rooms/23061804?previous...
3,['Charming Astoria studio on quiet tree-lined ...,['Entire rental unit hosted by Anastasia\n2 gu...,['5.0'],"['', '$50\n / night']",['Molly\nOctober 2021\nEverything about this p...,"['Kitchen', 'Wifi', 'Free street parking', 'Pe...","['(4 reviews)', 'Queens, New York, United Stat...","['Queens, New York, United States']",['Hosted by Anastasia\nJoined in January 2014'],"[""Charming Astoria studio on quiet tree-lined ...",['House rules\nCheck-in: After 3:00 PM\nNo smo...,['Entire home\nYou’ll have the apartment to yo...,['Language: English\nResponse rate: 100%\nResp...,https://www.airbnb.com/rooms/48253767?previous...
4,['Midtown West Hotel ( Economy Single)'],['Room in hotel hosted by Joe\n1 guest · 1 bed...,['4.89'],"['', '$75\n / night']",['Shawn\nOctober 2021\nDo yourself a favor and...,"['Wifi', 'TV', 'Air conditioning', 'Hair dryer...","['(9 reviews)', 'New York, United States']","['New York, United States']",['Hosted by Joe\nJoined in August 2021'],"[""Midtown West Hotel ( Economy Single)\n4.89\n...",['House rules\nCheck-in: 3:00 PM - 9:00 PM\nCh...,"[""Enhanced Clean\nThis Host committed to Airbn...",['Response rate: 100%\nResponse time: within a...,https://www.airbnb.com/rooms/52554296?previous...
