In [None]:
# Importing necessary packages
import re
import string
import time
from multiprocessing import Pool

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Config for scraping information from the website landing page
landing_page_config = {
    'url': {
        'class': '_mm360j',
        'attribute': 'href'
    }
}

# Config for scraping information from each listing page
internal_page_config = {
    'listing_name': {
        'class': '_fecoyn4'
    },
    'listing_type': {
        'class': '_tqmy57'
    },
    'star_rating': {
        'class': '_1ne5r4rt'
    },
    'price': {
        'class': '_1jo4hgw'
    },
    'review': {
        'class': '_162hp8xh'
    },
    'amenities': {
        'class': '_19xnuo97'
    },
    'num_reviews': {
        'class': '_1qf7wt4w'
    },
    'location_name': {
        'class': '_pbq7fmm'
    },
    'owner_info': {
        'class': 'tehcqxo.dir.dir-ltr'
    },
    'owner_details': {
        'class': '_88xxct'
    },
    'house_timings': {
        'class': 'c1lue5su.dir.dir-ltr'
    },
    'listing_highlights': {
        'class': '_1vjikx5'
    },
    'response_times': {
        'class': 'fhhmddr.dir.dir-ltr'
    }
}

# Link config
base_link = 'https://www.airbnb.com/s/'

city_names = ['Palo-Alto--CA--United-States/']
# , 'New-York--NY--United-States/', 'Washington--D.C.--USA/', 'College-Park--MD--United-States/', 'Palo-Alto--CA--United-States/', 'Dallas--TX--United-States/'

extension = '?items_offset='

In [None]:
# Scraping the website
class AirbnbParser:
    def __init__(self):
        self.curr_city = None
        self.final_data = None
        pass

    def get_driver(self):
        """
        Returns a new instance of selenium webdriver
        """
        return webdriver.Chrome('/usr/local/bin/chromedriver')

    def get_next_link(self, curr_offset, city, offset=20):
        """
        Generates next link from the current parsed link
        """
        return ''.join([base_link, city, 'homes', extension]) + str(curr_offset + offset)

    def get_url_list_from_landing_page(self, start_index, city):
        """
        Scrapes the landing pages to get list of listing URLs
        """
        driver = self.get_driver()
        link = self.get_next_link(0, city, start_index)
        driver.get(link)
        for key, value in landing_page_config.items():
            class_name = value['class']
            try:
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, class_name)))
            except:
                pass
            url_list = driver.find_elements(By.CLASS_NAME, class_name)
            attribute = value['attribute']
            if key in self.final_data.keys():
                self.final_data[key] += [data_val.get_attribute(attribute) for data_val in url_list]
            else:
                self.final_data[key] = [data_val.get_attribute(attribute) for data_val in url_list]
            if not url_list:
                return False
        return True
    
    def get_data_from_listing_page(self, link):
        """
        Scrapes each internal page to get data of each listing
        """
        driver = self.get_driver()
        driver.get(link)
        result = {}
        for _, value in internal_page_config.items():
            try:
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, value['class'])))
            except:
                pass
        for key, value in internal_page_config.items():
            scraped_data = driver.find_elements(By.CLASS_NAME, value['class'])
            result[key] = [data_val.text for data_val in scraped_data]
        result['url'] = link
        result['city'] = self.curr_city
        return result
    
    def scrape_pages(self, curr_city):
        index = 0
        continue_scraping = self.get_url_list_from_landing_page(index, curr_city)
        index+=20
        while continue_scraping and index<281:
            continue_scraping = self.get_url_list_from_landing_page(index, curr_city)
            index+=20
        with Pool(5) as pool:
            result = pool.map(self.get_data_from_listing_page, self.final_data['url'])
        pool.close()
        pool.join()
        return result
            
    def start_scraping(self):
        final_data = []
        for city in city_names:
            self.final_data = {}
            self.curr_city = city
            final_data += self.scrape_pages(city)
        return final_data
            
# Scrape every configured city and write the collected rows to disk as CSV.
scraper = AirbnbParser()
final_data = scraper.start_scraping()
listings_df = pd.DataFrame(data=final_data)
listings_df.to_csv(path_or_buf='California.csv')

## Processing

In [None]:
# load NY data into df
# `error_bad_lines=False` was removed in pandas 2.0; `on_bad_lines='warn'`
# (pandas >= 1.3) keeps the same behaviour: malformed rows are skipped with a warning.
df_listings = pd.read_csv('NewYork.csv', sep=',', on_bad_lines='warn', index_col=0)

In [None]:
# generate Id column from the numeric id that follows "/rooms/" in the listing URL
# (raw string avoids invalid-escape warnings; expand=False yields a Series)
df_listings['listing_id'] = df_listings['url'].str.extract(r'/rooms/(\d+)', expand=False)

In [None]:
# re-arrange columns: put 'listing_id' first.
# Selecting the column by name (instead of rotating whichever column is last)
# keeps this cell idempotent when it is re-run.
column_names = ['listing_id'] + [col for col in df_listings.columns if col != 'listing_id']
df_listings = df_listings.reindex(columns=column_names)

In [None]:
# create df for amenities: keep only the id and the raw amenities text
df_amenities = df_listings.loc[:, ['listing_id', 'amenities']]
# Display only -- the re-indexed frame is the cell output and is not stored
df_amenities.set_index(keys='listing_id')

In [None]:
# process amenities further

# df_amenities.unstack()['amenities'].str.strip('[]').str.split(', ', expand=True).value_counts()
# res = df_amenities.set_index(['listing_id'])['amenities'].apply(pd.Series).stack()
# res = res.reset_index()
# res.columns = ['listing_id','level_1','amenities']
# res.drop(columns='level_1', inplace = True)
# res
# df_amenities['amenities'][0]

In [None]:
# Clean price data: take the first dollar amount in the text.
# Thousands separators are accepted and stripped ("$1,250" -> 1250.0); the
# pattern requires at least one digit so a bare "$" yields NaN instead of
# an empty string that would break the float conversion.
df_listings['price'] = (
    df_listings['price']
    .str.extract(r'\$(\d[\d,]*\.?\d*|\.\d+)', expand=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
# check records having bad data for price

# def is_integer_num(n):
#     if isinstance(n, int):
#         return True
#     if isinstance(n, float):
#         return not n.is_integer()
#     return False
# df_listings[df_listings['price'].apply(is_integer_num)]

In [None]:
# clean star_rating: extract the first decimal number (e.g. "4.85").
# At least one digit is required, so a lone "." in the text can no longer be
# captured and crash the float conversion; rows without a match become NaN.
df_listings['star_rating'] = (
    df_listings['star_rating']
    .str.extract(r'(\d+\.\d*|\.\d+)', expand=False)
    .astype(float)
)
# TODO: fill na with mean

In [None]:
# number of reviews: the number directly preceding the word "review(s)".
# Thousands separators are accepted and stripped so "1,234 reviews" is read
# as 1234 rather than 234; rows without a match become NaN.
df_listings['num_reviews'] = (
    df_listings['num_reviews']
    .str.extract(r'(\d[\d,]*)\s+review', expand=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
# len(df_listings[df_listings['num_reviews'].isnull()])

In [None]:
def clean_unicode(listing_name):
    """
    Drops every non-ASCII character from ``listing_name`` and collapses all
    runs of whitespace (including leading/trailing) into single spaces.
    """
    ascii_only = listing_name.encode("ascii", "ignore").decode()
    return " ".join(ascii_only.split())

In [None]:
# Characters stripped from listing names as the last cleaning step
punct = set(string.punctuation)

def clean_tags(listing_name):
    """
    Removes mentions, dollar signs, URLs, hash signs and finally all ASCII
    punctuation from ``listing_name``.

    Whitespace is left untouched, so removed tokens may leave double spaces.
    """
    # removing mentions e.g @CityBoy12
    listing_name = re.sub(r"@\S+", "", listing_name)
    # remove $ signs
    listing_name = re.sub(r"\$", "", listing_name)
    # remove urls -- only the URL token itself; the previous ".*" pattern
    # also deleted every word that followed the URL on the same line
    listing_name = re.sub(r"https?://\S+", "", listing_name)
    # remove hashtags
    listing_name = re.sub(r"#", "", listing_name)
    # remove punctuation
    return "".join(ch for ch in listing_name if ch not in punct)

In [None]:
# clean listing names: strip non-ASCII characters first, then tags and punctuation
df_listings['listing_name'] = (
    df_listings['listing_name']
    .apply(clean_unicode)
    .apply(clean_tags)
)

In [None]:
# clean listing type: split the raw text into named fields.
# The text is matched from its leading "['"; "\\n" in the pattern matches a
# literal backslash followed by "n", and each bare "." matches the single
# separator character between the counts.
# "bedroom[s]?" accepts the plural form -- without it every row with more than
# one bedroom failed to match and came back as all-NaN.
# The bath count keeps a decimal part when present (e.g. "1.5").
listing_type_pattern = (
    r"\['(?P<type>[\w*\s*]*)hosted by\s*(?P<host>[\w*\s*]*)"
    r"\\n(?P<no_of_guests>\d*)\s*guest[s]?"
    r"\s*.\s*(?P<no_of_bedrooms>\d*)\s*bedroom[s]?"
    r"\s*.\s*(?P<no_of_beds>\d*)\s*bed[s]?"
    r"\s*.\s*(?P<no_of_bath>\d+\.\d+|\d*)"
)
# One column per named group, indexed like df_listings; rows that do not
# match the pattern are NaN in every column.
df_listing_type = df_listings['listing_type'].str.extract(listing_type_pattern, expand=True)

In [None]:
# Preview the first rows of the extracted listing-type fields
df_listing_type.head()