In [None]:
import pandas as pd 
import requests
from requests import get
import time
from random import seed
from random import random
from random import randint
from bs4 import BeautifulSoup
import itertools 
import re

In [None]:
import logging
import os


def configure_logger(log_path='DataCollector.log'):
    '''
    Return this module's logger, writing INFO-and-above records to log_path.

    The function is idempotent: logging.getLogger(__name__) always returns the
    same logger object, so attaching a new FileHandler every time the cell is
    re-run would write each message several times. A handler is therefore only
    added when none for the same file is attached yet.

    log_path: path of the log file (default 'DataCollector.log')
    returns: the configured logging.Logger
    '''
    # Create a custom logger
    log = logging.getLogger(__name__)
    # Set level of logging
    log.setLevel(logging.INFO)

    # FileHandler stores the absolute path in baseFilename, so compare against that
    target = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in log.handlers
    )
    if not already_attached:
        # Create the handler, give it a formatter and add it to the logger
        f_handler = logging.FileHandler(log_path)
        f_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        log.addHandler(f_handler)
    return log


logger = configure_logger()

In [1]:
class DataCollector:
    '''
    DataCollector web scraper class to scrape information from www.zumper.com rental website

    data: the final result representation for the houses data as DataFrame
    houses: list of houses, each element is the raw BeautifulSoup tag of one listing
    url: the base url that is scraped page by page
    '''

    # CSS class names used to locate listing elements in the page html.
    # NOTE(review): the hashed suffixes look build-generated, so they presumably
    # change when the site is redeployed - confirm against the live markup.
    LISTING_CLASS = 'Listables_listItemContainer__2j0Fo'
    PRICE_CLASS = 'ListItemMobileView_price__1IH5H'
    BED_BATH_CLASS = 'ListItemMobileView_bedBathText__3ID4h'
    AMENITIES_CLASS = 'ListItemMobileView_overlayInfo__cDEY4'

    # every amenity that gets its own True/False column in the DataFrame
    AMENITY_COLUMNS = (
        'On Site Laundry', 'Air Conditioning', 'Dishwasher', 'Deck',
        'Hardwood Floor', 'Central Heat', 'Concierge Service', 'Fitness Center',
        'Package Service', 'Residents Lounge', 'Storage', 'Business Center',
        'Door Person', 'Roof Deck', 'Elevator', 'Dry Cleaning Service',
        'Garden', 'In Unit Laundry', 'Assigned Parking', 'Carpet',
        'Furnished', 'LEED Certified', 'Swimming Pool',
        'Onsite Management', 'Balcony', 'Ceiling Fan', 'Controlled Access', 'FirePlace',
        'Garage Parking', 'High Ceilings', 'Outdoor Space', 'Walk In Closet',
    )

    # seconds to wait for a response before giving up on a page
    REQUEST_TIMEOUT = 30
    # the pause between two pages is MIN_DELAY + random() * DELAY_SPREAD seconds
    MIN_DELAY = 1
    DELAY_SPREAD = 4

    def __init__(self):
        '''Constructor to initialize an object'''
        self.data = pd.DataFrame()
        self.houses = []
        self.url = 'https://www.zumper.com/apartments-for-rent/new-york-ny'

    def collect_houses(self, max_pages=600):
        '''
        Collect the raw data of the houses, each house as a BeautifulSoup tag whose
        html is used later to extract the useful fields, then organize the result.

        Pages are requested one after another until max_pages is reached, a page
        contains no listing, or a request fails. Houses collected before a failure
        are kept and still organized.

        max_pages: maximum number of result pages to request (default 600)
        '''
        # start from a clean list so that running the cell again does not
        # append the same pages a second time
        self.houses = []

        for page in range(1, max_pages + 1):
            # the first page has no page parameter
            url = self.url if page == 1 else self.url + '?page=' + str(page)
            logger.info('URL: {}'.format(url))
            print(url)

            try:
                # without a timeout a stalled connection would block forever
                response = get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                logger.exception('Request failed for URL: {}'.format(url))
                print('request failed')
                break

            if not response.ok:
                logger.warning('HTTP status {} from URL: {}'.format(response.status_code, url))
                print('bad status', response.status_code)
                break

            # parse through the html; each house is a div of class LISTING_CLASS
            url_soup = BeautifulSoup(response.text, 'html.parser')
            house_data = url_soup.find_all('div', class_=self.LISTING_CLASS)

            # if you get an empty response, stop the loop
            if not house_data:
                logger.info('Getting empty response from URL: {}'.format(url))
                print('empty')
                break

            self.houses.extend(house_data)

            # random wait time before getting the next response
            # (nothing follows the last page, so no wait there)
            if page < max_pages:
                delay = self.MIN_DELAY + random() * self.DELAY_SPREAD
                print(delay)
                time.sleep(delay)

        logger.info('The total number of houses: {}'.format(len(self.houses)))
        print(len(self.houses))
        # organize the raw data of houses
        logger.info('Scraping has finished, starting organize houses data')
        self.organize_house_data()

    def organize_house_data(self, output_path='dataset.xlsx'):
        '''
        Arrange the raw data of houses as a DataFrame and save it as an excel sheet.
        Each row is an observation >> house
        Each column is a feature for these houses

        self.data is rebuilt from self.houses on every call, so calling this
        method again does not depend on a previous result.

        output_path: path of the excel file to write (default 'dataset.xlsx')
        '''
        self.data = self._build_dataframe()

        dropped = len(self.houses) - len(self.data)
        if dropped:
            logger.warning('{} houses were dropped because of missing values'.format(dropped))

        # save the data as excel sheet, and only then report it as saved
        self.data.to_excel(output_path)
        logger.info('Houses data has been saved as excel sheet')

    def _build_dataframe(self):
        '''Build the DataFrame of all complete houses found in self.houses.'''
        rows = [row for row in map(self._parse_house, self.houses) if row is not None]
        frame = pd.DataFrame(rows, columns=['house_price', 'bedrooms', 'baths', 'am'])

        # one column per possible amenity, True when its name appears in the
        # amenities text of the house
        for col in self.AMENITY_COLUMNS:
            frame[col] = frame['am'].apply(lambda text, name=col: name in text)

        # empty strings count as missing values; rows with any missing value are dropped
        return frame.replace("", float("NaN")).dropna()

    def _parse_house(self, house):
        '''
        Extract (price, bedrooms, baths, amenities) from one listing tag.

        Returns None when the listing lacks the price, the bed and bath texts or
        the amenities element, because no complete row can be built from it.
        '''
        price_tags = house.find_all('div', {'class': self.PRICE_CLASS})
        # the first matching div holds the bedrooms text, the second the baths text
        bed_bath_tags = house.find_all('div', {'class': self.BED_BATH_CLASS})
        amenity_tags = house.find_all('div', {'class': self.AMENITIES_CLASS})
        if not price_tags or len(bed_bath_tags) < 2 or not amenity_tags:
            return None

        # format house price: literal replacement, because "+" and "$" are
        # regex metacharacters and must not be interpreted as patterns
        price = price_tags[0].text
        for symbol in '+$,':
            price = price.replace(symbol, '')

        return (
            price,
            self._largest_int(bed_bath_tags[0].text),
            self._largest_int(bed_bath_tags[1].text),
            amenity_tags[0].text,
        )

    @staticmethod
    def _largest_int(text):
        '''Return the largest integer written in text, or 0 when it has no digits.'''
        return max((int(number) for number in re.findall(r'\d+', text)), default=0)
        


In [None]:
# Run the data collector: scrape the listing pages, then build the DataFrame
# and write dataset.xlsx (collect_houses calls organize_house_data when done).
dc = DataCollector()
dc.collect_houses()


In [None]:
# Display the organized houses DataFrame produced by the run above.
dc.data

In [None]:
#response = get('https://www.zumper.com/apartments-for-rent/new-york-ny?page=2')
           # parse through the html 
#url_soup = BeautifulSoup(response.text, 'html.parser')
#url_soup.findAll("div")
#url_soup.find_all('div', class_='ListItemMobileView_address__B8pfK')#[0].text
#url_soup.find_all('div', class_='ListItemMobileView_price__1IH5H')
#url_soup.select('div div div div div div div div div div')