# Web Scraping with Scrapy: Getting reviews from Booking

## 1. Importing packages

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import sys
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import json
import logging
import pandas as pd
import re
import os

## 2. Item class definition

In [2]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

class HotelreviewsItem(scrapy.Item):
    """Scrapy item holding the fields collected for a single hotel review."""

    # What was reviewed, and when.
    hotel_name = scrapy.Field()
    trip_date = scrapy.Field()
    published_date = scrapy.Field()

    # Content of the review itself.
    rating = scrapy.Field()
    review_title = scrapy.Field()
    review_pos = scrapy.Field()
    review_neg = scrapy.Field()
    review_language = scrapy.Field()

    # Who wrote the review.
    reviewer_name = scrapy.Field()
    reviewer_nationality = scrapy.Field()
    reviewer_nb_comments = scrapy.Field()

## 2. Creating the JSON pipeline

In [3]:
# JSON pipeline; you can rename "booking.jl" to the name of your choice
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item to a JSON Lines file.

    One JSON object is written per line. The destination defaults to
    ``booking.jl`` (relative to the current working directory); override the
    ``output_path`` attribute on a subclass or instance to write elsewhere.
    """

    # Destination file, kept as an attribute so it can be changed without
    # editing open_spider.
    output_path = 'booking.jl'

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        # Explicit encoding so the result does not depend on the platform default.
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file, if it was opened, when the spider finishes."""
        # open_spider may have failed (or never run) before self.file was set.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

## 3. Spider

Now that you know how to get data from one page, we want to automate the spider so that it crawls through all the review pages, ending with a full spider able to scrape every review of the selected park. You will modify the parse function here, since this is where you tell the spider which links to extract and follow. <br>
<b>To Do</b>: Complete the following code to scrape all the reviews of one park.

In [4]:
class MySpider(scrapy.Spider):
    """Spider that scrapes the reviews listed on Booking review pages.

    Every URL in ``start_urls`` is fetched and handed to ``parse``, which
    yields one ``HotelreviewsItem`` per review block found on that page.
    Pagination links are not followed.

    The base class is ``scrapy.Spider`` rather than ``CrawlSpider``: no crawl
    rules are defined here, and the Scrapy documentation warns against
    overriding ``parse`` on a ``CrawlSpider`` because that class relies on it
    for its own rule handling.
    """
    name = 'BasicSpider'
    domain_url = "https://www.booking.com"

    start_urls = [
        "https://www.booking.com/reviews/nl/hotel/center-parcs-de-eemhof.fr.html",
        "https://www.booking.com/reviews/nl/hotel/center-parcs-de-eemhof.en.html"
    ]

    # Custom settings to modify settings usually found in the settings.py file
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'booking_eemhof.json'                        # Used for pipeline 2
    }

    @staticmethod
    def _value_at(values, index):
        """Return ``values[index]``, or None when ``values`` is too short."""
        return values[index] if index < len(values) else None

    @staticmethod
    def _first_match(pattern, text):
        """Return the first match of ``pattern`` in stripped ``text``, else None."""
        if not text:
            return None
        found = re.search(pattern, text.strip())
        return found.group(0) if found else None

    def parse(self, response):
        """Yield one ``HotelreviewsItem`` per review block of ``response``.

        Page-level values (hotel name, review language) are copied onto every
        item. Per-review values are extracted as page-wide lists and matched
        to the review blocks by position; a value that cannot be found is
        stored as None instead of aborting the whole page.
        """
        # One selector per review body; their count drives the loop, so the
        # number of items never depends on parsing the "page_showing" label.
        descriptions = response.xpath('//div[@class="review_item_review_content"]')

        # Scores may use a decimal comma depending on the page language.
        scores = [
            float(raw.strip().replace(',', '.'))
            for raw in response.xpath('//span[@class="review-score-badge"]/text()').extract()
        ]

        reviewer_names = response.xpath('//p[@class="reviewer_name"]/span/text()').extract()

        published_dates = response.xpath('//meta[@itemprop="datePublished"]/@content').extract()

        n_comments_per_reviewers = response.xpath('//div[@class="review_item_user_review_count"]/text()').extract()

        reviewer_nationalities = response.xpath('//span[@itemprop="nationality"]/span/text()').extract()

        hotel_name = response.xpath('//h1[@class="item hotel_name"]/a[@class="standalone_header_hotel_link"]/text()').extract_first()

        review_titles = response.xpath('//div[@class="review_item_header_content\n"]/span/text()').extract()

        review_language = response.xpath('//div[@class="review_sort_container "][1]/select[@id="language"]/option[@selected]/text()').extract_first()

        # Positional matching silently degrades if a list is shorter than the
        # number of review blocks, so make that visible in the log.
        per_review = {
            'rating': scores,
            'review_title': review_titles,
            'reviewer_name': reviewer_names,
            'published_date': published_dates,
            'reviewer_nb_comments': n_comments_per_reviewers,
            'reviewer_nationality': reviewer_nationalities,
        }
        incomplete = sorted(
            field for field, values in per_review.items()
            if len(values) < len(descriptions)
        )
        if incomplete:
            self.logger.warning(
                'Fewer values than reviews on %s for: %s',
                response.url, ', '.join(incomplete)
            )

        for i, content in enumerate(descriptions):

            item = HotelreviewsItem()

            item['rating'] = self._value_at(scores, i)

            item['review_title'] = self._value_at(review_titles, i)

            # Relative XPaths (leading ".") search inside this review block only.
            item["review_neg"] = content.xpath('.//p[@class="review_neg "]//span[@itemprop="reviewBody"]//text()').extract_first()

            item["review_pos"] = content.xpath('.//p[@class="review_pos "]//span[@itemprop="reviewBody"]//text()').extract_first()

            item['reviewer_name'] = self._value_at(reviewer_names, i)

            item['published_date'] = self._value_at(published_dates, i)

            # Keep only the first run of digits found in the review-count text.
            count_digits = self._first_match(r'\d+', self._value_at(n_comments_per_reviewers, i))
            item['reviewer_nb_comments'] = int(count_digits) if count_digits is not None else None

            nationality = self._value_at(reviewer_nationalities, i)
            item['reviewer_nationality'] = nationality.strip() if nationality is not None else None

            item['hotel_name'] = hotel_name

            item['review_language'] = review_language

            # The stay date is read from the current review block (not from a
            # fixed one); only its last two whitespace-separated tokens are kept.
            stay_text = content.xpath('.//p[@class="review_staydate "]/text()').extract_first()
            item['trip_date'] = self._first_match('\\S+\\s+\\S+$', stay_text)

            yield item
        



## 4. Crawling

In [5]:
# Scrapy's feed export appends to an existing local file, so running this cell
# again would leave concatenated (invalid) JSON behind. Remove the output of
# any previous run before crawling.
feed_path = MySpider.custom_settings.get('FEED_URI')
if feed_path and os.path.exists(feed_path):
    os.remove(feed_path)

# Browser-like user agent applied to every request of this crawl.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
})

process.crawl(MySpider)
# Blocks until the crawl has finished.
process.start()

2019-01-24 20:39:05 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2019-01-24 20:39:05 [scrapy.utils.log] INFO: Versions: lxml 4.3.0.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 18.9.0, Python 3.6.8 (v3.6.8:3c6b436a57, Dec 24 2018, 02:04:31) - [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0j  20 Nov 2018), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-01-24 20:39:05 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'booking_eemhof.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


## 5. Importing and reading the scraped data

In [6]:
dfjson = pd.read_json('booking_eemhof.json')

# Remove \r characters, which cause issues in the CSV export.
# Only text-like columns are visited, and only actual strings are rewritten:
# missing values (None / NaN) and any other object are passed through as-is
# instead of raising on a missing ``replace`` method.
for col in dfjson.columns:
    column_is_text = (
        pd.api.types.is_object_dtype(dfjson[col])
        or pd.api.types.is_string_dtype(dfjson[col])
    )
    if column_is_text:
        dfjson[col] = dfjson[col].apply(
            lambda x: x.replace('\r', '') if isinstance(x, str) else x
        )

In [7]:
# Save the cleaned reviews as CSV, leaving the DataFrame index out of the file.
dfjson.to_csv(path_or_buf='booking_eemhof.csv', index=False)