In [2]:
!pip install parsel

Collecting parsel
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=1.2.0 (from parsel)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath (from parsel)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting w3lib>=1.19.0 (from parsel)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Downloading parsel-1.9.1-py2.py3-none-any.whl (17 kB)
Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Downloading w3lib-2.2.1-py3-none-any.whl (21 kB)
Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: w3lib, jmespath, cssselect, parsel
Successfully installed cssselect-1.2.0 jmespath-1.0.1 parsel-1.9.1 w3lib-2.2.1


In [7]:
import asyncio
import json
import math
import csv
from typing import List, Dict, Optional
from httpx import AsyncClient, Response
from parsel import Selector

# Request headers sent with every call; the User-Agent identifies the client as
# desktop Chrome on Windows.
_BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.tripadvisor.com/",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Single shared asynchronous HTTP client; redirects are followed automatically.
client = AsyncClient(follow_redirects=True, headers=_BROWSER_HEADERS)

def _parse_bubble_rating(class_attr: Optional[str]) -> Optional[float]:
    """Turn a rating CSS class such as ``ui_bubble_rating bubble_50`` into a score.

    The numeric suffix of the ``bubble_NN`` token is presumably the score
    multiplied by ten (50 -> 5). Whole scores are returned as ``int``,
    fractional ones as ``float``. ``None`` is returned when the attribute is
    missing or contains no ``bubble_NN`` token.
    """
    if not class_attr:
        return None
    for token in class_attr.split():
        prefix, _, digits = token.rpartition("_")
        if prefix.endswith("bubble") and digits.isdecimal():
            whole, remainder = divmod(int(digits), 10)
            return whole if remainder == 0 else int(digits) / 10
    return None


def parse_hotel_page(result: Response) -> Dict:
    """Extract hotel details and reviews from a hotel page response.

    Args:
        result: HTTP response whose ``text`` is the HTML of a hotel page.

    Returns:
        A dict with four keys:
        ``basic_data`` - the parsed JSON from the script tag that mentions
        ``aggregateRating`` (an empty dict when no such tag is present);
        ``description`` - the description text, or ``None`` if not found;
        ``features`` - a list of amenity strings;
        ``reviews`` - a list of dicts with ``title``, ``text``, ``rate`` and
        ``tripDate``.
    """
    selector = Selector(result.text)

    # A page without the JSON script (e.g. a later review page or a block
    # page) must not abort parsing, so fall back to an empty mapping.
    raw_json = selector.xpath("//script[contains(text(),'aggregateRating')]/text()").get()
    basic_data = json.loads(raw_json) if raw_json else {}

    description = selector.css("div.fIrGe._T::text").get()
    amenities = selector.xpath("//div[contains(@data-test-target, 'amenity')]/text()").getall()

    reviews = []
    for review in selector.xpath("//div[@data-reviewid]"):
        title = review.xpath(".//div[@data-test-target='review-title']/a/span/span/text()").get()
        # The review body can be split across several spans; join them.
        text = "".join(review.xpath(".//span[contains(@data-automation, 'reviewText')]/span/text()").getall())
        rating_class = review.xpath(".//div[@data-test-target='review-rating']/span/@class").get()
        trip_date = review.xpath(".//span[span[contains(text(),'Date of stay')]]/text()").get()
        reviews.append({
            "title": title,
            "text": text,
            "rate": _parse_bubble_rating(rating_class),
            "tripDate": trip_date,
        })

    return {
        "basic_data": basic_data,
        "description": description,
        "features": amenities,
        "reviews": reviews,
    }

async def fetch_hotel_page(url: str, delay: float = 3) -> Response:
    """Fetch a page with the shared ``client`` after a pause.

    Args:
        url: Address to request.
        delay: Seconds to sleep before sending the request (default 3).

    Returns:
        The response returned by ``client.get``.

    Note:
        The pause is per call. Calls that are started concurrently sleep in
        parallel, so the delay does not space such requests apart.
    """
    await asyncio.sleep(delay)  # pause before the request
    return await client.get(url)

async def scrape_hotel(url: str, max_review_pages: Optional[int] = None) -> Dict:
    """Scrape a hotel's details and its paginated reviews.

    Args:
        url: Hotel page URL containing the ``-Reviews-`` segment that is
            rewritten to address further review pages.
        max_review_pages: Upper bound on the number of review pages to scrape,
            counting the first page. ``None`` (or 0) means no limit.

    Returns:
        The dict produced by ``parse_hotel_page`` for the first page, with the
        reviews of the following pages appended in page order. An empty dict is
        returned when the first request does not answer with status 200.
    """
    first_page = await fetch_hotel_page(url)

    if first_page.status_code != 200:
        print(f"Request failed with status code {first_page.status_code}")
        return {}

    hotel_data = parse_hotel_page(first_page)
    first_page_reviews = len(hotel_data["reviews"])

    # Use the number of reviews actually present on the first page as the
    # pagination step. A fixed step of 20 skipped reviews: the recorded run
    # collected only 100 reviews from 10 pages, i.e. 10 per page.
    fallback_page_size = 20
    review_page_size = first_page_reviews or fallback_page_size

    # Work out the total number of review pages.
    try:
        total_reviews = int(hotel_data["basic_data"]["aggregateRating"]["reviewCount"])
    except (KeyError, TypeError, ValueError):
        # Without a usable review count only the first page can be trusted.
        print("Review count not found; keeping the first page of reviews only")
        total_reviews = first_page_reviews
    total_review_pages = math.ceil(total_reviews / review_page_size)

    # Cap the number of pages when the caller asked for fewer.
    if max_review_pages and max_review_pages < total_review_pages:
        total_review_pages = max_review_pages

    # Page 1 is already scraped; the remaining pages are addressed by offset.
    review_urls = [
        url.replace("-Reviews-", f"-Reviews-or{review_page_size * page_index}-", 1)
        for page_index in range(1, total_review_pages)
    ]

    # gather() runs the fetches concurrently and returns the responses in the
    # same order as review_urls, so the collected reviews stay in page order.
    responses = await asyncio.gather(
        *(fetch_hotel_page(review_url) for review_url in review_urls)
    )

    for review_url, response in zip(review_urls, responses):
        if response.status_code != 200:
            # Skip failed pages instead of trying to parse an error body.
            print(f"Skipping {review_url}: status code {response.status_code}")
            continue
        hotel_data["reviews"].extend(parse_hotel_page(response)["reviews"])

    print(f"Scraped one hotel data with {len(hotel_data['reviews'])} reviews")
    return hotel_data

def save_reviews_to_csv(hotel_data: Dict, filename: str):
    """Write the reviews in ``hotel_data`` to a UTF-8 CSV file.

    Args:
        hotel_data: Mapping whose ``"reviews"`` entry is a list of dicts with
            the keys ``title``, ``text``, ``rate`` and ``tripDate``. A missing
            or empty ``"reviews"`` entry produces a file with only the header.
        filename: Path of the CSV file to create or overwrite.
    """
    # Tolerate the empty dict that a failed scrape returns.
    reviews = hotel_data.get("reviews") or []

    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Text", "Rating", "Trip Date"])  # column headers
        writer.writerows(
            [review["title"], review["text"], review["rate"], review["tripDate"]]
            for review in reviews
        )


hotel_data = await scrape_hotel(
    url="https://www.tripadvisor.com/Hotel_Review-g190327-d264936-Reviews-1926_Hotel_Spa-Sliema_Island_of_Malta.html",
    max_review_pages=10,
)
# Сохранение отзывов в CSV файл
save_reviews_to_csv(hotel_data, 'hotel_reviews.csv')
# Печать результата в формате JSON



Scraped one hotel data with 100 reviews
