In [29]:
import os
import json
import requests
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [30]:
# Configure logging
logging.basicConfig(level=logging.INFO)

def scrape_oyo_links(url, chrome_driver_path="/path/to/chromedriver"):
    """
    Scrapes OYO hotel links from a search-results page and builds review API URLs.

    The page is opened in Chrome through Selenium, the "Results Found" element
    is clicked repeatedly until it can no longer be located, and every hotel
    anchor found in the final HTML is converted into a review-API URL prefix.

    Args:
        url (str): The search-results URL to scrape the OYO links from.
        chrome_driver_path (str): Path to the ChromeDriver executable.
            Defaults to the placeholder path the notebook shipped with.

    Returns:
        tuple: ``(urls, link_codes)`` - two parallel lists. ``urls`` holds the
        review-API URL for each hotel (ending in ``offset=`` so the caller can
        append the paging offset) and ``link_codes`` holds the hotel codes
        taken from the anchors' ``href``. Both lists are empty when no
        property is available or when any error occurs; errors are logged,
        never raised.
    """
    logging.info('Executing scrape_oyo_links function')

    # Configure Selenium options
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')

    # Set Chrome preferences to block images
    chrome_options.add_experimental_option(
        'prefs', {'profile.managed_default_content_settings.images': 2}
    )

    # Do not block driver.get() on the page load; the explicit sleeps/waits
    # below do the synchronisation. Set on the options object because the
    # `desired_capabilities` argument of webdriver.Chrome is no longer
    # accepted by current Selenium 4 releases.
    chrome_options.page_load_strategy = 'none'

    # Define the user agent string
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    base_url = "https://www.oyorooms.com/api/pwa/updateHotelCall?url=https%3A%2F%2Fbff.oyorooms.com%2Fv1%2Fhotels%2Freviews%3Fhotel_id%3D{hotel_id}%26ovh_property%3Dfalse%26limit%3D100&sort_option=&offset="

    # Bound before the try so the finally clause is safe even when the
    # browser itself fails to start.
    driver = None

    try:
        # Create a new Selenium driver
        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)

        # Set user agent for the driver
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # Open the URL in the driver
        driver.get(url)

        # Wait for the page to load
        time.sleep(3)

        if "There is no property available for this search" in driver.page_source:
            logging.warning("No properties available for this search. Skipping...")
            return [], []

        # Keep clicking until the element can no longer be located or clicked.
        # NOTE(review): presumably each click loads a further batch of results
        # - confirm against the live page.
        while True:
            try:
                load_more_span = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[contains(text(), 'Results Found')]")))
                load_more_span.click()
                time.sleep(3)
            except Exception:
                # Any Selenium failure ends the loop; KeyboardInterrupt and
                # SystemExit are deliberately no longer swallowed here.
                break

        # Create a BeautifulSoup object to parse the HTML content
        soup = BeautifulSoup(driver.page_source, "html.parser")

        urls = []
        link_codes = []

        for link in soup.find_all("a", class_="c-nn640c u-width100"):
            href = link.get("href")
            if not href:
                # An anchor without an href carries no hotel code; skip it
                # instead of failing and losing every other link.
                continue
            hotel_code = href.replace("/", "")
            link_codes.append(hotel_code)
            urls.append(base_url.format(hotel_id=hotel_code))

        return urls, link_codes

    except Exception as e:
        logging.error(f"An error occurred in scrape_oyo_links function: {str(e)}")
        return [], []

    finally:
        if driver is not None:
            driver.quit()


In [31]:
# Configure logging
logging.basicConfig(level=logging.INFO)

def fetch_data(url, timeout=30):
    """
    Fetches every page of JSON data from a paginated API URL.

    The current offset (starting at 0) is appended to ``url`` for each
    request, and the ``next_offset`` value found under the response's
    ``data`` key selects the following page.

    Args:
        url (str): The URL prefix to fetch the data from; the paging offset
            is appended to it verbatim, so it should end in ``offset=``.
        timeout (float): Seconds to wait for each HTTP request before giving
            up, so a stalled connection cannot hang the notebook.

    Returns:
        list: The decoded JSON payload of every page fetched successfully,
        in request order. Pagination stops on a non-200 status, on a payload
        without a ``data`` object, when ``next_offset`` is missing or does
        not advance, or on any exception; in each case the pages collected
        so far are returned (an empty list if the first request failed).
        Errors are logged, never raised.
    """
    logging.info('Executing fetch_data function')

    # Define the user agent string
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    # Send a GET request to the URL with the user agent in the header
    headers = {"User-Agent": user_agent}

    data = []  # List to store the fetched JSON data
    base_url = url
    offset = 0

    try:
        # Make API calls to fetch data in batches
        while True:
            response = requests.get(base_url + str(offset), headers=headers, timeout=timeout)

            if response.status_code != 200:
                logging.error(f"Error occurred in API call with offset {offset}. Status code: {response.status_code}")
                break

            json_data = response.json()
            payload = json_data.get('data') if isinstance(json_data, dict) else None
            if not isinstance(payload, dict):
                # Without a 'data' object there is nothing to page through,
                # and storing the page would only break downstream parsing.
                logging.error(f"Unexpected response payload at offset {offset}. Stopping pagination.")
                break

            data.append(json_data)  # Add the fetched JSON data to the list
            next_offset = payload.get('next_offset')

            # Stop at the last page, and also when the API hands back the
            # offset just requested - otherwise the loop would never end.
            if next_offset is None or next_offset == offset:
                break

            offset = next_offset

    except Exception as e:
        # Keep the pages fetched so far, matching the non-200 branch above.
        logging.error(f"An error occurred in fetch_data function: {str(e)}")

    return data

In [32]:
# Configure logging
logging.basicConfig(level=logging.INFO)

def process_data(data, hotel_id, city_name):
    """
    Archives the fetched pages as JSON and flattens their reviews into rows.

    The raw ``data`` is written unchanged to ``json/<city_name>/<hotel_id>.json``
    (the folder is created if needed) before the reviews are extracted.

    Args:
        data (list): The fetched JSON pages; reviews are read from
            ``page['data']['reviews']`` of each page.
        hotel_id (str): The ID of the hotel; used as the JSON file name and
            as the first column of every row.
        city_name (str): The name of the city; used as the output sub-folder.

    Returns:
        list: One ``[hotel_id, user_name, review_text, date, title]`` list per
        review, where ``title`` comes from the review's ``rating`` object.
        Any missing or empty field is replaced by the string ``'none'``.
        Pages without a ``data``/``reviews`` structure and reviews that are
        not JSON objects contribute no rows instead of raising.
    """
    logging.info('Executing process_data function')

    folder_path = os.path.join('json', city_name)
    os.makedirs(folder_path, exist_ok=True)

    json_file_path = os.path.join(folder_path, f'{hotel_id}.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file)

    fetch_review = []
    for json_obj in data:
        payload = json_obj.get('data') if isinstance(json_obj, dict) else None
        reviews = payload.get('reviews') if isinstance(payload, dict) else None

        for review in reviews or []:
            if not isinstance(review, dict):
                continue

            # 'rating' may be absent or null; treat that like a missing title
            # so it gets the same 'none' fallback as the other fields.
            rating = review.get('rating')
            title = rating.get('title') if isinstance(rating, dict) else None

            fetch_review.append([
                hotel_id,
                review.get('user_name') or 'none',
                review.get('review_text') or 'none',
                review.get('date') or 'none',
                title or 'none',
            ])

    return fetch_review


In [33]:
# Configure logging
logging.basicConfig(level=logging.INFO)

def save_hotel_reviews(hotel_id, processed_data, city_name):
    """
    Saves the processed data as a CSV file.

    The file is written to ``<city_name with spaces as underscores>/<hotel_id>.csv``
    relative to the current working directory, without the DataFrame index.
    An existing file of the same name is overwritten.

    Args:
        hotel_id (str): The ID of the hotel; used as the CSV file name.
        processed_data (list): Rows of
            ``[hotel_id, user_name, review_text, date, title]`` to be saved.
            An empty list produces a header-only file.
        city_name (str): The name of the city; determines the output folder.
    """
    logging.info('Executing save_hotel_reviews function')

    df = pd.DataFrame(processed_data, columns=['hotel_id', 'user_name', 'review_text', 'date', 'title'])

    folder_name = city_name.replace(' ', '_')  # Replacing spaces with underscores in the city name
    # exist_ok avoids the check-then-create race and matches process_data.
    os.makedirs(folder_name, exist_ok=True)

    csv_filename = os.path.join(folder_name, f'{hotel_id}.csv')
    df.to_csv(csv_filename, index=False)

In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO)

# Driver cell: for every city listed in city_wise_url.csv (columns 'name' and
# 'url'), scrape the hotel links, download each hotel's reviews and save them.
city_code_file = pd.read_csv('city_wise_url.csv')
total_cities = len(city_code_file)

# enumerate() gives a 1-based progress counter that does not depend on the
# DataFrame's index labels.
for position, (city_name, city_url) in enumerate(
    zip(city_code_file['name'], city_code_file['url']), start=1
):
    logging.info(f"Processing city {position}/{total_cities}")

    # scrape_oyo_links always returns a 2-tuple, which is truthy even when
    # both lists are empty, so the emptiness check must look at the list.
    urls, link_code = scrape_oyo_links(city_url)
    if not urls:
        logging.warning('No data available. Skipping...')
        continue

    for url, hotel_id in zip(urls, link_code):
        logging.info(f"Processing hotel {hotel_id} in {city_name}")
        try:
            data = fetch_data(url)
            if not data:
                # Nothing was fetched; do not write empty files that would
                # be reported as a successful save.
                logging.warning(f"No review data fetched for hotel {hotel_id}. Skipping...")
                continue
            processed_data = process_data(data, hotel_id, city_name)
            save_hotel_reviews(hotel_id, processed_data, city_name)
        except Exception:
            # One bad hotel must not abort a long multi-city run; the
            # traceback is logged so the failure stays visible.
            logging.exception(f"Failed to process hotel {hotel_id} in {city_name}. Skipping...")
            continue
        logging.info("File saved successfully.")

logging.info("All cities and hotels processed.")
