# **Zomato - Restaurant Performance Analysis**

- **Objective:** Compare restaurant prices, ratings, and average costs.
- **Steps:**
    1. Scrape restaurant names, cuisine types, ratings, average costs for two people, and locations.
    2. Clean the data by handling missing values and standardizing cost categories.
    3. Store the data in an SQL database.
    4. Use Power BI to visualize restaurant performance by cuisine and cost-effectiveness.
    5. Conclude on the best-value restaurants for specific cuisines.

# Data scraping

In [6]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import tempfile
import os
import shutil

In [None]:
# Cities whose Zomato listing pages are visited, in this order.
locations_to_scrape = ["mathura", "agra", "bangalore", "mumbai", "delhi"]

# Pacing: upper bound on scroll attempts per page, seconds slept after each
# scroll, and seconds slept between two consecutive cities.
max_scrolls, scroll_pauses, delay_between_locations = 15, 2, 10

# CSV file that receives the combined rows of every city.
output_file = "zomato_restaurants.csv"

# CSS selectors used to locate a restaurant card and the fields inside it.
# NOTE(review): these class names look build-generated - confirm they still
# match the live page before running.
restaurant_card_selector = 'div.jumbo-tracker'
name_selector = 'h4.sc-1hp8d8a-0'
rating_selector = 'div.sc-1q7bklc-1'
shared_info_selector = 'p.sc-1hez2tp-0.sc-gggouf'

# Accumulates one dict per scraped restaurant across all cities.
all_restaurants_data = []

def _build_chrome_options():
  """Return the headless Chrome options used for one location's browser session."""
  user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
  options = ChromeOptions()
  for argument in (
    "--headless",
    "--incognito",
    f"user-agent={user_agent}",
    "--disable-gpu",
    "--disable-extensions",
    "--no-sandbox",
    "--disable-infobars",
    "--disable-dev-shm-usage",
  ):
    options.add_argument(argument)
  return options


def _scroll_results_page(driver, loc):
  """Scroll to the page bottom repeatedly so that more cards get loaded.

  Stops after `max_scrolls` successful scrolls, or as soon as a scroll grows
  the document height by 10px or less.
  """
  last_height = driver.execute_script("return document.body.scrollHeight")
  scroll_count = 0
  print(f"Starting scroll process (max_scrolls = {max_scrolls})")
  while scroll_count < max_scrolls:
    print(f"Scrolling down {loc} ({scroll_count + 1}/{max_scrolls})")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pauses)
    new_height = driver.execute_script("return document.body.scrollHeight")

    if new_height <= last_height + 10:
      print("Reached bottom or no significant new content loaded during scroll.")
      break
    last_height = new_height
    scroll_count += 1
  print("Finished scrolling phase.")


def _read_card_url(element):
  """Return the href of the card's first <a>, or 'N/A' when there is no usable link."""
  try:
    href = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
  except NoSuchElementException:
    return 'N/A'
  # An anchor without an href yields None; treat it like a missing link so it
  # is never used as a de-duplication key.
  return href or 'N/A'


def _extract_restaurant(element, element_index, loc, res_url):
  """Build the record for one restaurant card.

  Returns a dict with the keys Target_Location, Name, Cuisine, Rating,
  Cost_for_Two and URL. Any field that cannot be read keeps the 'N/A'
  placeholder; extraction errors are printed and do not propagate.
  """
  data = {
    'Target_Location': loc.capitalize(),
    'Name': 'N/A',
    'Cuisine': 'N/A',
    'Rating': 'N/A',
    'Cost_for_Two': 'N/A',
    'URL': res_url
  }

  try:
    name_text = element.find_element(By.CSS_SELECTOR, name_selector).text.strip()
    # An empty heading is as useless as a missing one; keep the placeholder.
    if name_text:
      data['Name'] = name_text
  except NoSuchElementException:
    pass
  except Exception as e_name:
    print(f"ERROR extracting name for element {element_index+1}: {e_name}")
  res_name = data['Name']

  # Cuisine and cost share one selector, so they are told apart by content:
  # a rupee sign plus "for two"/"per person" marks the cost, while a
  # comma-separated text without a rupee sign is taken as the cuisine list.
  found_cuisine = False
  found_cost = False
  try:
    for p_element in element.find_elements(By.CSS_SELECTOR, shared_info_selector):
      if found_cost and found_cuisine:
        break  # nothing left to find; skip further WebDriver calls
      try:
        p_text = p_element.text.strip()
      except Exception:
        continue
      if not p_text:
        continue

      lowered = p_text.lower()
      if not found_cost and '₹' in p_text and \
        ('for two' in lowered or 'per person' in lowered):
        data['Cost_for_Two'] = p_text
        found_cost = True
        continue

      if not found_cuisine and '₹' not in p_text and ',' in p_text and len(p_text) > 3:
        data['Cuisine'] = p_text
        found_cuisine = True
  except Exception as e_find:
    print(f"ERROR processing shared paragraphs for {res_name}: {e_find}")

  try:
    rating_text = element.find_element(By.CSS_SELECTOR, rating_selector).text.strip()
    # Accept numeric ratings and the two placeholders the page may show.
    if rating_text and (rating_text[0].isdigit() or rating_text in ["NEW", "-"]):
      data['Rating'] = rating_text
  except NoSuchElementException:
    pass
  except Exception as e_rating:
    print(f"ERROR extracting rating for {res_name}: {e_rating}")

  return data


# Scrape every configured city with a fresh browser session per city and
# append the resulting rows to `all_restaurants_data`.
for loc_index, loc in enumerate(locations_to_scrape):
  print(f"\n{'='*10} Processing Location: {loc.upper()} {'='*10}")
  zomato_url = f"https://www.zomato.com/{loc}/restaurants"

  options = _build_chrome_options()
  driver = None
  restaurants_data_for_loc = []
  try:
    driver = webdriver.Chrome(options=options)
    driver.get(zomato_url)
    print(f"Opened: {zomato_url}")

    try:
      # Ready when the title mentions the city, or failing that when <body> exists.
      WebDriverWait(driver, 20).until(
        lambda d: (d.title and loc.capitalize() in d.title) or d.find_element(By.TAG_NAME, "body")
      )
      print("Page basic structure loaded.")
    except TimeoutException:
      print(f"Warning: Page title/body for {loc} did not load as expected. Continuing anyway.")

    _scroll_results_page(driver, loc)

    print(f"Extracting data for {loc}...")
    scraped_urls_for_loc = set()

    try:
      WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, restaurant_card_selector))
      )
      restaurant_elements = driver.find_elements(By.CSS_SELECTOR, restaurant_card_selector)
      print(f"Found {len(restaurant_elements)} potential restaurant elements for {loc}.")

      if not restaurant_elements:
        print(f"WARNING: No restaurant elements found matching selector '{restaurant_card_selector}'. Check selector or page content.")

      for element_index, element in enumerate(restaurant_elements):
        res_url = _read_card_url(element)
        # De-duplicate on real URLs only; cards without a link are all kept.
        if res_url != 'N/A':
          if res_url in scraped_urls_for_loc:
            continue
          scraped_urls_for_loc.add(res_url)

        data = _extract_restaurant(element, element_index, loc, res_url)
        # A card without a name is not a usable restaurant row.
        if data['Name'] != 'N/A':
          restaurants_data_for_loc.append(data)

      print(f"Finished extraction for {loc}. Scraped {len(restaurants_data_for_loc)} restaurants.")

    except TimeoutException:
      print(f"Timed out waiting for restaurant cards (selector: '{restaurant_card_selector}') to load for {loc}. Check selector and page load.")
    except Exception as e:
      print(f"An error occurred during scraping {loc}: {e}")

  except Exception as e:
    print(f"CRITICAL ERROR for {loc}: Error initializing WebDriver or navigating: {e}")
    print("Skipping this location.")

  finally:
    # Keep whatever was collected, even if extraction was interrupted midway.
    all_restaurants_data.extend(restaurants_data_for_loc)

    if driver:
      print(f"Closing WebDriver for {loc}...")
      driver.quit()

    # Pause between cities; position-based so a repeated city name cannot
    # be mistaken for the last entry.
    if loc_index < len(locations_to_scrape) - 1:
      print(f"\nWaiting {delay_between_locations} seconds before next location...")
      time.sleep(delay_between_locations)

# Write the rows gathered from all locations into one CSV file.
if not all_restaurants_data:
  print("\nNo restaurant data was scraped from any location.")
else:
  banner = '=' * 10
  print(f"\n{banner} Scraping Complete {banner}")
  print(f"Total scraped restaurants across all locations: {len(all_restaurants_data)}")

  # Fix the column order of the exported table.
  column_order = ['Target_Location', 'Name', 'Cuisine', 'Rating', 'Cost_for_Two', 'URL']
  df = pd.DataFrame(all_restaurants_data)
  df = df[column_order]
  try:
    df.to_csv(output_file, index=False, encoding='utf-8')
  except Exception as e:
    print(f"Error saving combined data to CSV: {e}")
  else:
    print(f"\nCombined data saved to {output_file}")

# Data cleaning and handling missing values

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# Load the scraper's CSV output and preview the first rows.
raw_data_path = "zomato_restaurants.csv"
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,Target_Location,Name,Cuisine,Rating,Cost_for_Two,URL
0,Mathura,Pizza Hut,"Pizza, Fast Food, Desserts, Beverages",-,₹600 for two,https://www.zomato.com/mathura/pizza-hut-9-mat...
1,Mathura,The Foodies Bar,"Fast Food, Momos, Burger, Desserts, Coffee, Sh...",4.4,"₹1,000 for two",https://www.zomato.com/mathura/the-foodies-bar...
2,Mathura,The Belgian Waffle Co.,"Waffle, Pancake, Ice Cream, Desserts, Beverages",-,₹200 for two,https://www.zomato.com/mathura/the-belgian-waf...
3,Mathura,Brijwasi Centrum,"Chinese, Continental, Desserts, Drinks Only, F...",4.2,"₹1,000 for two",https://www.zomato.com/mathura/brijwasi-centru...
4,Mathura,Centre Point,"South Indian, North Indian, Chinese",4.9,₹250 for two,https://www.zomato.com/mathura/centre-point-ma...


In [21]:
# Fill missing cuisines with the most frequent cuisine string.
# mode() is empty when the column holds no values at all, so guard the lookup.
cuisine_modes = df['Cuisine'].mode()
if not cuisine_modes.empty:
    df['Cuisine'] = df['Cuisine'].fillna(cuisine_modes[0])

# Ratings are read as text: besides numbers, the scraper keeps the
# placeholders "-" and "NEW". Coerce every non-numeric entry to NaN instead of
# replacing only "-", which made the float conversion fail on "NEW".
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').astype(float)
# Fill missing ratings with the mean of the available ratings.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

In [24]:
# Turn the cost text (e.g. "₹1,000 for two") into a number.
# Strip thousands separators first so "1,000" is read as 1000, not 1.
cost_text = df['Cost_for_Two'].str.replace(',', '', regex=False)
# expand=False returns a Series (one capture group) rather than a DataFrame.
cost_value = cost_text.str.extract(r'(\d+)', expand=False).astype(float)
# The scraper also stores "... per person" prices in this column; double those
# so that every value is expressed as a cost for two people.
is_per_person = cost_text.str.contains('per person', case=False, na=False)
df['Cost_for_Two'] = cost_value.where(~is_per_person, cost_value * 2)

In [27]:
# Fill restaurants that have no cost value with the column mean.
average_cost = df['Cost_for_Two'].mean()
df['Cost_for_Two'] = df['Cost_for_Two'].fillna(average_cost)

In [29]:
# The restaurant link is not needed for the analysis; drop that column.
df = df.drop(columns=['URL'])

In [32]:
# Keep ratings at one decimal place (mean-filled values carry more digits).
df['Rating'] = df['Rating'].round(decimals=1)

In [35]:
# Write the cleaned table, without the index column, to its own CSV file.
cleaned_output_path = "zomato_restaurants_cleaned.csv"
df.to_csv(cleaned_output_path, index=False)