<a href="https://colab.research.google.com/github/christian-jaimes/TasteTheWorld/blob/main/Food_atlas_data_scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# References
- TasteAtlas (example country page): https://www.tasteatlas.com/algeria?ref=main-menu
- Wikipedia National Dish: https://en.wikipedia.org/wiki/National_dish
- Wikipedia full dish selection: https://en.wikipedia.org/wiki/Category:Cuisine_by_country
- Nations online: https://www.nationsonline.org/oneworld/countries_of_the_world.htm


In [1]:
# Refresh the apt package index before running any apt install command;
# both stdout and stderr of the command are discarded.
!apt-get update &> /dev/null

In [2]:
# Install chromium browser and web driver.
# `-y` answers apt's confirmation prompt automatically: all output is sent to
# /dev/null, so an interactive prompt would otherwise be invisible.
# `apt-get` is used for both commands because it is the script-stable front end.
!apt-get install -y chromium-browser &> /dev/null
!apt-get install -y chromium-chromedriver &> /dev/null

In [3]:
# Install the Selenium Python package imported in the next cell;
# pip's stdout and stderr are discarded.
!pip install selenium &> /dev/null

In [38]:
import os
import time
import uuid
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [40]:
# Random unique ID generator, shaped so it can be used as a row-wise callback
def generate_uid(row):
    """Return a new random identifier as a 32-character hexadecimal string.

    ``row`` is accepted but never read: the value comes solely from
    ``uuid.uuid4()``, so every call yields a different ID regardless of input.
    """
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex

# "Physically" scroll to a certain element within a website
def scroll_to_element(driver, element):
    """Move the pointer onto ``element`` via an ``ActionChains`` built on ``driver``.

    The chain consists of a single ``move_to_element`` action that is
    performed immediately; nothing is returned.
    """
    ActionChains(driver).move_to_element(element).perform()

In [41]:
# Chromium command-line flags for the Selenium session, followed by the
# creation of the driver that the rest of the notebook uses.
options = webdriver.ChromeOptions()
options.add_argument('--verbose')
options.add_argument('--no-sandbox')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Width and height must be written as "W,H" with no whitespace; with a space
# after the comma the height token becomes " 1200" and the size may be ignored.
options.add_argument('--window-size=1920,1200')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

## Getting all regions and countries

In [42]:
# Open the site's home page in the Selenium-driven browser.
home_link = "https://www.tasteatlas.com/"
driver.get(home_link)
# Fixed 2-second pause after navigation (presumably to let the page finish
# rendering before the next cell looks up the header menu).
time.sleep(2)

In [43]:
# Locating the destinations button to drill down and get the regions.
# Wait up to 10 seconds for the button to be clickable instead of assuming the
# page has rendered; a TimeoutException is raised if it never becomes so.
destinations_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, '/html/body/div[2]/ta-header-desktop/header/div/ta-header-bottom/div/div/div[2]/a')
    )
)
destinations_button.click()

In [44]:
# Getting all regions to loop on.
# The region entries are looked up right after a click, so wait up to
# 10 seconds for at least one to be present. This raises a TimeoutException
# instead of silently yielding an empty list (and, downstream, empty datasets).
region_list = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '/html/body/div[2]/ta-header-desktop/header/div/ta-header-bottom/div/div[2]/div/div/div')
    )
)

In [None]:
# One record (region, country name, country URL) per link found in the menu
countries_list = []

for region in region_list:
    # Read the region label first, then click the entry and pause for 2 seconds
    region_name = region.text
    print(region_name,"="*10,"ok")
    region.click()
    time.sleep(2)

    # Every anchor currently matching the country-link XPath
    country_list = driver.find_elements(
        By.XPATH,
        '//div[@class="submenu-item flex ng-scope"]/a[@class="submenu-item--name ng-binding"]',
    )

    for country in country_list:
        country_url = country.get_attribute('href')
        country_name = country.text
        countries_list.append(
            dict(Region=region_name, Country=country_name, Country_URL=country_url)
        )

# Build the table once from the collected records and tag each row with an ID
country_dataset = pd.DataFrame(countries_list)
country_dataset['UID'] = country_dataset.apply(generate_uid, axis=1)

## Getting all contents for the countries

In [12]:
#### For testing
#country_dataset = country_dataset[:1]
#country_dataset = country_dataset[::-1]
#country_dataset

In [None]:
# Collect data for each country.
# country_contents: one dict per scraped item (type, name, url, country_id).
# country_fail: names of countries whose page could not be fully processed.
country_contents = []
country_fail = []

# XPath selectors used on every country page
CONTAINER_XPATH = '//div[@class="search-results__whattoeat-content ta-parent-section ng-scope"]'
CONTAINER_TITLE_XPATH = './/h2[@class="h2 h2--lowercase h2--muli h2--large search-results__whattoeat-content-title ng-binding ng-scope"]'
VIEW_MORE_XPATH = './/button[contains(@class, "btn--underscore") and contains(text(), "VIEW MORE")]'
ITEM_LINK_XPATH = './/h2[@class="h2 h2--bold h2--lowercase"]/a[@class="ng-binding"]'

# Creating a loop based on the countries dataset scraped above
for index, country in country_dataset.iterrows():
    country_id = country['UID']
    country_name = country['Country']
    country_link = country['Country_URL']
    print("="*100, country_name)

    # Getting the country page
    driver.get(country_link)

    # Getting the containers within the country (Dish, Food items, Drinks) to create types later on
    try:
        containers = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, CONTAINER_XPATH))
        )
    except Exception as e:
        # Skip this country explicitly. Falling through would iterate over the
        # `containers` left from the previous country (or hit a NameError on
        # the very first one) instead of reporting the real cause.
        print(f"Error: {e}")
        print("="*200, f"{country_name} failed")
        country_fail.append(country_name)
        continue

    # Iterating the containers to get all the items inside
    try:
        for container in containers:
            # The heading's first three words are dropped; the rest is the item type
            container_name = container.find_elements(By.XPATH, CONTAINER_TITLE_XPATH)[0].text.strip()
            container_name = ' '.join(container_name.split(' ')[3:])
            print("="*50, container_name)

            # First of all checking if there is a "VIEW MORE" button, then clicking it to get the full list of items
            click_count = 0
            while True:
                try:
                    view_more_button = container.find_element(By.XPATH, VIEW_MORE_XPATH)
                    # Scrolling to the button because "what you don't see doesn't exist"
                    scroll_to_element(driver, view_more_button)
                    click_count += 1
                    print("="*25, f"click button {click_count} times")
                    view_more_button.click()
                    time.sleep(2)
                except Exception:
                    # Best effort: any failure to find, reach or click the
                    # button is treated as "nothing more to expand".
                    break

            contents = container.find_elements(By.XPATH, ITEM_LINK_XPATH)

            # Iterating thru the contents to create a nice list that will be the base to create the country contents dataframe
            for content in contents:
                content_text = content.text
                print(content_text)
                content_url = content.get_attribute('href')

                country_contents.append({'type': container_name,
                                         'name': content_text,
                                         'url': content_url,
                                         "country_id": country_id})

    except Exception as e:
        # Items appended before the failure are kept; the country is recorded
        # as failed and the cause is printed rather than silently dropped.
        print("="*200, f"{country_name} failed: {e}")
        country_fail.append(country_name)

# Creating the dataframe out of the list and adding a unique id for the different dishes
country_contents_dataset = pd.DataFrame(country_contents)
country_contents_dataset['dish_id'] = country_contents_dataset.apply(generate_uid, axis=1)

In [57]:
from google.colab import files

# Write the scraped table to disk as Excel and as CSV (index column omitted),
# passing each file to Colab's `files.download` right after it is written.
filename = 'country_contents_dataset'

xlsx_name = f'{filename}.xlsx'
country_contents_dataset.to_excel(xlsx_name, index=False)
files.download(xlsx_name)

csv_name = f'{filename}.csv'
country_contents_dataset.to_csv(csv_name, index=False)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>