# References
- Food atlas: https://www.tasteatlas.com/algeria?ref=main-menu
- Wikipedia National Dish: https://en.wikipedia.org/wiki/National_dish
- Wikipedia full dish selection: https://en.wikipedia.org/wiki/Category:Cuisine_by_country

In [None]:
# Refresh Ubuntu's package index so the apt installs below run correctly
!apt-get update &> /dev/null

In [None]:
# Install chromium browser and web driver
!apt-get install chromium-browser &> /dev/null
!apt install chromium-chromedriver &> /dev/null

In [None]:
# Install python libraries
!pip install selenium &> /dev/null

In [None]:
import os
import time
import uuid
import pandas as pd
from google.colab import files
from google.colab import drive
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

drive.mount('/content/drive')

In [None]:
# Generate a unique ID for a row (the row's own values are not used)
def generate_uid(row):
    """Return a new random identifier as a 32-character hexadecimal string.

    ``row`` is accepted so the function can be handed to a row-wise
    ``DataFrame.apply``, but it is never read: each call draws a fresh
    ``uuid.uuid4()`` value, so two calls with the same row differ.
    """
    fresh_id = uuid.uuid4()
    return fresh_id.hex

# "Physically" scroll to a certain element within a web page
def scroll_to_element(driver, element):
    """Move the pointer onto *element* through an ``ActionChains`` on *driver*.

    The callers in this notebook use it to bring a button or section into
    view before interacting with it. Returns ``None``.
    """
    ActionChains(driver).move_to_element(element).perform()

In [None]:
# Build the Chromium command-line options, then start the browser session
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--verbose',
    '--no-sandbox',
    '--headless',
    '--disable-gpu',
    # NOTE(review): the size is usually written "1920,1200" without a space —
    # confirm Chrome parses this value as intended
    '--window-size=1920, 1200',
    '--disable-dev-shm-usage',
):
    # Flags are applied one by one, in the order listed above
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

## Getting all regions and countries

In [None]:
# Open the TasteAtlas home page; the next cells read the menu in its header
home_link = "https://www.tasteatlas.com/"
driver.get(home_link)
# Fixed 2-second pause after navigation — presumably to let the page finish rendering
time.sleep(2)

In [None]:
# Find the destinations link in the desktop header and click it, so the
# regions can be read from the menu afterwards
destinations_xpath = '/html/body/div[2]/ta-header-desktop/header/div/ta-header-bottom/div/div/div[2]/a'
destinations_button = driver.find_element(By.XPATH, destinations_xpath)
destinations_button.click()

In [None]:
# Collect every region entry of the header menu; the next cell loops over them
regions_xpath = '/html/body/div[2]/ta-header-desktop/header/div/ta-header-bottom/div/div[2]/div/div/div'
region_list = driver.find_elements(By.XPATH, regions_xpath)

In [None]:
# Build the countries dataset: one row per country with its region, name,
# TasteAtlas URL and a random UID, then save it to Drive as JSON/CSV/XLSX.
countries_list = []

# Country links shown in the menu after a region has been clicked
country_link_xpath = '//div[@class="submenu-item flex ng-scope"]/a[@class="submenu-item--name ng-binding"]'

# Loop through each region to store its countries in the countries list
for region in region_list:
    # Read the region name, then click the region button
    region_name = region.text
    print(region_name,"="*10,"ok")
    region.click()
    # Fixed pause after the click — presumably to let the submenu render
    time.sleep(2)
    # Get the countries in the region
    country_list = driver.find_elements(By.XPATH, country_link_xpath)

    for country in country_list:
        country_url = country.get_attribute('href')
        country_name = country.text

        countries_list.append({'Region': region_name,
                               'Country': country_name,
                               'Country_URL': country_url})

# Adding some countries that the website doesn't list on the main menu
missing_countries = [
    {"Region": "Central America", "Country": "Panama", "Country_URL": "https://www.tasteatlas.com/panama?ref=main-menu"},
    {"Region": "Central America", "Country": "Costa Rica", "Country_URL": "https://www.tasteatlas.com/costa-rica?ref=main-menu"},
    {"Region": "Central America", "Country": "Nicaragua", "Country_URL": "https://www.tasteatlas.com/nicaragua?ref=main-menu"},
    {"Region": "Central America", "Country": "Honduras", "Country_URL": "https://www.tasteatlas.com/honduras?ref=main-menu"},
    {"Region": "Central America", "Country": "El Salvador", "Country_URL": "https://www.tasteatlas.com/el-salvador?ref=main-menu"},
    {"Region": "Central America", "Country": "Guatemala", "Country_URL": "https://www.tasteatlas.com/guatemala?ref=main-menu"},
    {"Region": "Central America", "Country": "Belize", "Country_URL": "https://www.tasteatlas.com/belize?ref=main-menu"},
    {"Region": "South America", "Country": "French Guiana", "Country_URL": "https://www.tasteatlas.com/french-guiana?ref=main-menu"},
    {"Region": "South America", "Country": "Falkland Islands", "Country_URL": "https://www.tasteatlas.com/falkland-islands?ref=main-menu"},
]

full_countries_list = countries_list + missing_countries

country_dataset = pd.DataFrame(full_countries_list)
# generate_uid ignores the row it receives; every row gets a fresh random UID
country_dataset['UID'] = country_dataset.apply(generate_uid, axis=1)

filename = 'countries'
output_root = '/content/drive/My Drive/data scrap/foodpy'

# The writers below do not create missing folders, so create them up front
for subfolder in ('json', 'csv', 'xlsx'):
    os.makedirs(f'{output_root}/{subfolder}', exist_ok=True)

country_dataset.to_json(f'{output_root}/json/{filename}.json', orient='records', lines=True)
country_dataset.to_csv(f'{output_root}/csv/{filename}.csv', index=False)
country_dataset.to_excel(f'{output_root}/xlsx/{filename}.xlsx', index=False)

## Getting all contents for the countries

In [None]:
# Collect data for each country.
#
# For every row of `country_dataset` this cell:
#   1. opens the country page and reads its "what to eat" containers,
#   2. expands each container through its VIEW MORE button and lists its items,
#   3. visits every item page and scrapes its details,
#   4. saves one JSON/CSV/XLSX file per country to Google Drive.
# Countries whose listing could not be read are collected in `country_fail`.
country_fail = []

# The writers at the end of the loop do not create missing folders
output_root = '/content/drive/My Drive/data scrap/foodpy'
for subfolder in ('json', 'csv', 'xlsx'):
    os.makedirs(f'{output_root}/{subfolder}', exist_ok=True)

# The three rating percentages of an item page (like / indifferent / don't like)
rating_likes_xpath = '//div[@class="rating-card__review-likes"]//span[@class="rating-card__review-likes-rating ng-binding"]'

# Creating a loop based on the countries dataset scraped above
for index, country in country_dataset.iterrows():
    country_id = country['UID']
    country_name = country['Country']
    country_link = country['Country_URL']
    print("="*100, country_name)

    # Getting the country page
    driver.get(country_link)

    # Getting the containers within the country (Dish, Food items, Drinks) to create types later on
    try:
        containers = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//div[@class="search-results__whattoeat-content ta-parent-section ng-scope"]'))
        )
    except Exception as e:
        print(f"Error: {e}")
        print("="*200, f"{country_name} failed")
        country_fail.append(country_name)
        # Never fall through with the previous country's (now stale) elements
        containers = []

    container_contents = []

    # Iterating the containers to get all the items inside
    try:
        for container in containers:
            container_name_raw = container.find_elements(By.XPATH, './/h2[@class="h2 h2--lowercase h2--muli h2--large search-results__whattoeat-content-title ng-binding ng-scope"]')[0].text.strip()
            # The last word of the title is used as the item type
            container_name = container_name_raw.split()[-1]
            print("="*50, container_name)

            # Keep clicking the "VIEW MORE" button while it exists, to get the full list of items
            click_count = 0
            while True:
                try:
                    view_more_button = container.find_element(By.XPATH, './/button[contains(@class, "btn--underscore") and contains(text(), "VIEW MORE")]')
                    # Scrolling to the button because "what you don't see doesn't exist"
                    scroll_to_element(driver, view_more_button)
                    click_count += 1
                    print("="*25, f"click button {click_count} times")
                    view_more_button.click()
                    time.sleep(2)
                except Exception:
                    # No button left (or it cannot be clicked): the list is as long as it gets
                    break

            contents = container.find_elements(By.XPATH, './/h2[@class="h2 h2--bold h2--lowercase"]/a[@class="ng-binding"]')

            # Iterating thru the contents to create the list that feeds the country contents dataframe
            for content in contents:
                content_text = content.text
                print(content_text)
                content_url = content.get_attribute('href')

                container_contents.append({'type': container_name,
                                           'name': content_text,
                                           'url': content_url})
    except Exception as e:
        # Items gathered before the failure are still processed below
        print(f"Error: {e}")
        print("="*200, f"{country_name} failed")
        country_fail.append(country_name)

    content_details = []

    for country_content_page in container_contents:
        content_name = country_content_page['name']
        content_url = country_content_page['url']
        print(content_name, content_url)

        driver.get(content_url)
        time.sleep(1)

        # Hero image: prefer the active slide, fall back to any slide, else None
        try:
            image = driver.find_element(By.XPATH, '//div[@class="card__hero card__hero--food ng-scope"]//div[@class="swiper-slide ng-scope swiper-slide-active"]//img[@class="img"]').get_attribute('src')
        except NoSuchElementException:
            try:
                image = driver.find_element(By.XPATH, '//div[@class="card__hero card__hero--food ng-scope"]//div[@class="swiper-slide ng-scope"]//img[@class="img"]').get_attribute('src')
            except Exception:
                image = None

        name = content_name

        try:
            other_name = driver.find_element(By.XPATH, '//h4[@class="h4 ng-binding ng-scope"]').text
        except Exception:
            other_name = None

        categories_elements = driver.find_elements(By.XPATH, '//div[@class="group"]//span/a')
        categories = [category.text for category in categories_elements]

        # A page without a flag must not abort the whole run
        try:
            country_flag = driver.find_element(By.XPATH, '//div[@class="right ng-scope"]//div[@class="emblem"]//img').get_attribute('src')
        except NoSuchElementException:
            country_flag = None

        # Expand the description when a "read more" toggle is present
        try:
            readmore_btn = driver.find_element(By.XPATH, '//span[@class="read-more"]')
            scroll_to_element(driver, readmore_btn)
            readmore_btn.click()
            time.sleep(2)
        except Exception:
            # Best effort: without the toggle the text is read as displayed
            pass

        # A page without a description must not abort the whole run either
        try:
            description = driver.find_element(By.XPATH, '//div[@class="text ng-scope ng-isolate-scope"]').text
        except NoSuchElementException:
            description = None

        # Pair each ingredient with the image inside its own <li>, so an
        # ingredient without an image cannot shift the pairing of the others
        ingredients_elements = driver.find_elements(By.XPATH, '//div[@class="food-ingredients food-ingredients--scroll"]//li')
        combined_ingredients = []
        for ingredient_element in ingredients_elements:
            ingredient_imgs = ingredient_element.find_elements(By.XPATH, './/img[@class="food-ingredients__image"]')
            combined_ingredients.append({
                'name': ingredient_element.text,
                'image_url': ingredient_imgs[0].get_attribute('src') if ingredient_imgs else None,
            })

        try:
            rating_container = driver.find_element(By.XPATH, '//div[@class="rating-section__container"]')
            scroll_to_element(driver, rating_container)

            # These XPaths start with "//", so they search the whole document;
            # they are issued on the driver to make that explicit
            stars = driver.find_element(By.XPATH, '//div[@class="rating-card__review-main"]').text
            if stars.strip().upper() == 'N/A':
                stars = None
            rating_values = driver.find_elements(By.XPATH, rating_likes_xpath)
            # Fewer than three values raises here and resets all four fields below
            likes, indiferent, dont_like = (
                rating_value.text.strip('%') for rating_value in rating_values[:3]
            )
        except Exception:
            stars = None
            likes = None
            indiferent = None
            dont_like = None

        content_details.append({'name': name,
                                'other_name': other_name,
                                'image': image,
                                'categories': categories,
                                'country_flag': country_flag,
                                'country_id': country_id,
                                'description': description,
                                'ingredients': combined_ingredients,
                                "rating_stars": stars,
                                "rating_likes": likes,
                                "rating_indiferent": indiferent,
                                "rating_dont_like": dont_like,
                                "content_url": content_url
                                })

    # Creating the dataframe out of the list
    content_details_dataset = pd.DataFrame(content_details)

    filename = f'{country_name}_details'

    content_details_dataset.to_json(f'{output_root}/json/{filename}.json', orient='records', lines=True)
    content_details_dataset.to_csv(f'{output_root}/csv/{filename}.csv', index=False)
    content_details_dataset.to_excel(f'{output_root}/xlsx/{filename}.xlsx', index=False)
    print("="*150)
    print("="*150,"FILES SAVED TO G-DRIVE")
    print("="*150)