In [1]:
BASE_LINK = "https://www.greggs.co.uk"

#### 1- Chromium-browser installation

In [2]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
apt-get install chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.qJENXvNPp0/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.cVkMbuUhJZ/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.hdoDZP967x/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



#

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import pandas as pd
import re

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup

service = Service(executable_path=r'/usr/bin/chromedriver')
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options, service=service)
driver.get("https://www.greggs.co.uk/menu")


# refresh to remove popup window ---------
time.sleep(5)
driver.refresh()
time.sleep(5)
html = driver.page_source
driver.quit()

#### 2- Get Page Source using Selenium

In [5]:
def _get_page_source(link: str):
  service = Service(executable_path=r'/usr/bin/chromedriver')
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-gpu')
  options.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome(options=options)
  driver.get(link)


  # refresh to remove popup window ---------
  time.sleep(8)
  driver.refresh()
  time.sleep(8)
  html = driver.page_source
  driver.quit()
  return html

#### 3- Use Page Source to extract list of Product Links

In [6]:
def get_href_list(link):

  html = _get_page_source(link)

  soup = BeautifulSoup(html, 'html.parser')
  page_links = soup.find_all('a', href=True)
  link_list = []
  for link in page_links:
    if '/menu/product/' in link['href']:
      link_list.append(f"{BASE_LINK}{link['href']}")
  return link_list

#get_href_list(f"{BASE_LINK}/menu")

#### 4- Extracting product image using specific tag and naming with product title

In [7]:
def get_image(product_type: str, soup: BeautifulSoup):
  # Find the specific meta tag
  try:
    image_url = soup.find('meta', property='og:image')['content']
    print(image_url)
  except TypeError as e:
    print(e)
    html = _get_page_source(link)
    soup = BeautifulSoup(html, 'html.parser')
    image_url = soup.find('meta', property='og:image')['content']

  product_title = soup.find('meta', property='og:title')['content']
  print(product_title)

  # Send a GET request to the image URL
  response = requests.get(image_url)

  # Check if the request was successful (status code 200)
  if response.status_code == 200:

      drive_path = f'/content/drive/MyDrive/greggs_product_images_v3/greggs_{product_type}/'
      filename = f"{product_title}.png"
      if not os.path.exists(drive_path):
        os.makedirs(drive_path)

      full_path = os.path.join(drive_path, filename)
      with open(full_path, "wb") as file:
          file.write(response.content)
      print(f"Image for {product_title} downloaded and saved as {product_title}.png at {full_path}")
  else:
      print("Failed to retrieve the image")

#### 5- Extracting nutrition using class "w-full" to extract nutrition from table rows

In [8]:
def get_nutrition(link: str, product_type: str, soup: BeautifulSoup) -> dict:
  try:
    item_name = soup.find('meta', property='og:title')['content']
    description = soup.find('meta', property='og:description')['content']
  except TypeError as e:
    print(e)
    html = _get_page_source(link)
    soup = BeautifulSoup(html, 'html.parser')
    item_name = soup.find('meta', property='og:title')['content']
    description = soup.find('meta', property='og:description')['content']

  #inspecting the element, we see the table class is w-full
  table = soup.find('table', {'class': 'w-full'}).tbody
  # Find all <tr> elements, skipping those with a "role" attribute equal to "presentation"
  rows = soup.find_all('tr', role=lambda x: x != 'presentation')

  nutrition_dict = {}
  nutrition_dict['Item'] = item_name
  nutrition_dict['Type'] = ' '.join(product_type.split('-')).title()
  nutrition_dict['Description'] = description
  for row in rows:
    # Extract text from each cell in the row
    cells = row.find_all('td')
    cell_texts = [cell.get_text(strip=True) for cell in cells]
    #print(cell_texts)
    nutrition_dict[cell_texts[1]] = cell_texts[2]
  return nutrition_dict

#### 6- Get href tags as per product categories provided by Greggs

In [9]:
product_type_list = ['breakfast', 'savouries-bakes', 'drinks-snacks', 'sandwiches-salads', 'sweet-treats', 'hot-food']
product_type_link_list_dict = {i:[] for i in product_type_list}

for product_type in product_type_list:
  type_link = f"{BASE_LINK}/menu?category={product_type}"
  href_list = get_href_list(type_link)
  product_type_link_list_dict[product_type] = href_list

In [11]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


#### 7- Use above functions to retreive images and send google docs, while retreiving a list of dictionaries to be put into a pandas dataframe

In [12]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

nutrition_dict_list = []
i = 0
for product_type in product_type_link_list_dict.keys():
  for link in product_type_link_list_dict[product_type]:
    print(i)
    i+=1
    print(link)
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')
    #get_image(product_type, soup)
    #print(get_nutrition(link, product_type, soup))
    nutrition_dict_list.append(get_nutrition(link, product_type, soup))

0
https://www.greggs.co.uk/menu/product/bacon-breakfast-roll-1000714
1
https://www.greggs.co.uk/menu/product/sausage-breakfast-roll-1000715
2
https://www.greggs.co.uk/menu/product/omelette-breakfast-roll-1000722
3
https://www.greggs.co.uk/menu/product/vegan-sausage-breakfast-roll-1002132
4
https://www.greggs.co.uk/menu/product/bacon-and-omelette-breakfast-roll-1000720
5
https://www.greggs.co.uk/menu/product/bacon-and-sausage-breakfast-roll-1000716
6
https://www.greggs.co.uk/menu/product/sausage-and-omelette-breakfast-roll-1000721
7
https://www.greggs.co.uk/menu/product/bacon-breakfast-baguette-1000724
8
https://www.greggs.co.uk/menu/product/sausage-breakfast-baguette-1000725
9
https://www.greggs.co.uk/menu/product/omelette-breakfast-baguette-1000728
10
https://www.greggs.co.uk/menu/product/bacon-and-omelette-breakfast-baguette-1000726
11
https://www.greggs.co.uk/menu/product/bacon-and-sausage-breakfast-baguette-1000723
12
https://www.greggs.co.uk/menu/product/sausage-and-omelette-break

In [13]:
df_100g = pd.DataFrame(data=nutrition_dict_list)
df_100g[['Item', 'Type', 'Description', 'Energy kcal', 'Fat', 'of which Saturates', 'Carbohydrate', 'of which Sugars', 'Protein', 'Salt']].head()

Unnamed: 0,Item,Type,Description,Energy kcal,Fat,of which Saturates,Carbohydrate,of which Sugars,Protein,Salt
0,Bacon Breakfast Roll,Breakfast,Oven baked bacon rashers in a corn top breakfa...,274kcal,10g,3.6g,28g,4.4g,16g,1.6g
1,Sausage Breakfast Roll,Breakfast,Oven baked sausages in a corn top breakfast ro...,252kcal,11g,3.7g,25g,3.8g,11g,1.2g
2,Omelette Breakfast Roll,Breakfast,Cheesy omelette in a corn top breakfast roll.\...,203kcal,8.5g,2.6g,20g,4g,11g,0.73g
3,Vegan Sausage Breakfast Roll,Breakfast,Made with vegan Quornâ¢ mycoprotein sausages ...,231kcal,8g,2.7g,25g,3.6g,13g,1.2g
4,Bacon and Omelette Breakfast Roll,Breakfast,Classic bacon and egg combo in a corn top brea...,235kcal,9.8g,3.3g,22g,4g,14g,1.2g


#### 8- Clean data for downstream analysis

In [14]:
df_100g['Description'] = df_100g['Description'].apply(lambda x: x.split('.')[0] if '.' in x else x) #get first sentence
df_100g['Description'] = df_100g['Description'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x)) #remove unwanted characters
df_100g['Energy (kcal)'] = df_100g['Energy kcal'].str.replace('kcal', '')
df_100g['Fat (g)'] = df_100g['Fat'].str.replace('g', '')
df_100g['Saturates (g)'] = df_100g['of which Saturates'].str.replace('g', '')
df_100g['Carbohydrate (g)'] = df_100g['Carbohydrate'].str.replace('g', '')
df_100g['Sugars (g)'] = df_100g['of which Sugars'].str.replace('g', '')
df_100g['Protein (g)'] = df_100g['Protein'].str.replace('g', '')
df_100g['Salt (g)'] = df_100g['Salt'].str.replace('g', '')
df_100g[['Item', 'Type', 'Description', 'Energy (kcal)', 'Fat (g)', 'Saturates (g)', 'Carbohydrate (g)', 'Sugars (g)', 'Protein (g)', 'Salt (g)']].head()

Unnamed: 0,Item,Type,Description,Energy (kcal),Fat (g),Saturates (g),Carbohydrate (g),Sugars (g),Protein (g),Salt (g)
0,Bacon Breakfast Roll,Breakfast,Oven baked bacon rashers in a corn top breakfa...,274,10.0,3.6,28,4.4,16,1.6
1,Sausage Breakfast Roll,Breakfast,Oven baked sausages in a corn top breakfast roll,252,11.0,3.7,25,3.8,11,1.2
2,Omelette Breakfast Roll,Breakfast,Cheesy omelette in a corn top breakfast roll,203,8.5,2.6,20,4.0,11,0.73
3,Vegan Sausage Breakfast Roll,Breakfast,Made with vegan Quorn mycoprotein sausages and...,231,8.0,2.7,25,3.6,13,1.2
4,Bacon and Omelette Breakfast Roll,Breakfast,Classic bacon and egg combo in a corn top brea...,235,9.8,3.3,22,4.0,14,1.2


In [15]:
df_100g[['Item', 'Type', 'Description', 'Energy (kcal)', 'Fat (g)', 'Saturates (g)', 'Carbohydrate (g)', 'Sugars (g)', 'Protein (g)', 'Salt (g)']]

Unnamed: 0,Item,Type,Description,Energy (kcal),Fat (g),Saturates (g),Carbohydrate (g),Sugars (g),Protein (g),Salt (g)
0,Bacon Breakfast Roll,Breakfast,Oven baked bacon rashers in a corn top breakfa...,274,10,3.6,28,4.4,16,1.6
1,Sausage Breakfast Roll,Breakfast,Oven baked sausages in a corn top breakfast roll,252,11,3.7,25,3.8,11,1.2
2,Omelette Breakfast Roll,Breakfast,Cheesy omelette in a corn top breakfast roll,203,8.5,2.6,20,4,11,0.73
3,Vegan Sausage Breakfast Roll,Breakfast,Made with vegan Quorn mycoprotein sausages and...,231,8,2.7,25,3.6,13,1.2
4,Bacon and Omelette Breakfast Roll,Breakfast,Classic bacon and egg combo in a corn top brea...,235,9.8,3.3,22,4,14,1.2
...,...,...,...,...,...,...,...,...,...,...
112,Southern Fried Chicken Goujons,Hot Food,Tender chicken breast goujons coated in lightl...,246,11,4.4,20,0,17,0.97
113,Mozzarella & Cheddar Bites,Hot Food,Say hello to your new favourite hot and cheesy...,338,21,9.8,23,0.6,13,1.3
114,Ham and Mature Cheddar Cheese Toastie,Hot Food,Honey roast ham and mature Cheddar cheese on t...,255,8.9,5.1,28,3.3,15,1.2
115,Hot Ham and Cheese Baguette,Hot Food,Honey roast ham with melted cheddar cheese on ...,265,8.4,4.3,32,1.7,15,1.3


In [16]:
df_100g[['Item', 'Type', 'Description', 'Energy (kcal)', 'Fat (g)', 'Saturates (g)', 'Carbohydrate (g)', 'Sugars (g)', 'Protein (g)', 'Salt (g)']].to_csv('greggs_nutrition_100g.csv', index=False)