# Web Scraping

Imagine that we are a group of developers who want to get fresh goods price data from a supermarket website daily.

Carrefour link:
https://online.carrefour.com.tw/en/fresh--goods?start=0#

In [1]:
# import section
import requests
from bs4 import BeautifulSoup
import time

In [3]:
# Pretend to be a regular web browser: this User-Agent header is sent with the GET request
headers = {
    "User-Agent": (
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; "
        "SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; "
        "Media Center PC 6.0; .NET4.0C; InfoPath.3)"
    ),
}

In [5]:
# Use the requests module to perform a GET request.
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server never answers, which would hang the notebook.
r = requests.get('https://online.carrefour.com.tw/en/fresh--goods?start=0#', headers=headers, timeout=30)

In [11]:
# HTTP status code of the response (200 means the request succeeded)
r.status_code

200

In [15]:
# Preview only the first 1000 characters of the raw HTML; displaying the
# whole page source can flood the notebook output
r.text[:1000]

In [17]:
# Parse the downloaded HTML text into a BeautifulSoup object with the 'html.parser' parser
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [21]:
# Collect every <div> whose class attribute is "hot-recommend-item line" into a list
lst_goods = soup.find_all(name="div", attrs={"class": "hot-recommend-item line"})

In [25]:
# Number of matching <div> elements found on this page
len(lst_goods)

24

In [37]:
# Results container: product id -> [name, price]
dct_goods = {}

# Walk through every matched element, read the product id, the name and the
# price from it, and store [name, price] in the dict under the product id
for ele in lst_goods:
    # The product id is the "data-pid" attribute of the product link
    link_tag = ele.find("a", {"class": "gtm-product-alink"})
    id_goods = link_tag["data-pid"]

    # The name is the text of the <a> inside the description <div>
    desc_tag = ele.find("div", {"class": "commodity-desc"})
    name_goods = desc_tag.find("a").text

    # The price is the text of the <em> inside the current-price <div>
    price_box = ele.find("div", {"class": "current-price"})
    price_goods = price_box.find("em").text

    dct_goods[id_goods] = [name_goods, price_goods]

In [39]:
# Number of entries (distinct product ids) stored in the dict
len(dct_goods)

24

In [41]:
# Show the dict (each key is a product id, each value is [name, price])
dct_goods

{'2202131700401': ['CFBIO Cucumber', '$48'],
 '1526297200101': ['Duck Meatball', '$99'],
 '1502000100124': ['Kuang Chuan Low Fat Milk', '$480'],
 '2204028100501': ['CFBIO fungus', '$39'],
 '2201005400401': ['CQL Carrot 600g/Bag', '$45'],
 '1502005000106': ['Slim TP', '$75'],
 '2200007301301': ['CFBIO baby pak choy', '$39'],
 '2202102100401': ['CQL Papaya Pumpkin', '$79'],
 '1508090200103': ['Dahan non-base to cool tofu', '$45'],
 '1508000200101': ['Han Homemade Tofu Hot Pot(non-GM)', '$16'],
 '1502004700124': ['Slim TP', '$432'],
 '1502000100106': ['Kuang Chuan Low Fat Milk', '$120'],
 '2201002600401': ['Local Radish 600g', '$39'],
 '2202109000101': ['CFPLB Baby Corn', '$33'],
 '1502004700106': ['Slim TP', '$120'],
 '2204006800501': ['CFPLB Mushroom 150g', '$39'],
 '2204027900101': ['CFBIO King Oyster', '$45'],
 '2201041200101': ['Imported Onion 1kg/bag', '$55'],
 '1508002800101': ['Chinese Super Tofu(non-GM)', '$17'],
 '1502300400106': ['BLACK SOYA DRINK', '$71'],
 '2201003200101': ['

## Just in case you want to get all the data from all the pages

In [59]:
# Browser-like User-Agent header to send with every GET request
headers = {
    "User-Agent": (
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; "
        "SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; "
        "Media Center PC 6.0; .NET4.0C; InfoPath.3)"
    ),
}

# Work out how many pages there are; the page count determines which
# offsets get substituted into the url
num_goods_per_page = 24
num_goods_total = 5136
# ceiling division: add (divisor - 1) before floor-dividing
num_pages = (num_goods_total + num_goods_per_page - 1) // num_goods_per_page

In [69]:
# For all the pages, get the id, name and price of every item

dct_goods = {} # empty dict to save data: product id -> [name, price]
num_skipped = 0 # items that lacked an expected tag/attribute and were skipped

# Request and store all the data of all pages.
# j is the "start" offset of each page: 0, 24, 48, ... (num_pages pages in total)
for j in range(0, num_pages*num_goods_per_page, num_goods_per_page):
    # Get request, use % to substitute the offset into the url.
    # The timeout keeps one unresponsive request from hanging the whole run.
    print("curently at item number", j)
    r = requests.get('https://online.carrefour.com.tw/en/fresh--goods?start=%s#' %(j), headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # and ending up with missing items
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Find all div tags with class = "hot-recommend-item line", and save the list
    lst_goods = soup.find_all("div", {"class": "hot-recommend-item line"})

    # For each element in the list, get the id, name & price of the goods,
    # then store [name, price] in the dict under the product id
    for ele in lst_goods:
        link_tag = ele.find("a", {"class":"gtm-product-alink"})
        desc_tag = ele.find("div", {"class": "commodity-desc"})
        price_box = ele.find("div", {"class": "current-price"})

        # find() returns None when a tag is absent; look one level deeper only
        # when the parent exists so a single odd item cannot abort the run
        id_goods = link_tag.get("data-pid") if link_tag is not None else None
        name_tag = desc_tag.find("a") if desc_tag is not None else None
        price_tag = price_box.find("em") if price_box is not None else None

        if id_goods is None or name_tag is None or price_tag is None:
            num_skipped += 1
            continue

        name_goods = name_tag.text
        price_goods = price_tag.text
        dct_goods[id_goods] = [name_goods, price_goods]

    # !!! Very important!!! Pause 5 seconds between requests so that this does
    # not turn into a Denial-of-Service (DoS) attack
    time.sleep(5)

# Report skipped items explicitly so missing data is never silent
if num_skipped:
    print("skipped", num_skipped, "items with a missing id, name or price")

# Display the dict
dct_goods

curently at item number 0
curently at item number 24
curently at item number 48
curently at item number 72
curently at item number 96


KeyboardInterrupt: 

In [None]:
# Print the whole dict as plain text (product id -> [name, price])
print(dct_goods)

In [None]:
# Print the length of the dict
len(dct_goods) # The length of the dict (number of goods) shows 1705 rather than 1933;
               # my guess is that there are items with the same name.
               # NOTE(review): the loop that fills dct_goods keys it by product id,
               # not by name, so this guess may be out of date -- re-check the counts.
               # We can find a better way to store our data next week.
