#### Felix Adam - Computing Lab Assignment 2 -  Webscraping Beer Prices

The goal of this notebook is to scrape current beer prices from a big german food delivery site. 
Secondary goal: Find beers that are on sale!

1. Make use of beautifoul soup 
2. Implement the whole process with ques!



In [1]:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import re
import queue
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor,as_completed
import time


# Selenium Options, we want to use a "headless" driver, so no windows pop up
#browser_options = Options()
#browser_options.add_argument("--headless")

# Setting Webdriver to Firefox
#driver = webdriver.Firefox(options=browser_options)


In [5]:
class BeerScraper:
    
    # Initializing with a starting URL,the queue and an empty list for results
    def __init__(self,start_url, friendly_scraping_time = 5):
        
        self.start_url = start_url
        self.beer_queue = queue.Queue()
        self.results =  []
        self.friendly_scraping_time = friendly_scraping_time
        
    ## Get Beer List
    def get_beer_list(self):
        # Setting up the starting link
        current_link = self.start_url
    
        # Initialising the while loop
        no_more_pages = False

        while no_more_pages == False:
    
            print(current_link + ' Sleeping for ' + str(self.friendly_scraping_time) + ' Seconds')
    
            # Gently scraping 
            time.sleep(self.friendly_scraping_time)
    
            print('Sleep done')
    
            # Making request
            beer_list_request = requests.get(current_link)
    
            beer_list_page = BeautifulSoup(beer_list_request.content, 'html.parser')
    
            # Get all beers on the page
            beers = beer_list_page.find_all('div', class_= 'search-service-ProductTileContent')

            # Appending all beers to the list 
            for beer in beers:
                # Find URL of current beer
                beer_url = 'https://shop.rewe.de'+beer.find('a')['href']
        
                # Put the beer in the queue
                self.beer_queue.put(beer_url)
    
            print('Total Nr. of Beers found: '+ str(self.beer_queue.qsize()))
    
            # Find next link
            next_link = beer_list_page.find_all('a', href = True, text ='>')

            if len(next_link) == 0:
                no_more_pages = True 
            else:
                current_link = next_link[0]['href']
    
        print('Done')
        
    ### Method get_beer, scrapes single beer page ###
    def get_beer(self,beer_url):
        
        # Sleep
        sleep(self.friendly_scraping_time)
        
        # Getting Request
        beer_page_request = requests.get(beer_url)
        
        # Parsing with beautiful soup
        beer_parsed = BeautifulSoup(beer_page_request.content, 'html.parser')

        # Title
        beer_title = beer_parsed.find('h1', class_= 'pd-QuickInfo__heading' ).text

        # Price, integer part
        integer_price = beer_parsed.find('span', class_='pd-price__predecimal').text

        # Price, decimal part, removing leading whitespace as well
        decimal_price = beer_parsed.find('span', class_='pd-price__decimal').text.lstrip()

        # Full Price, concat and convert to float
        beer_price = float(integer_price+'.'+decimal_price)

        #Get the info string
        info_string = beer_parsed.find('div', class_='rs-qa-price-base pd-Grammage pd-Grammage--Detail').text
        
        if '=' not in info_string:
            price_per_liter = 0
        else:
        
            # Extract info inside of the brackets
            price_per_liter_string = info_string[info_string.find("(")+1:info_string.find(")")]

            # Extract string left to the equal sign (e.g. 1 l = 2,17 € or 1 l = 2 €)
            temp_price_liter = price_per_liter_string.split('=')[1]
    
            # Error handling for missing commas
            if "," not in temp_price_liter:
                price_per_liter = int(re.search(r'\d+', temp_price_liter).group())

            else:
                # Get the integer 
                temp_price_liter_int  = temp_price_liter.split(',')[0]
    
                # Get the decimal
                temp_price_liter_dec = temp_price_liter.split(',')[1].split(' ')[0]
            
                # Convert to price per liter
                price_per_liter = float(temp_price_liter_int+'.'+temp_price_liter_dec)

        # Alcohol Content in percent adapt later!
        #alcohol_content = beer_parsed.find_all('div', class_='pd-Attribute')[2].text.split(' ')[1]

        # Collect all the beer info in a list
        beer = [beer_title, beer_price, price_per_liter] 
    
        #Return the beer
        return beer
    
    #### Private Method for multi threading ###
    def beer_worker(self,i):
        while True:
            # Get a beer link from the queue
            beer_link = self.beer_queue.get()
            if beer_link is None:
                break
            # Get the beer from the link
            beer = self.get_beer(beer_link)
            
            self.results.append(beer)
            
            # Append it to results
            #results.append(beer)
            self.beer_queue.task_done()
            
    
    ### Public method for getting all beers ###
    def scrape_all_beers(self):
        
        beer_list = list(self.beer_queue.queue)
        
        #for beer in beer_list:
            #print(beer)
            #current_beer = self.get_beer(beer)
            
            #self.results.append(current_beer)
            
        #return self.results
        
        with ProcessPoolExecutor() as pool:
            all_data = pool.map(parse_page, beer_list)
        all_data = list(all_data)
        
        return all_data
        

In [6]:
scraper = BeerScraper(start_url='https://shop.rewe.de/c/getraenke/?search=Bier', friendly_scraping_time= 0 )

In [8]:
scraper.get_beer_list()

https://shop.rewe.de/c/getraenke/?search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 40
https://shop.rewe.de/c/getraenke/?page=2&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 80
https://shop.rewe.de/c/getraenke/?page=3&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 120
https://shop.rewe.de/c/getraenke/?page=4&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 160
https://shop.rewe.de/c/getraenke/?page=5&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 200
https://shop.rewe.de/c/getraenke/?page=6&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 240
https://shop.rewe.de/c/getraenke/?page=7&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 280
https://shop.rewe.de/c/getraenke/?page=8&search=Bier Sleeping for 0 Seconds
Sleep done
Total Nr. of Beers found: 320
https://shop.rewe.de/c/getraenke/?page=9&search=Bier Sleeping for 0 Secon

In [9]:
%%time
## Wall Time: 3min 21s

beer_list = list(scraper.beer_queue.queue)
        
        #for beer in beer_list:
            #print(beer)
            #current_beer = self.get_beer(beer)
            
            #self.results.append(current_beer)
            
        #return self.results
# Using 100 workers here, 200 would result in connection errors!
        
with ThreadPoolExecutor(max_workers= 50) as pool:
    all_data = pool.map(scraper.get_beer, beer_list)
    all_data = list(all_data)
all_data

CPU times: user 27 µs, sys: 0 ns, total: 27 µs
Wall time: 32.2 µs


In [70]:
%%time

e = []

for beer in beer_list:
    test = scraper.get_beer(beer)
    e.append(test)


CPU times: user 3min 8s, sys: 432 ms, total: 3min 8s
Wall time: 6min 12s


In [11]:
import multiprocessing
import itertools


In [12]:
%%time
with multiprocessing.Pool() as p:
    beer = p.map(scraper.get_beer, beer_list)
all_beers = set(it.chain.from_iterable(beer))

TypeError: can't pickle _thread.lock objects