#### Felix Adam - Computing Lab Assignment 2 -  Webscraping Beer Prices

The goal of this notebook is to scrape current beer prices from the online shop of a large German food delivery service (shop.rewe.de).



In [67]:
# Importing Libraries
import csv
import queue
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed, wait, TimeoutError
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


Making a class for scraping beer prices.

In [47]:
class BeerScraper:
    """Scraper for beer product pages of the REWE online shop.

    Typical use: call ``get_beer_list()`` once to walk the paginated search
    results and fill ``self.beer_list`` with product URLs, then call
    ``get_beer(url)`` for each URL to obtain ``[title, price, price_per_liter]``.
    """

    # Origin against which relative product links on the result pages are resolved
    BASE_URL = 'https://shop.rewe.de'

    def __init__(self, start_url, user_agent, friendly_scraping_time = 10, timeout = 30):
        """Store the scraping configuration and start with an empty URL list.

        Parameters
        ----------
        start_url : str
            First search result page to scrape.
        user_agent : str
            Value sent in the ``User-Agent`` header of every request.
        friendly_scraping_time : int or float, default 10
            Seconds to sleep before each result-page request.
        timeout : int or float, default 30
            Seconds after which a single HTTP request is aborted instead of
            blocking the calling thread forever.
        """
        self.start_url = start_url
        self.friendly_scraping_time = friendly_scraping_time
        self.beer_list = []
        self.user_agent = user_agent
        self.timeout = timeout

    def _fetch(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup document.

        Raises ``requests.HTTPError`` for 4xx/5xx answers, so that a block or
        error page is reported as such instead of being parsed like a product page.
        """
        response = requests.get(url,
                                headers = {'User-Agent': self.user_agent},
                                timeout = self.timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    @staticmethod
    def _required_text(page, tag, css_class, url):
        """Return the text of the first ``tag`` with ``css_class`` or raise ValueError."""
        element = page.find(tag, class_ = css_class)
        if element is None:
            raise ValueError('No <' + tag + ' class="' + css_class + '"> found on ' + str(url))
        return element.text

    @staticmethod
    def _parse_price_per_liter(info_string):
        """Extract the unit price from a string such as ``'0,5 l (1 l = 2,17 €)'``.

        The number to the right of the equal sign is returned as a float
        (``'1 l = 2 €'`` gives ``2.0``). When the string holds brackets, only the
        text inside the first pair is searched. ``0.0`` is returned when no
        price can be found.
        """
        bracketed = re.search(r'\(([^)]*)\)', info_string)
        scope = bracketed.group(1) if bracketed else info_string

        # Integer part, optionally followed by a German decimal comma and the decimals
        match = re.search(r'=\s*(\d+)(?:,(\d+))?', scope)
        if match is None:
            return 0.0
        return float(match.group(1) + '.' + (match.group(2) or '0'))

    ## Get Beer List
    def get_beer_list(self, log_info = False):
        """Collect the product URLs of all search result pages in ``self.beer_list``.

        Starting at ``self.start_url`` the method follows the '>' pagination
        link until there is none left (or until it points to a page that was
        already visited). Before every request it sleeps
        ``self.friendly_scraping_time`` seconds.

        Parameters
        ----------
        log_info : bool, default False
            Print the current page, the sleep and the running URL count.

        Raises
        ------
        requests.HTTPError
            If a result page answers with an error status.
        """
        # Start from scratch so that re-running the cell does not duplicate URLs
        self.beer_list = []
        visited_pages = set()
        current_link = self.start_url

        while current_link is not None and current_link not in visited_pages:
            visited_pages.add(current_link)

            if log_info:
                print(current_link + ' Sleeping for ' + str(self.friendly_scraping_time) + ' Seconds')

            # Gently scraping
            time.sleep(self.friendly_scraping_time)

            if log_info:
                print('Sleep done')

            beer_list_page = self._fetch(current_link)

            # Every product tile on the page holds the link to one beer
            for beer in beer_list_page.find_all('div', class_= 'search-service-ProductTileContent'):
                anchor = beer.find('a', href = True)
                if anchor is None:
                    # Tile without a link, nothing to scrape later
                    continue
                self.beer_list.append(urljoin(self.BASE_URL, anchor['href']))

            if log_info:
                print('Total Nr. of Beers found: '+ str(len(self.beer_list)))

            # The pagination link to the next page is labelled '>'
            next_link = beer_list_page.find_all('a', href = True, string ='>')
            current_link = urljoin(current_link, next_link[0]['href']) if next_link else None

        print('Done')

    ### Method get_beer, scrapes single beer page ###
    def get_beer(self,beer_url):
        """Scrape one product page.

        Parameters
        ----------
        beer_url : str
            URL of the product page.

        Returns
        -------
        list
            ``[title, price, price_per_liter]``; ``price_per_liter`` is ``0.0``
            when the page states no unit price.

        Raises
        ------
        requests.HTTPError
            If the page answers with an error status.
        ValueError
            If the title or the price cannot be found or parsed.
        """
        beer_parsed = self._fetch(beer_url)

        # Title
        beer_title = self._required_text(beer_parsed, 'h1', 'pd-QuickInfo__heading', beer_url)

        # The price is split over two elements: integer part and decimal part
        integer_price = self._required_text(beer_parsed, 'span', 'pd-price__predecimal', beer_url)
        decimal_price = self._required_text(beer_parsed, 'span', 'pd-price__decimal', beer_url)

        # Full Price, strip surrounding whitespace, concat and convert to float
        beer_price = float(integer_price.strip()+'.'+decimal_price.strip())

        # Info string holding the unit price; a missing element is treated like a missing unit price
        info_element = beer_parsed.find('div', class_='rs-qa-price-base pd-Grammage pd-Grammage--Detail')
        info_string = info_element.text if info_element is not None else ''
        price_per_liter = self._parse_price_per_liter(info_string)

        # Collect all the beer info in a list
        return [beer_title, beer_price, price_per_liter]


In [48]:
# Sending a browser User-Agent is a small cheat: an earlier run got blocked after
# errors in the parallel threads caused an endless stream of requests.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

# Create the scraper for the beer search results.
# It waits one minute before every result-page request. That is slow, but
# scraping faster got blocked before.
scraper = BeerScraper(
    start_url='https://shop.rewe.de/c/getraenke/?search=Bier',
    user_agent=user_agent,
    friendly_scraping_time=60,
)

In [19]:
# Walk through all search result pages and collect the product URLs, logging the progress
scraper.get_beer_list(log_info=True)

https://shop.rewe.de/c/getraenke/?search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 40
https://shop.rewe.de/c/getraenke/?page=2&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 80
https://shop.rewe.de/c/getraenke/?page=3&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 120
https://shop.rewe.de/c/getraenke/?page=4&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 160
https://shop.rewe.de/c/getraenke/?page=5&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 200
https://shop.rewe.de/c/getraenke/?page=6&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 240
https://shop.rewe.de/c/getraenke/?page=7&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 280
https://shop.rewe.de/c/getraenke/?page=8&search=Bier Sleeping for 60 Seconds
Sleep done
Total Nr. of Beers found: 320
https://shop.rewe.de/c/getraenke/?page=9&search=Bier Sleeping for

In [53]:
# Parallel scraping of all beers in the lis.

%%time

# Defining a worker 
def worker(i):
    while True:
        item = q.get()
        if item == 'break':
            break
        results = scraper.get_beer(item)
        r.append(results)
        q.task_done()

# Making a queue and results list
q = queue.Queue()
r = []
for b in beer_list:
    q.put(b)


# Pooling workers to get all beers
with ThreadPoolExecutor(50) as pool:
    futures = pool.map(worker, range(50))

    # block until all tasks are done
    q.join()
    
    # Tell all our workers to stop
    for i in range(50):
        q.put('break')


CPU times: user 3min 13s, sys: 7.7 s, total: 3min 21s
Wall time: 3min 7s


In [70]:
# Turn the scraped rows into a table and save it

# First entry holds the column names, followed by one entry per beer
results = [['Name','Price','Price per Liter']]
results.extend(r)

# Header row becomes the column labels, the remaining rows the data
beer_df = pd.DataFrame(data=results[1:], columns=results[0])

# Persist the table as a csv file
beer_df.to_csv(path_or_buf='Beer_Prices.csv')