# 1. Load URL to Python using Selenium

In [1]:
#import relevant packages used in this script
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

import pandas as pd
import statistics
import csv
from time import sleep

#load chrome webdriver; `driver` is the browser session that the functions below use as a global
driver = webdriver.Chrome() #started with default settings, no chrome options are passed

In [2]:

#open the Vivino home page in the chromedriver window
driver.get('https://www.vivino.com')


### Why selenium? 
As you can see we are using selenium to scrape our data. Selenium lets us scrape the page as if we are users: this helps us avoid errors and being blocked by the website. 

# 2. Set up correct settings for chromedriver window

### To be able to scrape the data in our interest, it is important to set up the correct chromedriver window. Via selenium we let the computer navigate to all the correct settings. 


<b>Maximize Chromedriver window:</b> We maximize the chromedriver window to be able to scrape all the correct data. Besides, it makes our scraper more universally usable, since now it is sure that for all users of our scraper the chromedriver window is maximized and thus the scraper will scrape the right code. 

<b>Click cookie consent button:</b> To make our code more neat, we let the computer click the cookie consent button. Besides that it makes the scrolling process more visible. 

<b>Navigate to catalogue page:</b>
Since we want to scrape the wines from Vivino, it is an important step to let the computer navigate to the wines page. 

<b>Navigate to all red wines:</b>
Our sample size only contains red wines. That is why we let the computer filter for red wines only. The default buttons for the filter are "Red wines" and "White wines". Therefore, we let the computer click on the "White wines" filter, to actually de-filter for "White wines". Only the red wines are left now. 

<b>Click sort button:</b>
The default value of the sort by function is sort on highest rating. Since this would bias our research we adjust this to sort by price from low to high. To be able to click an option, first the sort button has to be clicked on. 

<b>Sort from low to high price:</b>
Now the options of the sort button are visible, we click price: low to high, as said before. 

<b>Scroll a bit lower to be able to see price range and average rating filter</b>
To be able to check whether the price range and the average rating filter will be set correctly we scroll a bit downwards

<b>Set price range from 0 to 500+:</b>
The default value of the price range is €7 - €30. In the scope of our research motivation we change this to the full price range. This is done by the "drag and drop by offset" function within the ActionChains module. 

<b>Show wine of all ratings:</b>
The default value of the average rating for which wines are shown, is all wines with rating 3.8 or higher. In the interest of our research motivation it is an important step to change this filter. We let the computer click on "Show all ratings". 

<b>Scroll down to load regions button:</b>
Since we will scrape the first 500 wines shown for all different regions, it is important to load the regions button. For this, we need to scroll down to the place these buttons are visible, since only then the buttons will load and consequently can be scraped. 

#### Final Window: 
Ultimately, the page we are going to scrape is a maximized window that contains the Wines page of Vivino.com, filtered on: Red Wines, Full Price Range, All Average Ratings, and sorted by price from low to high. 

In [3]:
def prepare_window(): 
    """Bring the chromedriver window into the state the scraper expects.

    Uses the module-level ``driver``. In order, it maximizes the window,
    clicks the cookie consent button, opens the catalogue page, clicks the
    wine-type filter, sets the sort order to "price: low to high", drags both
    price slider handles outwards, clicks the "all ratings" option and scrolls
    down so the regions filter is loaded.

    All elements are located with hard-coded XPaths, so a change in the page
    layout makes ``find_element`` raise ``NoSuchElementException``.
    Returns nothing.
    """
    def click(xpath, pause=2):
        #click the element found by xpath, then give the page time to react
        driver.find_element(By.XPATH, xpath).click()
        sleep(pause)

    #maximize chromedriver window 
    driver.maximize_window()

    #sleep 5 seconds to make sure page is loaded
    sleep(5)
    #click cookie consent button
    click('//*[@id="cookie-notice-container"]/div/button/span/span')
    #navigate to catalogue page
    click('//*[@id="navigation-container"]/div/div[2]/div/div[1]/div/a/span[2]')
    #navigate to all red wines
    click('//*[@id="explore-page-app"]/div/div/div[2]/div[1]/div/div[1]/div[2]/label[2]/div')
    #click "sort" button to be able to change sort by
    click('//*[@id="explore-page-app"]/div/div/div[1]/div[2]/div')
    #click "price: low to high" within sort button
    click('//*[@id="menu-"]/div[3]/ul/li[4]')
    #scroll a bit lower to be able to see price range and average rating filter
    driver.execute_script('window.scrollTo(0, 500)')

    #set price range from €0 to €500+ by dragging the lower handle to the left
    #and the upper handle to the right
    slider = '//*[@id="explore-page-app"]/div/div/div[2]/div[1]/div/div[2]/div[2]/div[2]/div'
    lower_handle = driver.find_element(By.XPATH, slider + '/div[4]')
    upper_handle = driver.find_element(By.XPATH, slider + '/div[5]')
    ActionChains(driver).drag_and_drop_by_offset(lower_handle, -50, 0).perform()
    ActionChains(driver).drag_and_drop_by_offset(upper_handle, 200, 0).perform()
    sleep(2)
    #show wines of all average ratings
    click('//*[@id="1"]')
    #scroll down to see regions filter, and wait 15 seconds to be sure all regions are loaded
    driver.execute_script('window.scrollTo(0, 1000)')
    sleep(15)
    

In [4]:
#call prepare_window function; its built-in sleeps add up to roughly half a minute
prepare_window()

In [5]:
#parse the prepared page once; this `soup` is used further below to read the region names
#from the filter pills. BeautifulSoup is also used inside scroll_page.
from bs4 import BeautifulSoup
res = driver.page_source.encode('utf-8')
soup = BeautifulSoup(res, "html.parser")

# 3. Define Functions

#### Get_the_url function
We defined a get_the_url function. The input value of this function is one of the Regions we want to scrape. The computer then tries to find a filter button linked to this Region and clicks it. By doing this, the page will be filtered for that Region and can then be scraped. 

In [6]:
#define get_the_url function which automatically clicks the Region to scrape in the filter on the website
def get_the_url(Region):
    """Click the ``<span>`` whose text equals *Region* and return the current URL.

    Uses the module-level ``driver``.

    Region: exact text of the span to click, e.g. 'Bordeaux'. A name that
        contains both ' and " cannot be expressed with this simple XPath.
    Returns: ``driver.current_url`` read right after the click.
    Raises: ``NoSuchElementException`` (from selenium) if no such span exists.
    """
    #quote the name with whichever quote character it does not contain, so a
    #name with an apostrophe does not produce an invalid XPath
    quote = '"' if "'" in Region else "'"
    element = driver.find_element(By.XPATH, "//span[text()=" + quote + Region + quote + "]")
    element.click()
    return driver.current_url

In [7]:
#maximum number of scroll steps per Region; chosen high enough to scrape 500 wines
the_range = 500

#### Scroll_page function:
We defined a scroll_page function. The input value of this function is also one of the Regions. Then, the get_the_url function is called within the scroll_page function. So for example if we say scroll_page("Bordeaux"), the scroll_page function will first navigate to the correct url (the wines that are filtered by Region "Bordeaux"). 

Then, a variable <b>"numbers"</b> is created, which tells the total amount of wines that is shown on the page. This variable will later on be used to stop the scraper when all wines are scraped. 

To be able to later on filter out duplicate wines, we create a variable <b>"scraping_round"</b>. For every time the full page is being scraped, the variable scraping_round will go up by 1. 

Then, a for-loop is created. This first for-loop scrolls through the website. As seen at the bottom of the function, scroll_range goes up by 1000 every time it goes through the loop. 

To <b>make our scraper more efficient</b>, we only scrape the page when there are more wines visible than in the previous scroll loop. If this is the case, the computer will perform the next for-loop: the scraping. The for loop scrapes all wines visible in the current window. To be precise, it scrapes 4 attributes of all the wines: the name, the price, the number of reviews and the average rating. These are all found by searching via the find_all function within BeautifulSoup. 

The attributes that are scraped from the wines are then <b>transformed</b> for our users to better be able to work with the output data. 

Then, the transformed attributes per wine are written to a <b>csv</b> file that holds the name of the Region. 

To make sure our scroller does not unnecessarily scroll the full number of times set by the high value of the_range, we create an <b>if-statement</b> which stops the scroller when the total number of wines scraped per Region reaches a given number (in this case 500), or when the total number of wines scraped already equals the total number of wines that are available for scraping. 

In [8]:
def scroll_page(Region):    
    """Scroll through the wines of *Region* and append them to ``<Region>.csv``.

    Uses the module-level ``driver`` and ``the_range``. The function first
    calls ``get_the_url(Region)`` to click the Region filter, then repeats at
    most ``the_range`` times: parse the page, and if more wine cards are
    present than in the previous pass, append one row per card to the csv
    file, then scroll 1000 pixels further down.

    Every pass writes ALL cards currently in the page, so the file contains
    duplicates; the first column (scraping round) identifies the pass.
    Row layout: scraping round, Region, name, price, number of reviews,
    average rating. The price is a float, or the text
    "Sold out, no price available" when no number can be parsed.

    The loop stops early once 500 or more cards are in the page, or once the
    card count equals the total taken from the page heading. Finally
    ``get_the_url(Region)`` is called a second time (presumably to switch the
    Region filter off again before the next Region - TODO confirm).

    Region: text of the region filter, also used as the csv file name.
    Returns nothing.
    """
    #class names of the elements that can hold the price text of a wine card
    price_classes = ["addToCartButton__price--qJdh4" , "addToCart__subText--1pvFt addToCart__ppcPrice--ydrd5", "addToCart__subText--1pvFt addToCart__soldOut--1dP2Z"]

    #call get_url function to get the correct url linked to the Region that is going to be scraped
    get_the_url(Region)      

    #give the filtered result list time to load
    sleep(6)

    #create variable numbers which finds the total amount of wines displayed at the page that is being scraped
    #(the first whitespace-separated token of the heading that consists of digits only)
    all_wines = driver.find_element(By.XPATH, '//*[@id="explore-page-app"]/div/div/h2').text
    numbers = [int(word) for word in all_wines.split() if word.isdigit()][0]

    #name the file that is going to be created 
    fullpath = str(Region) + ".csv"

    #create scraping_round variable that counts the amount of times all the wines are being scraped 
    scraping_round = 1
    scroll_range = 0
    num_wines_view_2 = 0

    #create a scroller that scrolls the_range amount of times.     
    for _ in range(the_range):
        res = driver.page_source.encode('utf-8')
        soup = BeautifulSoup(res, "html.parser")

        #look the wine cards up once per pass instead of once per attribute
        wine_cards = soup.find_all(attrs={"data-testid" : "wineCard"})

        # total number of wines in current view
        num_wines_view_1 = len(wine_cards)

        #only scrape when new wines were loaded since the previous pass
        if num_wines_view_1 > num_wines_view_2:
            #open the csv file once per pass and append one row per wine
            with open(fullpath, mode='a', newline='', encoding='utf-8') as csv_file:
                writer = csv.writer(csv_file)
                for card in wine_cards:
                    the_name_id = card.find_all(attrs={"class": "wineInfoVintage__vintage--VvWlU wineInfoVintage__truncate--3QAtw"})[0].text
                    the_price_id = card.find_all(True, {"class": price_classes})[0].text
                    the_reviews_id = card.find_all(attrs={"class": "vivinoRating__caption--3tZeS"})[0].text
                    the_rating_id = card.find_all(attrs={"class": "vivinoRating__averageValue--3Navj"})[0].text

                    #reshape the_price_id: drop the euro sign and thousands dots, turn the decimal
                    #comma into a dot and convert the last space-separated token to a float
                    price_text = the_price_id.replace("€", "").replace(".", "").replace(",", ".")
                    try:
                        the_price_id = float(price_text.split(" ")[-1])
                    except ValueError:
                        #no number in the price text
                        the_price_id = "Sold out, no price available"

                    #reshape the_reviews_id: keep the leading number of the caption
                    the_reviews_id = int(the_reviews_id.split(" ")[0])

                    #write variables per wine into csv file 
                    writer.writerow([scraping_round, Region, the_name_id, the_price_id, the_reviews_id, the_rating_id])

            #scraping_round counter +1        
            scraping_round += 1

        #the scroller scrolls in range +1000 in every loop 
        scroll_range += 1000
        driver.execute_script('window.scrollTo(0, ' + str(scroll_range) + ')')

        #remember the number of wines of this pass to compare with the next pass
        num_wines_view_2 = num_wines_view_1

        #break loop if 500 wines are scraped
        if num_wines_view_2 >= 500: 
            break
        #break loop if total number of wines are scraped
        if num_wines_view_2 == numbers:
            break

        # pause for 1 second
        sleep(1)   
    get_the_url(Region)

#### Filter_file Function

Now, the output file we created in the scroll_page function contains a lot of duplicate rows. This is because the scraper we created scrapes the full page every time. This is where the variable "scraping_round" comes to use: we will filter the csv file so as to only leave the wines of the last scraping round, a.k.a. all the wines visible in the last (furthest scrolled) window. The filtered files are saved as a new file: Region + " filtered.csv"

In [9]:
def filter_file(Region):
    """Write the rows of the last scraping round to ``<Region> filtered.csv``.

    Reads ``<Region>.csv``, takes the highest scraping round found in the
    first column and writes a header line followed by all rows of that round
    to ``<Region> filtered.csv``. Both files are handled as UTF-8, which is
    the encoding scroll_page uses when writing ``<Region>.csv``.

    Region: file name stem (a path prefix also works).
    Returns nothing. An input file without rows yields a header-only output.
    Raises: FileNotFoundError if ``<Region>.csv`` does not exist; ValueError
        if a first column does not hold an integer.
    """
    #define filenames
    fullpath = Region + ".csv"
    filtered_file = Region + ' filtered.csv'
    header = ["Scraping Round", "Region", "Name", "Price", "Number of Ratings", "Average Rating"]

    def round_of(row):
        #scraping round of a row; thousands separators are tolerated
        return int(row[0].replace(',', ''))

    #read the file once and close it again; blank lines are skipped
    with open(fullpath, 'r', newline='', encoding='utf-8') as fin:
        rows = [row for row in csv.reader(fin) if row]

    #assign a variable to the maximum (a.k.a. last) scraping round, stated in the first column of the csv file
    last_round = max((round_of(row) for row in rows), default=None)

    #write all wines that are scraped in the last scraping round to a csv "filtered_file".
    with open(filtered_file, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)
        writer.writerows(row for row in rows if round_of(row) == last_round)
                    
              
                

#### Delete_files Function
Now that we have an unfiltered and a filtered file per region, we will delete all the unfiltered files. 

In [10]:
#delete all files that contain the duplicated data, and keep all the filtered files in the folder
def delete_files(Region):
    """Remove the unfiltered file ``<Region>.csv``.

    Raises FileNotFoundError if that file does not exist.
    """
    import os
    raw_csv = Region + ".csv"
    os.unlink(raw_csv)

# 4. Call Functions

In [11]:
#create list of all regions
#the region pills are the "pill__text" elements inside the second "filterPills__items" container;
#look them up once instead of searching the whole page again for every region
region_pills = soup.find_all(attrs={"class": "filterPills__items--_grOA"})[1].find_all(attrs={"class": "pill__text--24qI1"})
#keep the first 6 regions (fewer if the page shows fewer)
total_regions_list = [pill.text for pill in region_pills[:6]]


In [12]:
#print the Regions list to check by eye whether it is correct before starting the scraper
print(total_regions_list)

['Bordeaux', 'Bourgogne', 'Napa Valley', 'Piemonte', 'Rhone Valley', 'Toscana']


In [None]:
#call the scroller and filter function for all Regions
for Region in total_regions_list: 
    #scrape the Region into "<Region>.csv" (contains duplicates, one batch per scraping round)
    scroll_page(Region)
    #keep only the last scraping round in "<Region> filtered.csv"
    filter_file(Region)
    #remove the unfiltered "<Region>.csv"
    delete_files(Region)

# 5. Create One Final output File 

#### Merge_files Function 
Now we are left with the filtered files per Region. We now merge all these files together to get one big final output file, containing the wines of all scraped Regions together. 

In [None]:
#define function to merge the filtered files to get a complete dataset
def merge_files():
    """Merge all ``* filtered.csv`` files of the working directory into one file.

    The files are read with pandas, concatenated in alphabetical file name
    order and written to ``All_regions_data.csv`` without an index column.

    Returns nothing.
    Raises: ValueError if the working directory holds no ``* filtered.csv`` file.
    """
    import glob
    #only pick up the per-Region files (named "<Region> filtered.csv"); a plain
    #'*.csv' pattern would also match All_regions_data.csv from an earlier
    #run and any unrelated csv file, which would duplicate or pollute the data
    all_filenames = sorted(glob.glob('* filtered.csv'))
    if not all_filenames:
        raise ValueError("No '* filtered.csv' files found to merge")
    #combine all files in the list
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
    #export to csv
    combined_csv.to_csv("All_regions_data.csv", index=False)

In [None]:
#call merge_files function; this writes All_regions_data.csv
merge_files()

#### Delete_filtered_files Function 
Now that we obtained the big final output file, all the loose filtered files per Region can be deleted. This makes the output folder more neat. 

In [None]:
def delete_filtered_files(Region):
    """Remove the filtered file ``<Region> filtered.csv``.

    Raises FileNotFoundError if that file does not exist.
    """
    #os must be imported here: elsewhere in this file it is only imported
    #inside other functions, so the name is not available at module level
    import os
    fullpath = Region + " filtered.csv"
    os.remove(fullpath)

In [None]:
#call delete_filtered_files function for all regions, removing each "<Region> filtered.csv"
for Region in total_regions_list: 
    delete_filtered_files(Region)