# 1. Load URL to Python using selenium

We use Selenium to extract data from the red wine ranking page of Vivino.com.
Selenium lets us browse and scrape the page as if we were a regular user, which helps us avoid errors and being blocked by the website.


In [9]:
#import relevant packages used in this script
import re
from selenium import webdriver
import pandas as pd
import statistics
import csv
from time import sleep

# Start a Chrome browser session controlled by Selenium.
# `driver` is the handle the scraping cells below use to load, scroll and click the page.
# No custom Chrome options are passed, so the browser runs with its default settings.
driver = webdriver.Chrome()

In [10]:
# Open the Vivino "explore" page in the Selenium-controlled browser.
# NOTE(review): the `e=` query parameter presumably encodes the pre-selected filters
# (the red wine ranking mentioned in the introduction) - confirm before editing the URL.
driver.get('https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1UMtNrLA1NTBQS6609fNRS7Z1DQ1SKwDKpqfZliUWZaaWJOao5Rel2KrlJ1XaqpWXRMfaGgIAb8QUBg%3D%3D')


# 2. Set up correct settings for chromedriver

In [None]:
# Use the full screen so the filter sidebar is laid out, then scroll 1000px down the page.
driver.maximize_window()
driver.execute_script('window.scrollTo(0, 1000)')

# Wait 20 seconds so the Regions in the filter sidebar have time to load.
sleep(20)


In [4]:
# Dismiss the cookie-consent banner by clicking its "OK" button.
# find_element(By.XPATH, ...) is used instead of find_element_by_xpath because the
# find_element_by_* helpers were removed in Selenium 4.3; this form also works on Selenium 3.
from selenium.webdriver.common.by import By
element = driver.find_element(By.XPATH, "//span[text()='OK']")
element.click()

In [5]:
# Take a snapshot of the HTML currently rendered in the browser and parse it,
# so the page can be searched with BeautifulSoup instead of through the live driver.
from bs4 import BeautifulSoup
res = driver.page_source.encode('utf-8')
soup = BeautifulSoup(res, "html.parser")

# 3. Define Functions

In [7]:
# Create the list of Region names offered in the filter sidebar.
# Index 1 selects the second "filterPills" group on the page, whose pills are the Regions.
# The group is looked up once instead of re-running both searches for every Region.
region_pills = soup.find_all(attrs={"class": "filterPills__items--_grOA"})[1].find_all(attrs={"class": "pill__text--24qI1"})

# Only the first six pills are used; indexing (rather than slicing) keeps the IndexError
# that signals the sidebar was not fully loaded when the page was parsed.
total_regions_list = [region_pills[counter].text for counter in range(6)]


In [8]:
# Print the collected Region names for a manual check before the scraping starts.
print(total_regions_list)

['Bordeaux', 'Bourgogne', 'Napa Valley', 'Piemonte', 'Rhone Valley', 'Toscana']


In [9]:
def get_the_url(Region):
    """Click the filter pill labelled ``Region`` and return the browser's URL afterwards.

    The pill is located as the first ``<span>`` whose text equals ``Region`` exactly.
    Each call performs one click, so calling it again for the same Region clicks the
    same pill a second time.

    Args:
        Region: the visible label of the Region pill, e.g. an entry of
            ``total_regions_list``.

    Returns:
        ``driver.current_url`` as read immediately after the click.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was removed in
    # Selenium 4.3; this form is available on Selenium 3 as well.
    from selenium.webdriver.common.by import By

    # NOTE: Region is spliced into the XPath string literal, so a label containing a
    # single quote would produce an invalid expression.
    element = driver.find_element(By.XPATH, "//span[text()='" + Region + "']")
    element.click()
    return driver.current_url

In [11]:
# Upper bound on the number of scroll steps scroll_page performs per Region;
# chosen high enough to reach 500 wines.
the_range = 500

In [12]:
def _parse_price(raw_price):
    """Turn a price label in European notation into a float, or the sold-out marker text."""
    # Drop the currency sign, remove "." thousands separators, and use "." as decimal mark.
    cleaned = raw_price.replace("€", "").replace(".", "").replace(",", ".")
    try:
        # The amount is the last space-separated token of the label.
        return float(cleaned.split(" ")[-1])
    except ValueError:
        # The label holds no number (the card shows a sold-out text instead of a price).
        return "Sold out, no price available"


def scroll_page(Region, max_wines=500):
    """Scrape the wines listed for ``Region`` into ``<Region>.csv`` while scrolling the page.

    The Region filter is clicked, then the page is scrolled down in steps of 1000px
    (at most ``the_range`` steps). Whenever a step reveals more wine cards than the
    previous one, ALL cards currently on the page are appended to the CSV file as one
    "scraping round". The raw file therefore contains the same wines several times;
    ``filter_file`` keeps only the last round.

    Each appended row is: scraping round, Region, 1-based position of the card, name,
    price (float, or the text "Sold out, no price available"), number of ratings (int),
    average rating (float).

    Args:
        Region: label of the Region pill to scrape; also used as the CSV file name.
        max_wines: stop scrolling once at least this many wine cards are on the page.

    Raises:
        IndexError: if the result summary or an expected element of a wine card is missing.
        ValueError: if the number of ratings or the average rating is not numeric.
    """
    from time import sleep

    # Select the Region and wait for the page to apply the filter BEFORE reading the
    # result summary, so the total below belongs to this Region and not to the old view.
    get_the_url(Region)
    sleep(6)

    soup = BeautifulSoup(driver.page_source.encode('utf-8'), "html.parser")

    # Name of the raw output file.
    fullpath = str(Region) + ".csv"

    # Total number of wines displayed for this filter: the first all-digit word of the summary.
    summary_text = soup.find_all(attrs={"class": "querySummary__querySummary--39WP2"})[0].text
    numbers = [int(word) for word in summary_text.split() if word.isdigit()][0]

    scraping_round = 1   # counts how many times the full card list has been written
    scroll_range = 0     # vertical scroll position in pixels
    previous_count = 0   # number of cards seen in the previous step

    for _ in range(the_range):
        soup = BeautifulSoup(driver.page_source.encode('utf-8'), "html.parser")

        # Look the cards up once per step instead of once per field per card.
        wine_cards = soup.find_all(attrs={"data-testid": "wineCard"})
        current_count = len(wine_cards)

        # Only write a new round when scrolling actually loaded additional cards.
        if current_count > previous_count:
            with open(fullpath, mode='a', newline='', encoding='utf-8') as csv_file:
                writer = csv.writer(csv_file)
                for position, card in enumerate(wine_cards, start=1):
                    the_name_id = card.find_all(attrs={"class": "wineInfoVintage__vintage--VvWlU wineInfoVintage__truncate--3QAtw"})[0].text
                    # The price sits in one of three elements: buy button, price text, or sold-out text.
                    the_price_id = card.find_all(True, {"class": ["addToCartButton__price--qJdh4" , "addToCart__subText--1pvFt addToCart__ppcPrice--ydrd5", "addToCart__subText--1pvFt addToCart__soldOut--1dP2Z"]})[0].text
                    the_reviews_id = card.find_all(attrs={"class": "vivinoRating__caption--3tZeS"})[0].text
                    the_rating_id = card.find_all(attrs={"class": "vivinoRating__averageValue--3Navj"})[0].text

                    # Reshape the values so they are usable in further analysis.
                    the_price_id = _parse_price(the_price_id)
                    the_reviews_id = int(the_reviews_id.split(" ")[0])  # leading token of the caption
                    the_rating_id = float(the_rating_id)

                    writer.writerow([scraping_round, Region, position, the_name_id, the_price_id, the_reviews_id, the_rating_id])

            scraping_round += 1

        # Scroll a further 1000px down so the page loads more wines.
        scroll_range += 1000
        driver.execute_script('window.scrollTo(0, ' + str(scroll_range) + ')')

        # Remember how many cards this step saw, to compare with the next step.
        previous_count = current_count

        # Stop once enough wines are scraped or every wine of the Region is on the page.
        if previous_count >= max_wines or previous_count == numbers:
            break

        # Pause for 1 second to let the newly requested wines load.
        sleep(1)

    # Click the Region pill a second time.
    # NOTE(review): presumably this switches the filter off again so the next Region
    # starts from an unfiltered page - confirm against the website's behaviour.
    get_the_url(Region)

In [13]:
def filter_file(Region):
    """Write the rows of the last scraping round of ``<Region>.csv`` to ``<Region> filtered.csv``.

    The raw file holds one row per wine per scraping round, with the round number in
    the first column. Only the rows of the highest round number are kept, and a header
    row is placed above them.

    Args:
        Region: Region name; the raw file is ``Region + ".csv"`` and the output file is
            ``Region + " filtered.csv"``, both in the current working directory.

    Raises:
        FileNotFoundError: if the raw file does not exist.
        ValueError: if the raw file has no rows or a round number is not an integer.
    """
    # Define filenames.
    fullpath = Region + ".csv"
    filtered_file = Region + ' filtered.csv'
    header = ["Scraping Round", "Region", "Counter", "Name", "Price", "Number of Ratings", "Average Rating"]

    def round_of(row):
        # Round number from the first column; thousands separators are tolerated.
        return int(row[0].replace(',', ''))

    # Read the raw file once and close it again. utf-8 is the encoding the raw file is
    # written with, so wine names with accents are decoded correctly on every platform.
    with open(fullpath, 'r', newline='', encoding='utf-8') as fin:
        rows = [row for row in csv.reader(fin) if row]  # blank lines carry no data

    # The maximum (a.k.a. last) scraping round; max() raises ValueError on an empty file.
    last_round = max(round_of(row) for row in rows)

    # Write the header followed by all wines scraped in the last round.
    with open(filtered_file, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)
        writer.writerows(row for row in rows if round_of(row) == last_round)
                    
              
                

# 4. Call Functions

In [1]:
# For every Region: scrape its wines into "<Region>.csv" (scroll_page), then reduce that
# raw file to the rows of the last scraping round in "<Region> filtered.csv" (filter_file).
# Requires total_regions_list from the cell above to be defined in the running kernel.
for Region in total_regions_list: 
    scroll_page(Region)
    filter_file(Region)


NameError: name 'total_regions_list' is not defined

# 5. Clean Output Folder

In [None]:
# Delete the raw per-Region files that contain the duplicated data;
# all the "<Region> filtered.csv" files stay in the folder.
import os
for Region in total_regions_list:
    fullpath = "{}.csv".format(Region)
    os.remove(fullpath)

In [None]:
def merge_file():
    """Merge all ``<Region> filtered.csv`` files into one ``All_regions_data.csv``.

    Files are taken from the current working directory and concatenated in
    alphabetical order of their file names; the result is written without an
    index column.

    Raises:
        ValueError: raised by ``pandas.concat`` when no filtered file is found.
    """
    import glob
    import pandas as pd

    # Match only the filtered per-Region files. A plain "*.csv" would also pick up
    # All_regions_data.csv from an earlier run (duplicating every row on a re-run)
    # and any raw "<Region>.csv" file that has not been deleted yet.
    all_filenames = sorted(glob.glob('* filtered.csv'))

    # Combine all files in the list.
    combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])

    # Export to csv.
    combined_csv.to_csv("All_regions_data.csv", index=False)

In [None]:
# Run the merge: combines the CSV files in the working directory into "All_regions_data.csv".
merge_file()