In [2]:
import time
import requests
import sys
from bs4 import BeautifulSoup
import os
import random


In [3]:
# Here are the functions to get the information needed from the soups
def get_wine_info(soup):
    """Collect every scraped attribute of one wine page into a flat list.

    Parameters
    ----------
    soup : parsed product page, passed unchanged to each ``get_*`` helper.

    Returns
    -------
    list
        ``[price, varietal, year, ABV, is_green, is_collectable, is_boutique,
        is_screw, is_magnum, is_half, is_3L, origin]`` -- origin comes last.
    """
    # Extractors are invoked in this order; the returned list uses a different
    # order (see output_order), with origin moved to the end.
    call_order = (
        ('price', get_price),
        ('varietal', get_varietal),
        ('origin', get_origin),
        ('year', get_year),
        ('ABV', get_ABV),
        ('is_green', get_is_green),
        ('is_collectable', get_is_collectable),
        ('is_boutique', get_is_boutique),
        ('is_screw', get_is_screw),
        ('is_magnum', get_is_magnum),
        ('is_half', get_is_half),
        ('is_3L', get_is_3L),
    )
    fields = {label: extract(soup) for label, extract in call_order}

    output_order = ('price', 'varietal', 'year', 'ABV', 'is_green',
                    'is_collectable', 'is_boutique', 'is_screw',
                    'is_magnum', 'is_half', 'is_3L', 'origin')
    return [fields[label] for label in output_order]

def get_varietal(soup):
    """Return the text of the page's ``prodItemInfo_varietal`` span.

    Raises AttributeError if the span is absent (``find`` returns None).
    """
    varietal_span = soup.find('span', {'class': 'prodItemInfo_varietal'})
    return varietal_span.get_text()

def get_price(soup):
    """Return the regular price shown on the page as a float.

    The page splits the price into a whole-dollar span and a fractional
    (cents) span; thousands separators in the dollar part are removed
    before the two are recombined.
    """
    whole_span = soup.find('span', {'class': 'productPrice_price-regWhole'})
    dollars = whole_span.get_text().replace(',', '')

    fractional_span = soup.find('span', {'class': 'productPrice_price-regFractional'})
    cents = fractional_span.get_text()

    return float(dollars + '.' + cents)

def get_origin(soup):
    """Return the text of the page's ``prodItemInfo_originText`` span.

    Raises AttributeError if the span is absent (``find`` returns None).
    """
    origin_span = soup.find('span', {'class': 'prodItemInfo_originText'})
    return origin_span.get_text()

def get_ABV(soup):
    """Return the text of the page's ``prodAlcoholPercent_percent`` span.

    The value is returned as the raw string found on the page; no numeric
    conversion is performed. Raises AttributeError if the span is absent.
    """
    abv_span = soup.find('span', {'class': 'prodAlcoholPercent_percent'})
    return abv_span.get_text()

def get_is_green(soup):
    """Return 1 if the page contains the green-wine attribute icon, else 0."""
    green_icon = soup.find('li', {'class': 'icon icon-greenWine prodAttr_icon prodAttr_icon-greenWine'})
    return 1 if green_icon else 0
    
def get_is_collectable(soup):
    """Return 1 if the page contains the collectible attribute icon, else 0."""
    collectible_icon = soup.find('li', {'class': 'icon icon-collectible prodAttr_icon prodAttr_icon-collectible'})
    return 1 if collectible_icon else 0
    
def get_is_boutique(soup):
    """Return 1 if the page contains the boutique attribute icon, else 0."""
    boutique_icon = soup.find('li', {'class': 'icon icon-boutique prodAttr_icon prodAttr_icon-boutique'})
    return 1 if boutique_icon else 0
    
def get_is_screw(soup):
    """Return 1 if the page contains the screw-cap attribute icon, else 0."""
    screwcap_icon = soup.find('li', {'class': 'icon icon-screwcap prodAttr_icon prodAttr_icon-screwcap'})
    return 1 if screwcap_icon else 0
    
def get_year(soup):
    """Return the last whitespace-separated token of the product heading.

    The heading is the ``h1`` element with class ``pipName``; this function
    treats its final word as the year. No check is made that the token is
    actually numeric.

    Splitting on any run of whitespace (rather than on a single space) means
    trailing spaces or newlines in the heading text no longer produce an
    empty or whitespace-only result.

    Returns
    -------
    str
        The last token of the heading, or ``''`` if the heading has no text.

    Raises
    ------
    AttributeError
        If the page has no ``h1.pipName`` element (``find`` returns None).
    """
    heading_text = soup.find('h1', {'class': 'pipName'}).get_text()
    tokens = heading_text.split()
    # An empty heading yields no tokens; keep returning '' in that case.
    return tokens[-1] if tokens else ''

def get_is_magnum(soup):
    """Return 1 if the product heading (``h1.pipName``) contains '1.5', else 0."""
    heading_text = soup.find('h1', {'class': 'pipName'}).get_text()
    return 1 if '1.5' in heading_text else 0

def get_is_half(soup):
    """Return 1 if the product heading (``h1.pipName``) contains '375', else 0."""
    heading_text = soup.find('h1', {'class': 'pipName'}).get_text()
    return 1 if '375' in heading_text else 0

def get_is_3L(soup):
    """Return 1 if the product heading (``h1.pipName``) contains '3 Liter', else 0."""
    heading_text = soup.find('h1', {'class': 'pipName'}).get_text()
    return 1 if '3 Liter' in heading_text else 0


In [17]:
# Load the list of product-page URLs to scrape (one URL per line).

filename = 'URL_files/AllURLS.txt'

# Uncomment the next line to re-run only the URLs recorded as failed by the
# scraping cell, in order to fill in the missing data.

#filename = 'FailedURLS'

with open(filename) as f:
    # Strip surrounding whitespace (including the trailing newline) and drop
    # blank lines, so an empty line in the file is never treated as a URL.
    wine_list = [line.strip() for line in f if line.strip()]

# File that the header and the scraped rows are appended to.
outputfile = 'data/AllData.txt'


In [5]:
# Output header to the output file.
#
# The output file is opened in append mode, so the header is only written
# when the file does not exist yet or is still empty. Re-running this cell
# therefore does not insert duplicate header lines into already-scraped data.

header_line = 'price,varietal,year,ABV,is_green,is_collectable,is_boutique,is_screw,is_magnum,is_half,is_3L,origin,origin2,origin3,origin4\n'

if not os.path.exists(outputfile) or os.path.getsize(outputfile) == 0:
    with open(outputfile, 'a+') as filehandle:
        filehandle.write(header_line)



In [18]:
# Scrape every URL in wine_list with requests + BeautifulSoup.
#
# Rows are buffered in wineData and appended to outputfile in batches of 20,
# so that an interruption loses at most one partial batch. Each row is written
# using the str() form of the list returned by get_wine_info.
#
# A URL is appended to the 'FailedURLS' file (and the loop moves on) when:
#   * the HTTP request itself raises (connection error, timeout, bad URL),
#   * the response status code is not 200, or
#   * the page is missing an element the get_* helpers expect.

wineData = []

for url in wine_list:
    failure_message = None

    try:
        # Explicit timeout: without one, requests can wait indefinitely on a
        # stalled connection and hang the whole scrape.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        failure_message = "Request for " + url + " failed (" + str(err) + "). Did not scrape!"
    else:
        if response.status_code == 200:
            try:
                wineData.append(get_wine_info(soup=BeautifulSoup(response.text, "lxml")))
            except (AttributeError, IndexError, ValueError) as err:
                # AttributeError: soup.find() returned None for an expected element.
                # ValueError: the price text could not be converted to float.
                failure_message = "Could not parse " + url + " (" + str(err) + "). Did not scrape!"
        else:
            failure_message = "Status code for " + url + " was " + str(response.status_code) + ". Did not scrape!"

    if failure_message is not None:
        print(failure_message)
        with open('FailedURLS', 'a+') as filehandle:
            filehandle.write(url + '\n')
    elif len(wineData) >= 20:
        # Output data every 20 pages in case of error.
        with open(outputfile, 'a+') as filehandle:
            filehandle.writelines("%s\n" % wine for wine in wineData)
        wineData = []
        print('Writing to file...')

    # Random pause of 0.5-5.5 seconds between requests.
    time.sleep(.5 + 5 * random.random())

# Write whatever is left in the final, partial batch.
with open(outputfile, 'a+') as filehandle:
    filehandle.writelines("%s\n" % wine for wine in wineData)