In [1]:
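# 1. Open the wine search page.
# 2. Read the total number of results pages.
# 3. Collect the links to each wine on a results page.
# 4. Scrape the product data from each of those links.
# 5. Move on to the next results page.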

In [2]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import re
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

In [7]:
def getnumberofpages(url = "https://www.majestic.co.uk/wine", timeout=30):
    """Return the number of result pages advertised on the search page.

    The count is taken as the first run of digits found in the markup of the
    second ``<span class="ws-hide">`` element on the page.

    Parameters
    ----------
    url : str
        Address of the search/listing page to inspect.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    int
        The page count parsed from the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    IndexError
        If the page has fewer than two matching spans, or the span holds
        no digits.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page and
    # surfacing a confusing IndexError further down.
    response.raise_for_status()
    parsed = bs(response.text)
    pager = parsed.find_all('span', {'class': 'ws-hide'})[1]
    return int(re.findall(r'\d+', str(pager))[0])

def getpages(url = "https://www.majestic.co.uk/wine", maxpages=None):
    """Build the list of result-page URLs for a listing.

    The first page is ``url`` itself; page ``k`` (for k >= 2) is
    ``url + "?page=k"``.

    Parameters
    ----------
    url : str
        Base address of the listing.
    maxpages : int or None
        Number of pages to generate. When ``None`` the count is looked up
        with ``getnumberofpages(url=url)`` at call time. Resolving it here,
        rather than in the signature, avoids a network request when the
        function is merely defined and keeps the count in step with ``url``.

    Returns
    -------
    list of str
        One URL per page, in page order.
    """
    if maxpages is None:
        maxpages = getnumberofpages(url=url)
    return [url if n == 0 else url + "?page=" + str(n + 1) for n in range(maxpages)]

def getlinks(pages=None, timeout=30):
    """Collect the unique product links found on the given result pages.

    Each page is downloaded and searched for product anchors (matched by
    their CSS class). Every anchor's ``href`` is truncated at the first
    ``;`` (presumably to drop trailing path/session parameters; verify
    against the site). Links are returned in first-seen order, without
    duplicates.

    Parameters
    ----------
    pages : iterable of str or None
        Result-page URLs to scan. When ``None``, ``getpages()`` is called
        at call time, so defining this function triggers no network
        traffic.
    timeout : float
        Seconds to wait for each page before giving up.

    Returns
    -------
    list of str
        De-duplicated product links.

    Raises
    ------
    requests.exceptions.RequestException
        If a page request fails, times out, or returns an HTTP error status.
    """
    if pages is None:
        pages = getpages()
    links = []
    seen = set()  # O(1) membership test; `links` keeps first-seen order
    for url in pages:
        response = requests.get(url, timeout=timeout)
        # A failed page would otherwise silently contribute zero links.
        response.raise_for_status()
        parsed = bs(response.text)
        anchors = parsed.find_all('a', {'class': 'ish-product-link kor-product-link ish-pricesContainer ish-product-prices'})
        for anchor in anchors:
            # Read the attribute directly instead of slicing the rendered
            # tag: slicing breaks when no ';' follows the href, and the
            # rendered form escapes '&' as '&amp;', which contains a ';'.
            href = anchor.get('href')
            if not href:
                continue
            link = href.split(';', 1)[0]
            if link not in seen:
                seen.add(link)
                links.append(link)
    return links

# Run the crawl of the result pages now; `links` holds the de-duplicated
# product links that the threaded download cell iterates over.
links = getlinks()

In [8]:
def getparsed(link, timeout=30):
    """Download ``link`` and return its body parsed with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float
        Seconds to wait for the server. Without a timeout a single stalled
        connection would block its worker thread (and the surrounding
        ``as_completed`` loop) forever.

    Returns
    -------
    BeautifulSoup
        Parsed document built from the response text.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    response = requests.get(link, timeout=timeout)
    return bs(response.text)

def extract(text, cut = '</div>'):
    """Return the text that sits immediately before ``cut`` in a piece of markup.

    ``text`` is converted with ``str()``, truncated at the first occurrence
    of ``cut``, and whatever follows the last ``>`` in the remainder is
    returned.

    Parameters
    ----------
    text : object or None
        Markup to search, typically the result of a ``find`` call. ``None``
        (element not found) yields ``''`` rather than a fragment of the
        string ``'None'``.
    cut : str
        Closing marker that ends the wanted text.

    Returns
    -------
    str
        The extracted text, or ``''`` when ``text`` is ``None``.
    """
    if text is None:
        return ''
    markup = str(text)
    end = markup.find(cut)
    if end == -1:
        # `cut` is absent: fall back to the last closing tag (or the whole
        # string) instead of slicing with -1, which would silently drop the
        # final character and return markup debris.
        end = markup.rfind('</')
        if end == -1:
            end = len(markup)
    head = markup[:end]
    return head[head.rfind('>') + 1:]

def getcountry(parsed):
    """Return the country field: text of the first ``<p class="ish-ca-value">``."""
    country_tag = parsed.find('p', attrs={'class': 'ish-ca-value'})
    return extract(country_tag, cut='</p>')

def getgrape(parsed):
    """Return the grape field: text before ``</a>`` in the first ``<div class="ish-ca-value">``."""
    grape_tag = parsed.find('div', attrs={'class': 'ish-ca-value'})
    return extract(grape_tag, cut='</a>')

def getsweetness(parsed):
    """Return the sweetness field: text before ``</span>`` in the first ``<div class="wine-sweetness">``."""
    sweetness_tag = parsed.find('div', attrs={'class': 'wine-sweetness'})
    return extract(sweetness_tag, cut='</span>')

def getcolour(parsed):
    """Return the colour field from the first ``<div class="shape colour">``."""
    colour_tag = parsed.find('div', attrs={'class': 'shape colour'})
    return extract(colour_tag)

def getstyle(parsed):
    """Return the style field from the first ``<div class="shape style">``."""
    style_tag = parsed.find('div', attrs={'class': 'shape style'})
    return extract(style_tag)

def getclosure(parsed):
    """Return the closure field from the first ``<div class="shape closure">``."""
    closure_tag = parsed.find('div', attrs={'class': 'shape closure'})
    return extract(closure_tag)

def getabv(parsed):
    """Return the ABV field from the first ``<div class="shape abv">``."""
    abv_tag = parsed.find('div', attrs={'class': 'shape abv'})
    return extract(abv_tag)

def getunitspbottle(parsed):
    """Return the units-per-bottle field from the first ``<div class="shape unitsPerBottle">``."""
    units_tag = parsed.find('div', attrs={'class': 'shape unitsPerBottle'})
    return extract(units_tag)

def getdescription(parsed):
    """Return the long product description from its ``ish-productDescription-long`` div."""
    description_tag = parsed.find(
        'div',
        attrs={'class': 'ish-productDescription ish-productDescription-long'},
    )
    return extract(description_tag)

def getinfo(parsed):
    """Scrape every field of one parsed product page.

    Returns a list of nine strings in this fixed order: country, grape,
    sweetness, colour, style, closure, abv, units per bottle, description.
    """
    field_getters = (
        getcountry,
        getgrape,
        getsweetness,
        getcolour,
        getstyle,
        getclosure,
        getabv,
        getunitspbottle,
        getdescription,
    )
    return [getter(parsed) for getter in field_getters]

In [14]:
# threaded code
# Product pages are downloaded concurrently by the pool; each page is parsed
# into a row here, in the main thread, as soon as its download completes.
# Rows therefore arrive in completion order, not in the order of `links`.
df = []           # one list of scraped fields per product
newlinks = []     # link for each row of `df`, in the same order
failedlinks = []  # links whose download raised, kept for inspection or retry
with ThreadPoolExecutor(max_workers=50) as executor:
    future_to_url = {executor.submit(getparsed, link): link for link in links}
    for future in concurrent.futures.as_completed(future_to_url):
        link = future_to_url[future]
        try:
            parsed = future.result()
        except requests.exceptions.RequestException as exc:
            # One bad download should not abort the whole crawl and discard
            # every row gathered so far; report it and carry on.
            failedlinks.append(link)
            print("Skipping " + str(link) + ": " + str(exc))
            continue
        # Append the row and its link together so the lists stay aligned.
        df.append(getinfo(parsed))
        newlinks.append(link)

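# Non-threaded alternative: sequential and slower. Rows follow the order of `links`, so use `links` (not `newlinks`) for the link column.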
#df = []
#for l in links:
#    parsed = getparsed(l)
#    df.append(getinfo(parsed))

In [16]:
# Build the results table and write it to disk.
# Naming the nine scraped columns in the constructor (instead of adding a
# column and then renaming all ten) keeps this cell re-runnable: on a second
# run `df` is already a DataFrame and the constructor simply re-selects these
# columns. It also works when no rows were scraped.
df = pd.DataFrame(df, columns=['Country', 'Grape', 'Sweetness', 'Colour', 'Style', 'Closure', 'abv', 'Units', 'Description'])
df['Link'] = newlinks #links
df.to_csv('run3.csv')