BIBLIOGRAFIA:
https://stackoverflow.com/questions/24226781/changing-user-agent-in-python-3-for-urrlib-request-urlopen/24226797
https://docs.python.org/3/howto/regex.html
https://www.dataquest.io/blog/web-scraping-tutorial-python/
http://toddhayton.com/2015/01/16/scraping-by-example-json-data/
https://stackoverflow.com/questions/48906246/extract-json-from-html-in-python-beautifulsoup?rq=1


In [1]:
import urllib
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from urllib.parse import urlparse
import datetime
import time
import csv

In [2]:
class Throttle:
    """Enforce a minimum delay between consecutive requests to the same domain.

    The domain is the network location (``netloc``) of the URL passed to
    :meth:`wait`. Each domain is tracked independently.
    """
    def __init__(self, delay):
        """Create a throttle.

        Args:
            delay: minimum number of seconds (may be fractional) between two
                requests to the same domain. A value <= 0 disables sleeping.
        """
        # amount of delay between downloads for each domain
        self.delay = delay
        # maps domain -> datetime of the last access to that domain
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of *url* was accessed less than ``delay`` seconds ago.

        The access time of the domain is refreshed on every call, whether or
        not a sleep was needed.

        Args:
            url: the URL about to be requested.
        """
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            # total_seconds() keeps the fractional part and the days component;
            # the `.seconds` attribute is a truncated integer that ignores both,
            # which makes sub-second delays always sleep for the full delay.
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # domain has been accessed recently, so sleep the remainder
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()


# Download a URL and return its body as text
def download(url, user_agent='wswp', num_retries=2):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is rate-limited through the module-level ``throttle`` object,
    which must exist before the first call.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: how many more times to retry when the server answers
            with a 5XX HTTP error.

    Returns:
        The page content as ``str``, or ``None`` when the download failed
        (after exhausting retries for 5XX errors, or immediately for any
        other ``URLError``).
    """
    # Imported locally so the function does not depend on some other package
    # having loaded these urllib submodules (a bare `import urllib` does not).
    import urllib.error
    import urllib.request

    print ('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, None, headers=headers)
    # Throttle on the URL actually being requested, not on a global base path.
    throttle.wait(url)
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        print ('Download error:', e.reason)
        html = None
        # Only HTTPError carries a `code`; retry 5XX HTTP errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html

# Collect the product categories (the site's top menu)
def lookCategoria(url, userAgent):
    """Return the anchors with class ``top_menu-item`` found at *url*.

    Args:
        url: page to download.
        userAgent: user-agent string forwarded to ``download``.

    Returns:
        A DataFrame with one row per anchor: "nom" holds the anchor text and
        "link" holds its ``href`` attribute.
    """
    page = BeautifulSoup(download(url, userAgent), 'html.parser')
    links = []
    noms = []
    for anchor in page.find_all('a', class_='top_menu-item'):
        links.append(anchor['href'])
        noms.append(anchor.get_text())
    return pd.DataFrame({"nom": noms, "link": links})


# Collect the product types listed on a category page
def lookTipu(url, userAgent):
    """Return the product-type anchors found at *url*.

    The anchors are selected by the exact class string used below (including
    its trailing space).

    Args:
        url: page to download.
        userAgent: user-agent string forwarded to ``download``.

    Returns:
        A DataFrame with one row per anchor: "nom" holds the anchor text and
        "link" holds its ``href`` attribute.
    """
    page = BeautifulSoup(download(url, userAgent), 'html.parser')
    anchors = page.find_all('a', class_='link _primary _level3 tree-link ')
    hrefs = [anchor['href'] for anchor in anchors]
    labels = [anchor.get_text() for anchor in anchors]
    return pd.DataFrame({"nom": labels, "link": hrefs})

# Collect the products listed on a product-type page
def lookProd(path, userAgent):
    """Return the products described by the page's embedded JSON-LD data.

    The first element whose ``type`` attribute is ``application/ld+json`` is
    parsed as JSON, and its ``itemListElement`` entries are read.

    Args:
        path: page to download.
        userAgent: user-agent string forwarded to ``download``.

    Returns:
        A DataFrame with one row per list entry: "nom" holds the entry's
        ``name`` and "link" holds its ``url``.
    """
    page = BeautifulSoup(download(path, userAgent), 'html.parser')
    ld_json_tag = page.find(type="application/ld+json")
    # The tag's text is the raw JSON document.
    items = json.loads(ld_json_tag.get_text())['itemListElement']
    return pd.DataFrame({
        "nom": [item['name'] for item in items],
        "link": [item['url'] for item in items],
    })

# Collect the allergens of a single product
def lookAlerg(path, userAgent, producte):
    """Return the distinct bold texts of a product page, labelled with *producte*.

    Args:
        path: product page to download.
        userAgent: user-agent string forwarded to ``download``.
        producte: label stored in the "producte" column of every row.

    Returns:
        A DataFrame with columns "producte" and "alèrgens", one row per
        distinct ``<b>`` text, in order of first appearance.
    """
    html = download(path, userAgent)
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): this collects every <b> in the document, not only those in
    # the allergen section -- confirm whether the search should be scoped
    # (e.g. to the div.info elements).
    bold_texts = [tag.get_text() for tag in soup.find_all('b')]
    # unique() drops duplicates while keeping first-appearance order.
    unique_texts = pd.Series(bold_texts, dtype=object).unique()
    return pd.DataFrame({
        "producte": producte,
        "alèrgens": unique_texts,
    })


#html=download(url,UserAg)
#print(html)


        

# Base URL of the site; the relative links scraped from its pages are appended to it
path="http://www.elcorteingles.es"
# User-agent string passed to every lookup function and sent with each request
UserAg="student-uoc"
# Delay handed to Throttle: seconds to leave between requests to the same domain
delay=0.1

In [3]:
# Check robots.txt to make sure our user agent is not disallowed from accessing the site.
#.............................robotparser.............

In [4]:
# Create the throttle object that download() looks up as a global
throttle = Throttle(delay)
throttle.wait(path)
# Scrape the top-menu categories from the supermarket landing page
categories = lookCategoria(path+"/supermercado",UserAg)
# Bare expression: displayed as the notebook cell's output
categories


Downloading: http://www.elcorteingles.es/supermercado


Unnamed: 0,link,nom
0,/supermercado/alimentacion-general/,Alimentación
1,/supermercado/desayunos-dulces-y-pan/,"Desayunos, Dulces y Pan"
2,/supermercado/lacteos/,Lácteos
3,/supermercado/congelados/,Congelados
4,/supermercado/dieteticos/,Dietética
5,/supermercado/bebidas/,Bebidas
6,/supermercado/frescos/,Frescos
7,/supermercado/bebes/,Bebés
8,/supermercado/higiene-personal/,Higiene
9,/supermercado/drogueria-y-limpieza/,Droguería


In [5]:
# Next, fetch the products of each product category.
# Since we only want food products, we will only go through categories 0 to 6.
# We will pause between one request and the next.


In [6]:
# Fetch the product types of the category at index 3 of `categories`

Tipus = lookTipu(path+categories["link"][3],UserAg)
Tipus

Downloading: http://www.elcorteingles.es/supermercado/congelados/


Unnamed: 0,link,nom
0,/supermercado/congelados/aves-y-carne-congelada/,Aves y carne congelada
1,/supermercado/congelados/helados-postres-y-nata/,"Helados, postres y nata"
2,/supermercado/congelados/pan-y-reposteria-cong...,Pan y repostería congelada
3,/supermercado/congelados/pescados-mariscos-y-s...,"Pescados, mariscos y surimis"
4,/supermercado/congelados/pizza-lasana-y-canelo...,"Pizza, Lasaña y Canelones"
5,/supermercado/congelados/platos-preparados-con...,Platos preparados congelados
6,/supermercado/congelados/precocinados-rebozados/,Precocinados rebozados
7,/supermercado/congelados/salteados-congelados/,Salteados congelados
8,/supermercado/congelados/verduras-hortalizas-y...,"Verduras, hortalizas y frutas"


In [7]:
# Fetch the products of the product type at index 1 of `Tipus`
Prod = lookProd(path+Tipus["link"][1],UserAg)
Prod

Downloading: http://www.elcorteingles.es/supermercado/congelados/helados-postres-y-nata/


Unnamed: 0,link,nom
0,/supermercado/0110118952003673-haagen-dazs-sec...,Secret Sensation Chocolat Fondant helado con c...
1,/supermercado/0110118952003046-haagen-dazs-van...,Vanilla helado de vainilla
2,/supermercado/0110118952003145-haagen-dazs-min...,Mint Leaves & Chocolate helado de chocolate co...
3,/supermercado/0110118952000398-haagen-dazs-str...,Strawberries & Cream helado de crema y fresa c...
4,/supermercado/0110118953904572-haagen-dazs-car...,Caramel Attraction tarrinas de helado sabor ca...
5,/supermercado/0110118953902006-haagen-dazs-fru...,Fruit Collection tarrinas de helado sabores fr...
6,/supermercado/0110118952004937-haagen-dazs-sal...,Salted caramel cheesecake helado de tarta de q...
7,/supermercado/0110118952000356-haagen-dazs-coo...,Cookies & Cream helado de vainilla con trocito...
8,/supermercado/0110118952000380-haagen-dazs-mac...,Macadamia Nut Brittle helado de vainilla con n...
9,/supermercado/0110118952000372-haagen-dazs-bel...,Belgian Chocolate helado de chocolate belga co...


In [8]:
# Fetch the allergens of the first product in `Prod`, labelled with its name
Alerg = lookAlerg(path+Prod["link"][0],UserAg,Prod["nom"][0])
Alerg

Downloading: http://www.elcorteingles.es/supermercado/0110118952003673-haagen-dazs-secret-sensation-chocolat-fondant-helado-con-centro-de-chocolate-liquido-tarrina-500-ml/


Unnamed: 0,alèrgens,producte
0,Nata,Secret Sensation Chocolat Fondant helado con c...
1,leche,Secret Sensation Chocolat Fondant helado con c...
2,huevo,Secret Sensation Chocolat Fondant helado con c...
3,trigo,Secret Sensation Chocolat Fondant helado con c...


In [9]:

# End-to-end run: categories -> product types -> products -> allergens,
# then export one CSV row per product.
# download() waits on the throttle before every request, so no explicit
# throttle.wait() calls are needed between the steps.
throttle = Throttle(delay)
categories = lookCategoria(path + "/supermercado", UserAg)
Tipus = lookTipu(path + categories["link"][3], UserAg)
Prod = lookProd(path + Tipus["link"][1], UserAg)

# One allergen table per product, labelled with the product name
# (the same way the single-product lookup is called).
A = [
    lookAlerg(path + link, UserAg, nom)
    for link, nom in zip(Prod["link"], Prod["nom"])
]

filename = "alergenos.csv"
headerLista = ["Categoria", "Tipo", "Producto", "Alergenos"]
Lista = [headerLista]

# Iterate over however many products were scraped instead of a fixed count,
# which would truncate longer listings and fail on shorter ones.
for nom, alergens in zip(Prod["nom"], A):
    Lista.append([
        categories['nom'][3],
        Tipus['nom'][1],
        nom,
        ",".join(alergens["alèrgens"]),
    ])

# Explicit UTF-8: the scraped names contain non-ASCII characters and the
# platform default encoding is not guaranteed to be able to represent them.
with open(filename, 'w', newline='', encoding='utf-8') as csvFile:
    csv.writer(csvFile).writerows(Lista)
    
    

Downloading: http://www.elcorteingles.es/supermercado
Downloading: http://www.elcorteingles.es/supermercado/congelados/
Downloading: http://www.elcorteingles.es/supermercado/congelados/helados-postres-y-nata/
Downloading: http://www.elcorteingles.es/supermercado/0110118952003673-haagen-dazs-secret-sensation-chocolat-fondant-helado-con-centro-de-chocolate-liquido-tarrina-500-ml/
Downloading: http://www.elcorteingles.es/supermercado/0110118952003046-haagen-dazs-vanilla-helado-de-vainilla-tarrina-500-ml/
Downloading: http://www.elcorteingles.es/supermercado/0110118952003145-haagen-dazs-mint-leaves-and-chocolate-helado-de-chocolate-con-menta-tarrina-500-ml/
Downloading: http://www.elcorteingles.es/supermercado/0110118952000398-haagen-dazs-strawberries-and-cream-helado-de-crema-y-fresa-con-trozos-de-fresa-tarrina-500-ml/
Downloading: http://www.elcorteingles.es/supermercado/0110118953904572-haagen-dazs-caramel-attraction-tarrinas-de-helado-sabor-caramelo-4-unidades-de-86-ml-estuche-344-ml/
