# Web Scraping

Execute o comando `pip install bs4` para instalar o pacote beautiful soup, a biblioteca que faz o parse e nos ajuda a navegar dentro das tags HTML.

In [1]:
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [2]:
soup.title

<title>The Dormouse's story</title>

In [3]:
soup.title.name

'title'

In [5]:
soup.title.text

"The Dormouse's story"

In [6]:
soup.title.parent.name

'head'

In [7]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [9]:
soup.p['class']

['title']

In [10]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [11]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [12]:
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [13]:
soup.find('a', {'id': 'link3'})

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

## Usando Requests

In [16]:
import requests
from bs4 import BeautifulSoup

riot_articles = []

# Buscando a página da Riot Games
page = requests.get('https://technology.riotgames.com/')
# Criar o objeto BeautifulSoup
soup = BeautifulSoup(page.text, 'html.parser')

In [17]:
page.text

'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->\n<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->\n<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->\n<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ og: http://ogp.me/ns# rdfs: http://www.w3.org/2000/01/rdf-schema# sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema#"><!--<![endif]-->\n<head>\n<meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"25a156d569",applicationID:"7806507"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var o=

In [18]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)]><!-->
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ og: http://ogp.me/ns# rdfs: http://www.w3.org/2000/01/rdf-schema# sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema#">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).loader_config={licenseKey:"25a156d569",applicationID:"7806507"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){va

In [19]:
print(soup.get_text())








(window.NREUM||(NREUM={})).loader_config={licenseKey:"25a156d569",applicationID:"7806507"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var o=n[t]={exports:{}};e[t][0].call(o.exports,function(n){var o=e[t][1][n];return r(o||n)},o,o.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<t.length;o++)r(t[o]);return r}({1:[function(e,n,t){function r(){}function o(e,n,t){return function(){return i(e,[c.now()].concat(u(arguments)),n?null:this,t),n?void 0:this}}var i=e("handle"),a=e(3),u=e(4),f=e("ee").get("tracer"),c=e("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],d="api-",l=d+"ixn-";a(p,function(e,n){s[n]=o(d+n,!0,"api")}),s.addPageAction=o(d+"addPageAction",!0),s.setCurrentRouteName=o(d+"routeName",!0),n.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r

In [20]:
soup.title

<title>Riot Games Technology</title>

In [21]:
soup.title.text

'Riot Games Technology'

In [22]:
articles = soup.find_all('article')

In [23]:
articles

[<article class="c-excerpt views-row views-row-1 views-row-odd views-row-first" role="article">
 <h1 class="c-excerpt__title">
 <a href="/news/engineering-esports-tech-powers-worlds">Engineering Esports: The Tech That Powers Worlds</a></h1>
 <a href="/news/engineering-esports-tech-powers-worlds">
 <img alt="" class="u-images-respond c-excerpt__image" src="https://technology.riotgames.com/sites/default/files/articles/102/engineeringesportsheader.png"/>
 </a>
 <p dir="ltr">We’re the Esports Technology Group, and we’re responsible for the tech behind Riot’s biggest esports events, from reliable network connectivity to global broadcast capabilities to specialty tournament servers to the custom PC fleet used by pros. Part of our role at Riot is to approach typical broadcast and live production challenges with scalable and technology-driven solutions.</p>
 <div class="c-excerpt-meta">
 <a class="c-excerpt-meta__read-more c-content__link" href="/news/engineering-esports-tech-powers-worlds">
 

In [24]:
len(articles)

10

In [25]:
articles[0]

<article class="c-excerpt views-row views-row-1 views-row-odd views-row-first" role="article">
<h1 class="c-excerpt__title">
<a href="/news/engineering-esports-tech-powers-worlds">Engineering Esports: The Tech That Powers Worlds</a></h1>
<a href="/news/engineering-esports-tech-powers-worlds">
<img alt="" class="u-images-respond c-excerpt__image" src="https://technology.riotgames.com/sites/default/files/articles/102/engineeringesportsheader.png"/>
</a>
<p dir="ltr">We’re the Esports Technology Group, and we’re responsible for the tech behind Riot’s biggest esports events, from reliable network connectivity to global broadcast capabilities to specialty tournament servers to the custom PC fleet used by pros. Part of our role at Riot is to approach typical broadcast and live production challenges with scalable and technology-driven solutions.</p>
<div class="c-excerpt-meta">
<a class="c-excerpt-meta__read-more c-content__link" href="/news/engineering-esports-tech-powers-worlds">
Full Story

In [26]:
articles[0].find('h1')

<h1 class="c-excerpt__title">
<a href="/news/engineering-esports-tech-powers-worlds">Engineering Esports: The Tech That Powers Worlds</a></h1>

In [27]:
articles[0].find('h1').text

'\nEngineering Esports: The Tech That Powers Worlds'

In [28]:
links = articles[0].find('a')
links

<a href="/news/engineering-esports-tech-powers-worlds">Engineering Esports: The Tech That Powers Worlds</a>

In [30]:
links['href']

'/news/engineering-esports-tech-powers-worlds'

In [32]:
riot_articles = []
for article in articles:
    title = article.find('h1').text
    href = article.find('a')['href']
    desc = article.find('p').text
    riot_articles.append({
        'title': title,
        'href': href,
        'desc': desc
    })
    
    

In [33]:
riot_articles

[{'title': '\nEngineering Esports: The Tech That Powers Worlds',
  'href': '/news/engineering-esports-tech-powers-worlds',
  'desc': 'We’re the Esports Technology Group, and we’re responsible for the tech behind Riot’s biggest esports events, from reliable network connectivity to global broadcast capabilities to specialty tournament servers to the custom PC fleet used by pros. Part of our role at Riot is to approach typical broadcast and live production challenges with scalable and technology-driven solutions.'},
 {'title': "\nThe Future of League's Engine",
  'href': '/news/future-leagues-engine',
  'desc': 'Hiya folks, Brian "Penrif" Bossé, your local friendly Tech Lead of\xa0League\xa0here. I\'m taking some time in between matches of TFT to wax philosophic about game engines and how we on\xa0League\xa0make decisions around what direction to take our custom game engine. Join me on a moderately long look at one dimension of game engine design, where\xa0League\xa0currently exists on th

## Exemplo - MercadoLivre

In [34]:
term = 'led'
request = requests.get(f'https://lista.mercadolivre.com.br/{term}')

In [35]:
soup = BeautifulSoup(request.text, 'html.parser')

In [36]:
print(soup.prettify())

<!DOCTYPE doctype html>
<html class="no-js" data-country="BR" data-device="desktop" data-site="MLB" lang="pt-BR">
 <head prefix="">
  <title>
   Led no Mercado Livre Brasil
  </title>
  <link href="//analytics.mlstatic.com" rel="preconnect"/>
  <link href="//resources.mlstatic.com" rel="preconnect"/>
  <link href="//static.mlstatic.com" rel="preconnect"/>
  <link href="https://www.google-analytics.com" rel="preconnect"/>
  <link href="https://www.google.com" rel="preconnect"/>
  <link href="https://data.mercadolibre.com" rel="preconnect"/>
  <link href="https://http2.mlstatic.com" rel="preconnect"/>
  <link href="https://www.google.com.br" rel="preconnect"/>
  <link href="https://stats.g.doubleclick.net" rel="preconnect"/>
  <link href="//analytics.mercadolivre.com.br" rel="preconnect"/>
  <link href="//analytics.mercadolivre.com" rel="preconnect"/>
  <link href="//mlb-s1-p.mlstatic.com" rel="preconnect"/>
  <link href="//mlb-s2-p.mlstatic.com" rel="preconnect"/>
  <script>
   document

In [41]:
items = soup.find_all('div', {'class':'item__info'})
items

[<div class="item__info item--hide-right-col"><h2 class="item__title list-view-item-title"> <a class="item__info-title" href="https://produto.mercadolivre.com.br/MLB-687925636-holofote-refletor-super-led-duplo-100w-bivolt-bco-frio-_JM#position=1&amp;type=item&amp;tracking_id=be560c37-9282-4a6a-9521-582550356b55"> <span class="main-title"> Holofote Refletor Super Led Duplo 100w Bivolt - Bco Frio </span> </a> <div class="item__brand"> <a class="item__brand-link" href="https://loja.mercadolivre.com.br/coruja-mix"> <span class="item__brand-title-tos"> por Coruja Mix </span> </a> </div></h2> <div class="price__container"><div class="item__price"> <span class="price__symbol">R$</span> <span class="price__fraction">52</span> <span class="price__decimals">90</span></div> </div> <div class="item__stack_column highlighted"> <div class="item__stack_column__info"> <div class="stack_column_item installments highlighted"><span class="item-installments showInterest"> <span class="item-installments-mu

In [42]:
items[0]

<div class="item__info item--hide-right-col"><h2 class="item__title list-view-item-title"> <a class="item__info-title" href="https://produto.mercadolivre.com.br/MLB-687925636-holofote-refletor-super-led-duplo-100w-bivolt-bco-frio-_JM#position=1&amp;type=item&amp;tracking_id=be560c37-9282-4a6a-9521-582550356b55"> <span class="main-title"> Holofote Refletor Super Led Duplo 100w Bivolt - Bco Frio </span> </a> <div class="item__brand"> <a class="item__brand-link" href="https://loja.mercadolivre.com.br/coruja-mix"> <span class="item__brand-title-tos"> por Coruja Mix </span> </a> </div></h2> <div class="price__container"><div class="item__price"> <span class="price__symbol">R$</span> <span class="price__fraction">52</span> <span class="price__decimals">90</span></div> </div> <div class="item__stack_column highlighted"> <div class="item__stack_column__info"> <div class="stack_column_item installments highlighted"><span class="item-installments showInterest"> <span class="item-installments-mul

In [48]:
items[0].find('span', {'class':'main-title'})

<span class="main-title"> Holofote Refletor Super Led Duplo 100w Bivolt - Bco Frio </span>

In [53]:
class ItemML():
    def __init__(self, nome, fraction, decimal):
        self.nome = nome
        self.fraction = fraction
        self.decimal = decimal
    
    def __repr__(self):
        return f'{self.nome}, preço: R${self.fraction},{self.decimal}'

items = soup.find_all('div', {'class':'item__info'})
ml_items = []
for item in items:
    nome = item.find('span', {'class':'main-title'}).text
    fraction = item.find('span', {'class': 'price__fraction'}).text
    decimal = item.find('span', {'class': 'price__decimals'})
    if decimal is None:
        decimal = '00'
    else:
        decimal = decimal.text
    
    ml_items.append(ItemML(nome, fraction, decimal))

In [54]:
ml_items

[ Holofote Refletor Super Led Duplo 100w Bivolt - Bco Frio , preço: R$52,90,
  Lustre Luminária Redondo Pendente De Madeira De E-27 21x18cm , preço: R$54,99,
  Fita Led 3528 Rolo 5m 300 Leds Sanca Teto Dupla Face 3m , preço: R$18,90,
  Par Barra Led Super Farol Drl Luz Diurna 12v Carro Moto 17cm , preço: R$16,85,
  Fita Led 5m Rgb 16 Cores 5050 Pro D'agua + Fonte + Controle , preço: R$38,45,
  Mangueira Led Alto Brilho Decoração Sanca Gesso E Marcenaria , preço: R$16,78,
  Fita Led Digital 6803 Rgb C/ Controle 133 Efeitos Ip67 5050 , preço: R$56,90,
  Lustre Luminária Redondo Pendente De Madeira De E-27 21x18cm , preço: R$54,99,
  Fita Led 3528 Rolo 5m 300 Leds Várias Cores Prova Dágua - Sf , preço: R$17,50,
  Kit Lâmpadas Ultra Led Full 7600l Efeito Xenon Super Branca , preço: R$45,99,
  C6 Lampada Led Automotiva H1/h3/h4/h7/hb3/hb4/h27 Xenon , preço: R$39,99,
  Fita Led 5 Metros Silicone Prova Dagua Diversas Cores 12v , preço: R$16,49,
  Fita Led 5m Ultra Rgb 5050 Prova D'agua + Cont

In [60]:
next_button = soup.find('li', {'class': 'andes-pagination__button--next'})

In [63]:
next_button.find('a')['href']

'https://lista.mercadolivre.com.br/led_Desde_51'

## Training Zone

1. Acesse o link da wikipedia sobre HTML (https://en.wikipedia.org/wiki/HTML) e faça um script python que retorne a tabela do índice do conteúdo da página, como abaixo:

    `1. History (https://en.wikipedia.org/wiki/HTML#History)
    1.1 Development (https://en.wikipedia.org/wiki/HTML#Development)
    1.2 HTML versions timeline (https://en.wikipedia.org/wiki/HTML#HTML_versions_timeline)
    ...`

In [None]:
import requests
from bs4 import BeautifulSoup

wikipedia = []

# Buscando a página 
page = requests.get('https://en.wikipedia.org/wiki/HTML')
# Criar o objeto BeautifulSoup
soup = BeautifulSoup(page.text, 'html.parser')

2. Transforme o seu código acima em uma classe, WikiPage, com o método `get_content_table`, que retorne o que você fez anteriormente