# Let's livecode!

First, let's import the libraries we'll need.

In [1]:
# Importando bibliotecas

import requests # Para fazer requisições
from bs4 import BeautifulSoup # Para trabalhar com dados vindos de paginas web

Now let's set the URL for the page we're scraping, make a request to get the HTML and parse it. 

In [2]:
# Address of the page whose book listing is scraped below.
url = "http://books.toscrape.com/index.html"

# requests applies no timeout by default, so without one this cell could hang
# forever if the server never answers.
response = requests.get(url, timeout=10)

# Stop here on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page as if it were the book catalogue.
response.raise_for_status()

# Hand the raw bytes to BeautifulSoup so it determines the document encoding itself.
html = response.content
scraped = BeautifulSoup(html, 'html.parser')

In [3]:
# Every book on the page sits in an <article> tagged with the class "product_pod",
# so collecting those elements yields one entry per book to iterate over.
books = scraped.find_all(class_='product_pod')

titles = []  # receives the title of each book, in page order
prices = []  # receives the price of each book (as float), in page order

# Column-name -> values mapping later used to build the pandas dataframe.
# NOTE(review): this name shadows the built-in `list`; it is kept because a later
# cell reads it. Consider renaming it in both cells together.
list = {'Books': titles, 'Prices (£)': prices}

for item in books:
    # The title is the "title" attribute of the <a> inside the <h3> of the article.
    title = item.h3.a['title']

    # The price text lives in a "price_color" element nested in "product_price".
    price_tag = item.find(class_='product_price').find(class_='price_color')
    # Drop the leading pound sign '£' and store the remainder as a float.
    price = float(price_tag.text.lstrip('£'))

    # Append only after both values were extracted, so the two lists stay aligned.
    titles.append(title)
    prices.append(price)

print(list)

{'Books': ['A Light in the Attic', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History of Humankind', 'The Requiem Red', 'The Dirty Little Secrets of Getting Your Dream Job', 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'The Black Maria', 'Starving Hearts (Triangular Trade Trilogy, #1)', "Shakespeare's Sonnets", 'Set Me Free', "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", 'Rip it Up and Start Again', 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991', 'Olio', 'Mesaerion: The Best Science Fiction Stories 1800-1849', 'Libertarianism for Beginners', "It's Only the Himalayas"], 'Prices (£)': [51.77, 53.74, 50.1, 47.82, 54.23, 22.65, 33.34, 17.93, 22.6, 52.15, 13.99, 20.66, 17.46, 52.29, 35.02, 57.25, 23.88, 37.59, 51.33, 45.17]}


In [4]:
# Bring in pandas under its conventional alias.
import pandas as pd

# Build a dataframe from the column-name -> values dict of books and prices
# (the notebook variable named `list`, defined in the scraping cell).
df = pd.DataFrame(data=list)
df

Unnamed: 0,Books,Prices (£)
0,A Light in the Attic,51.77
1,Tipping the Velvet,53.74
2,Soumission,50.1
3,Sharp Objects,47.82
4,Sapiens: A Brief History of Humankind,54.23
5,The Requiem Red,22.65
6,The Dirty Little Secrets of Getting Your Dream...,33.34
7,The Coming Woman: A Novel Based on the Life of...,17.93
8,The Boys in the Boat: Nine Americans and Their...,22.6
9,The Black Maria,52.15


In [5]:
df.describe() # Summarizes the numeric columns; here only the Prices column holds numbers

Unnamed: 0,Prices (£)
count,20.0
mean,38.0485
std,15.135231
min,13.99
25%,22.6375
50%,41.38
75%,51.865
max,57.25


In [6]:
df.info() # Shows dataframe information: the dtype of each column and how many of its entries are non-null

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Books       20 non-null     object 
 1   Prices (£)  20 non-null     float64
dtypes: float64(1), object(1)
memory usage: 448.0+ bytes


In [7]:
df.shape # Shows how many rows and columns the dataframe has

(20, 2)

In [8]:
df.Books # Shows the Books column of the dataframe

0                                  A Light in the Attic
1                                    Tipping the Velvet
2                                            Soumission
3                                         Sharp Objects
4                 Sapiens: A Brief History of Humankind
5                                       The Requiem Red
6     The Dirty Little Secrets of Getting Your Dream...
7     The Coming Woman: A Novel Based on the Life of...
8     The Boys in the Boat: Nine Americans and Their...
9                                       The Black Maria
10       Starving Hearts (Triangular Trade Trilogy, #1)
11                                Shakespeare's Sonnets
12                                          Set Me Free
13    Scott Pilgrim's Precious Little Life (Scott Pi...
14                            Rip it Up and Start Again
15    Our Band Could Be Your Life: Scenes from the A...
16                                                 Olio
17    Mesaerion: The Best Science Fiction Storie

In [11]:
df['Prices (£)'] # Another way to access a column is with brackets and quotes: ['Column']

0     51.77
1     53.74
2     50.10
3     47.82
4     54.23
5     22.65
6     33.34
7     17.93
8     22.60
9     52.15
10    13.99
11    20.66
12    17.46
13    52.29
14    35.02
15    57.25
16    23.88
17    37.59
18    51.33
19    45.17
Name: Prices (£), dtype: float64

In [18]:
df.iloc[:,1] 
# Selects from the dataframe by integer position: the value before the comma addresses the rows
# The value after the comma is the position of the column
# : selects every row
# 1 selects the second column (remember that positions start at 0)

0     51.77
1     53.74
2     50.10
3     47.82
4     54.23
5     22.65
6     33.34
7     17.93
8     22.60
9     52.15
10    13.99
11    20.66
12    17.46
13    52.29
14    35.02
15    57.25
16    23.88
17    37.59
18    51.33
19    45.17
Name: Prices (£), dtype: float64

In [19]:
df.iloc[:, 0]
# Here every row is selected, but from the first column

0                                  A Light in the Attic
1                                    Tipping the Velvet
2                                            Soumission
3                                         Sharp Objects
4                 Sapiens: A Brief History of Humankind
5                                       The Requiem Red
6     The Dirty Little Secrets of Getting Your Dream...
7     The Coming Woman: A Novel Based on the Life of...
8     The Boys in the Boat: Nine Americans and Their...
9                                       The Black Maria
10       Starving Hearts (Triangular Trade Trilogy, #1)
11                                Shakespeare's Sonnets
12                                          Set Me Free
13    Scott Pilgrim's Precious Little Life (Scott Pi...
14                            Rip it Up and Start Again
15    Our Band Could Be Your Life: Scenes from the A...
16                                                 Olio
17    Mesaerion: The Best Science Fiction Storie

In [20]:
df.loc[:, 'Books']
# Another way to select every row of the column named 'Books'
# .loc selects by label rather than by position: rows by index label, columns by name

0                                  A Light in the Attic
1                                    Tipping the Velvet
2                                            Soumission
3                                         Sharp Objects
4                 Sapiens: A Brief History of Humankind
5                                       The Requiem Red
6     The Dirty Little Secrets of Getting Your Dream...
7     The Coming Woman: A Novel Based on the Life of...
8     The Boys in the Boat: Nine Americans and Their...
9                                       The Black Maria
10       Starving Hearts (Triangular Trade Trilogy, #1)
11                                Shakespeare's Sonnets
12                                          Set Me Free
13    Scott Pilgrim's Precious Little Life (Scott Pi...
14                            Rip it Up and Start Again
15    Our Band Could Be Your Life: Scenes from the A...
16                                                 Olio
17    Mesaerion: The Best Science Fiction Storie

In [21]:
df.loc[:, 'Prices (£)']
# Here every row of the Prices column is selected

0     51.77
1     53.74
2     50.10
3     47.82
4     54.23
5     22.65
6     33.34
7     17.93
8     22.60
9     52.15
10    13.99
11    20.66
12    17.46
13    52.29
14    35.02
15    57.25
16    23.88
17    37.59
18    51.33
19    45.17
Name: Prices (£), dtype: float64

In [22]:
df.nlargest(5, 'Prices (£)')
# Commonly used to return the largest values;
# here the 5 records with the highest prices are selected, ordered from highest to lowest

Unnamed: 0,Books,Prices (£)
15,Our Band Could Be Your Life: Scenes from the A...,57.25
4,Sapiens: A Brief History of Humankind,54.23
1,Tipping the Velvet,53.74
13,Scott Pilgrim's Precious Little Life (Scott Pi...,52.29
9,The Black Maria,52.15


In [23]:
df.nsmallest(5, 'Prices (£)')
# Returns the 5 records with the lowest prices, ordered from lowest to highest

Unnamed: 0,Books,Prices (£)
10,"Starving Hearts (Triangular Trade Trilogy, #1)",13.99
12,Set Me Free,17.46
7,The Coming Woman: A Novel Based on the Life of...,17.93
11,Shakespeare's Sonnets,20.66
8,The Boys in the Boat: Nine Americans and Their...,22.6


In [27]:
df.Books.value_counts()
# Counts how many times each value occurs; with no duplicate records here, every title has a count of 1

Mesaerion: The Best Science Fiction Stories 1800-1849                                             1
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991                1
Sharp Objects                                                                                     1
Olio                                                                                              1
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)                                           1
The Dirty Little Secrets of Getting Your Dream Job                                                1
A Light in the Attic                                                                              1
The Black Maria                                                                                   1
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull           1
Sapiens: A Brief History of Humankind                                                             1
