# Books to Scrape using Beautiful Soup
- https://books.toscrape.com/index.html
- https://www.youtube.com/watch?v=MH3641s3Roc

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [54]:
# Site root, and the catalogue path that the paginated listing lives under.
burl1 = 'https://books.toscrape.com'
burl2 = burl1 + '/catalogue'

In [55]:
# Connectivity check: <Response [200]> means the request succeeded.
# The timeout keeps the cell from hanging indefinitely if the site does not answer.
requests.get(burl1, timeout=10)

<Response [200]>

In [6]:
# URL of the first listing page (listing pages are named page-<n>.html).
purl = 'https://books.toscrape.com/catalogue/page-{}.html'.format(1)

In [8]:
# Same check for the first listing page: <Response [200]> means success.
# The timeout keeps the cell from hanging indefinitely if the site does not answer.
requests.get(purl, timeout=10)

<Response [200]>

In [9]:
# Keep the response object so its body can be parsed in the following cells.
response = requests.get(purl, timeout=10)  # do not wait forever on a stalled connection
response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
response

<Response [200]>

In [11]:
# response.content is the raw body as bytes (hence the b'...' prefix when displayed).
# It still has to be parsed into HTML with BeautifulSoup before it can be searched.
responseC = response.content
responseC[:100]  # peek at the first 100 bytes only

b'\n\n<!DOCTYPE html>\n<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![e'

In [17]:
# Parse the raw bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=responseC, features='html.parser')
# Evaluate `soup` on its own line to display the whole parsed document.

In [20]:
#soup.prettify() #little better

In [None]:
## View the page and find the elements to be scraped
- ol : <ol> is an ordered list

<ol class="row">   ::: ol
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod"> ::: article, class = product_pod
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>  ::: a href ='xxx'
</div>
<p class="star-rating Three">  ::: p, class='star-rating', 'Three'
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>

## scraping only 1 page items

In [21]:
ol = soup.find(name='ol')  # find() returns only the first <ol> in the document

In [31]:
#ol # since this is first ol tag, hence it is not a list; ol[0:1] will not work, ol will work

In [24]:
# Collect every <article class="product_pod"> ... </article> inside the list;
# there can be several, so the result is list-like.
# (The keyword form is `class_` because `class` is reserved in Python; an attrs dict avoids that.)
articles = ol.find_all('article', attrs={'class': 'product_pod'})

In [34]:
articles[:1]  # one-element slice: the first <article> tag, from start to end

[<article class="product_pod">
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>]

- now loop through each article and extract the child elements
- e.g. a href, p class='star-rating', h3, stock, price, title etc.

In [44]:
# The book title is read from the alt text of the cover image.
for article in articles[:2]:  # first two books only
    cover = article.find('img')
    title = cover.attrs['alt']
    print(title)

A Light in the Attic
Tipping the Velvet


In [50]:
# star rating, price,
# Star rating and price for the first two books.
for article in articles[0:2]:
    # The rating is the second CSS class of <p class="star-rating Three">.
    # Select that <p> by its class instead of relying on it being the first <p>.
    star = article.find('p', class_='star-rating')['class'][1]
    print(star)
    # The text looks like '£51.77': drop the leading currency symbol, then convert.
    price = article.find('p', class_='price_color').text
    price = float(price[1:])
    print(price)

Three
51.77
One
53.74


In [53]:
# stock position, a href
# Stock status and relative link for the first two books.
for article in articles[0:2]:
    # Match on the single class 'availability'; the two-class string
    # 'instock availability' only matches that exact attribute value.
    stockavl = article.find('p', class_='availability').text.strip()
    print(stockavl)
    # find(['h3', 'a']) returns the first tag that is EITHER an <h3> or an <a>
    # (here the image link), not the <a> inside the <h3>.
    # Ask for the title link explicitly.
    ref = article.find('h3').find('a').get('href')
    print(ref)

In stock
a-light-in-the-attic_1000/index.html
In stock
tipping-the-velvet_999/index.html


# Combine together and put in a DataFrame
- empty DF
- empty list of book data
- scrape through all pages (1-50); first try with 4 pages (`range(1, 5)`)
- scrape all products on each page (all article tags)
- scrape all data points of each article tag - title, price, avl, url, star rating

In [58]:
# Start from an empty frame that only declares the target columns.
df = pd.DataFrame(columns='title star price avl urlref'.split())
df  # empty DF

Unnamed: 0,title,star,price,avl,urlref


In [59]:
books = list()  # empty list that will hold the extracted data
books

[]

In [75]:
books = []  # reset here so re-running the cell does not append duplicate rows
# range(1, 5) visits pages 1-4; raise the stop value (up to 51) to cover more pages.
for i in range(1, 5):
    purl2 = f'https://books.toscrape.com/catalogue/page-{i}.html'
    page = requests.get(purl2, timeout=10)  # do not wait forever on a stalled connection
    page.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
    response = page.content
    soup = BeautifulSoup(response, 'html.parser')
    ol = soup.find('ol')
    articles = ol.find_all('article', class_='product_pod')

    for article in articles:
        # Title comes from the alt text of the cover image.
        imageTitle = article.find('img').get('alt')
        # Rating is the second CSS class of <p class="star-rating Three">;
        # select that <p> by class rather than relying on it being the first <p>.
        star = article.find('p', class_='star-rating').get('class')[1]
        # '£51.77' -> 51.77 (drop the leading currency symbol).
        price = float(article.find('p', class_='price_color').text[1:])
        # Match on the single class 'availability'; the two-class string
        # 'instock availability' only matches that exact attribute value.
        stockavl = article.find('p', class_='availability').text.strip()
        # find(['h3', 'a']) would return the first <h3> OR <a> (the image link);
        # ask for the title link inside <h3> explicitly.
        ref = article.find('h3').find('a').get('href')
        # NOTE: ref is collected but left out of the row, because the DataFrame
        # built from `books` names only four columns.
        books.append([imageTitle, star, price, stockavl])

In [76]:
len(books)  # number of books extracted

80

In [80]:
# Turn the collected rows into a table; the column names follow the field order of each row.
df = pd.DataFrame(data=books, columns=['title', 'star', 'price', 'avl'])
df.shape  # (rows, columns)

(80, 4)

In [81]:
df.head(5)  # first five rows

Unnamed: 0,title,star,price,avl
0,A Light in the Attic,Three,51.77,In stock
1,Tipping the Velvet,One,53.74,In stock
2,Soumission,One,50.1,In stock
3,Sharp Objects,Four,47.82,In stock
4,Sapiens: A Brief History of Humankind,Five,54.23,In stock


In [None]:
#write to csv
#df.to_csv('books.csv')

# end of Practise