## Books to Scrape
#### This notebook scrapes data from books.toscrape.com

In [None]:
import csv
from bs4 import BeautifulSoup
import requests
import datetime


In [None]:
# Root URL of the site being scraped.
base_url = 'https://books.toscrape.com/'

# Echo the value as the cell output.
base_url

In [None]:
# Download the page at base_url; echoing the response object shows its status.
response = requests.get(base_url)
response 

In [None]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Collect every <article> element that carries the CSS class 'product_pod'.
books = soup.find_all('article', 'product_pod')

In [None]:
# Number of matching <article> elements found on the page.
len(books)

#### Model a page

In [None]:
# Use the first matched element as the sample for the extraction steps below.
book = books[0]

In [None]:
# The <div class="image_container"> of the sample book; its <a> and <img>
# children are read in the following cells.
book_img_container = book.find('div','image_container')

In [None]:
# Build the book's URL by appending the anchor's href to base_url.
book_url = base_url + book_img_container.a.get('href')
book_url

In [None]:
# Build the image URL by appending the <img> src attribute to base_url.
book_img_url = base_url + book_img_container.img.get('src')
book_img_url

In [None]:
# The title is read from the 'title' attribute of the link inside the <h3>.
book_title = book.h3.a.get('title')
book_title

In [None]:
# Take the second CSS class of the first <p> in the article and append
# ' stars' (presumably the class is the rating word, e.g. 'Three' -- verify
# against the page markup).
book_rating = book.p.get('class')[1] + ' stars'
book_rating

In [None]:
# The <div class="product_price"> block; price and availability are read from it.
book_price_container = book.find('div', 'product_price')

In [None]:
# Text of the <p class="price_color"> element.
book_price = book_price_container.find('p','price_color').text
# NOTE: the stripped value is only displayed; book_price itself is not changed.
book_price.strip()

In [None]:
# Whitespace-trimmed text of the <p class="instock"> element.
book_availability = book_price_container.find('p', 'instock').text.strip()
book_availability

#### Generalize extraction model

In [None]:
def extract_data(book, page_url=None):
    """Extract the fields of a single book from its listing element.

    Args:
        book: The parsed ``<article class="product_pod">`` element of one book.
        page_url: URL of the page ``book`` was taken from. Relative links are
            resolved against it. When omitted, the global ``base_url`` is
            used, which gives the same result as before for plain relative
            links; pass it for any page other than the site root so that
            links relative to that page resolve correctly.

    Returns:
        Tuple ``(book_url, book_img_url, book_title, book_rating,
        book_price, book_availability)``.
    """
    from urllib.parse import urljoin

    link_base = base_url if page_url is None else page_url

    book_img_container = book.find('div', 'image_container')
    # urljoin normalises '../' segments and leaves absolute links untouched,
    # which plain string concatenation does not.
    book_url = urljoin(link_base, book_img_container.a.get('href'))
    book_img_url = urljoin(link_base, book_img_container.img.get('src'))

    book_title = book.h3.a.get('title')

    # Second CSS class of the first <p> in the article, suffixed with ' stars'.
    book_rating = book.p.get('class')[1] + ' stars'

    book_price_container = book.find('div', 'product_price')
    book_price = book_price_container.find('p', 'price_color').text
    book_availability = book_price_container.find('p', 'instock').text.strip()

    return book_url, book_img_url, book_title, book_rating, book_price, book_availability

#### Get the next page of results

In [None]:
# Accumulates one record tuple per scraped book.
records = []

In [None]:
# Walk the paginated listing and collect a record for every book.
# The page currently held in ``soup`` is processed first (otherwise its books
# would never reach ``records``), then each "next" link is followed until a
# page without one is reached or a request fails.
while True:
    books = soup.find_all('article', 'product_pod')
    for book in books:
        records.append(extract_data(book))

    # Look the pagination link up explicitly instead of relying on an
    # AttributeError to signal that it is missing.
    next_item = soup.find('li', 'next')
    if next_item is None or next_item.a is None:
        break

    next_href = next_item.a.get('href')
    if not next_href:
        break
    print(next_href)

    # The href may or may not already carry the 'catalogue/' prefix;
    # normalise it so it can be appended to base_url.
    if next_href.split('/')[0] != 'catalogue':
        next_href = 'catalogue/' + next_href
    next_url = base_url + next_href

    print('Retrieving ...')
    res = requests.get(next_url)

    if res.status_code != 200:
        print('Error getting url: ', next_url)
        print('Status code: ', res.status_code)
        # An error response has no listing to parse; stop here.
        break
    print('Retrieved: ', next_url)

    soup = BeautifulSoup(res.text, 'html.parser')

len(records)

#### Pull all categories

In [None]:
# All links in the list nested under the first item of the <ul class="nav">.
category_ = soup.find('ul', 'nav').li.ul.find_all('a')
category_

In [None]:
# Map each category title to its absolute URL. When a title occurs more than
# once, the first link seen for it is kept.
category = {}
for atag in category_:
    category.setdefault(
        atag.text.strip(),
        base_url + ('catalogue/' + atag.get('href')),
    )

category

#### Extract data from a category

In [None]:
# For every category, fetch the page behind its URL and print the records of
# the books listed on that page.
for cat, url in category.items():
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    books = soup.find_all('article', 'product_pod')

    cat_records = [extract_data(book) for book in books]

    print(cat, cat_records)

        

#### Putting it all together

In [None]:
import csv
from bs4 import BeautifulSoup
import requests
import datetime
import time

def check_url(url):
    """Return ``url`` without a trailing ``index.html`` file name.

    Only the last path segment is inspected, so an ``index.html`` that occurs
    earlier in the URL, or as part of a longer file name, is left untouched.

    Args:
        url: Absolute or relative URL string.

    Returns:
        ``url`` with the final ``index.html`` segment removed (the preceding
        ``/`` is kept), or ``url`` unchanged when it does not end in that
        segment.
    """
    head, sep, tail = url.rpartition('/')
    if tail == 'index.html':
        return head + sep
    return url

def extract_data(book, page_url=None):
    """Extract the fields of a single book from its listing element.

    Args:
        book: The parsed ``<article class="product_pod">`` element of one book.
        page_url: URL of the page ``book`` was taken from. Relative links are
            resolved against it. When omitted, the global ``base_url`` is
            used, which gives the same result as before for plain relative
            links; pass it for any page other than the site root so that
            links relative to that page resolve correctly.

    Returns:
        Tuple ``(book_url, book_img_url, book_title, book_rating,
        book_price, book_availability)``.
    """
    from urllib.parse import urljoin

    link_base = base_url if page_url is None else page_url

    book_img_container = book.find('div', 'image_container')
    # urljoin normalises '../' segments and leaves absolute links untouched,
    # which plain string concatenation does not.
    book_url = urljoin(link_base, book_img_container.a.get('href'))
    book_img_url = urljoin(link_base, book_img_container.img.get('src'))

    book_title = book.h3.a.get('title')

    # Second CSS class of the first <p> in the article, suffixed with ' stars'.
    book_rating = book.p.get('class')[1] + ' stars'

    book_price_container = book.find('div', 'product_price')
    book_price = book_price_container.find('p', 'price_color').text
    book_availability = book_price_container.find('p', 'instock').text.strip()

    record = (book_url, book_img_url, book_title, book_rating, book_price, book_availability)

    return record

def main(category):
    """Crawl every category page by page and save all book records to CSV.

    Args:
        category: Mapping of category title to the URL of the first listing
            page of that category.

    Each record returned by ``extract_data`` is extended with the category
    title. Progress is printed while pages are fetched. Once every category
    has been crawled, the records are written to
    ``./data/books_to_scrape.csv`` (the directory is created if missing).

    Returns:
        None.
    """
    import os
    from urllib.parse import urljoin

    output_path = './data/books_to_scrape.csv'
    fieldnames = ['url', 'img_url', 'title', 'rating', 'price', 'availability', 'category']

    records = []
    page_count = 0
    record_count = 0

    for cat, url in category.items():
        while True:
            print('Retrieving ...')
            # A timeout keeps one stalled connection from hanging the crawl.
            res = requests.get(url, timeout=30)

            if res.status_code != 200:
                print('Error getting url: ', url)
                print('Status code: ', res.status_code)
                # An error response has no listing to parse or paginate, so
                # give up on this category and continue with the next one.
                break

            print('Retrieved: ', url)
            page_count += 1

            # Hand BeautifulSoup the raw bytes so it can work out the document
            # encoding itself instead of trusting the HTTP-header guess.
            soup = BeautifulSoup(res.content, 'html.parser')
            books = soup.find_all('article', 'product_pod')

            print('No of books on page: ', len(books))
            print('\n--------------------------------\n')

            for book in books:
                record_count += 1
                record = extract_data(book) + (cat,)

                print('Record: ', record)
                print('Records found: ', record_count)
                records.append(record)

            print('Total no of pages retrieved: ', page_count)

            # Look the pagination link up explicitly rather than relying on
            # an AttributeError to signal that it is missing.
            next_item = soup.find('li', 'next')
            next_href = None
            if next_item is not None and next_item.a is not None:
                next_href = next_item.a.get('href')

            if not next_href:
                print('No more pages found')
                print('\n--------------------------------\n')
                break

            print(next_href)
            # Resolve against the current page URL. Plain concatenation would
            # append one page file name to the previous one from the third
            # page onwards.
            url = urljoin(url, next_href)
            print('Getting next page: ', url)

            # Short pause after every fifth page.
            if page_count % 5 == 0:
                print('Pausing...')
                time.sleep(2)

    # Write once, after all categories are done, instead of rewriting the
    # whole file at the end of every category.
    print('Writing to file......')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # newline='' is what the csv module expects; utf-8 keeps non-ASCII text
    # (currency signs, accented titles) independent of the platform default.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)
        writer.writerows(records)


# Driver: read the category links from the page at base_url, then crawl them.
base_url = 'https://books.toscrape.com/'

# Download and parse the page at base_url.
response = requests.get(base_url)
soup = BeautifulSoup(response.text, 'html.parser')

# All links in the list nested under the first item of the <ul class="nav">.
category_ = soup.find('ul', 'nav').li.ul.find_all('a')

# Title -> absolute URL; the first link seen for a title is kept.
category = {}
for atag in category_:
    category.setdefault(
        atag.text.strip(),
        base_url + check_url(atag.get('href')),
    )

main(category)