# 0. Imports and Settings

In [1]:
import math
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None 

In [2]:
# Browser-like User-Agent header passed along with the HTTP requests.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0'}
# Home page of the site; the category sidebar is read from it.
home_url = 'https://books.toscrape.com/index.html'
# Names of the categories to scrape (matched against the sidebar category names).
cats = ['Classics', 'Science Fiction', 'Humor', 'Business']

# 1. Functions

In [11]:
def categories_pages_urls(home_url, headers):
    """Build a clean table of the sidebar categories, their URLs and book counts.

    Categories are looked up by name later on, so the notebook keeps working
    when the order of the categories in the sidebar changes.

    Parameters
    ----------
    home_url : str
        URL of the home page that carries the category sidebar.
    headers : dict
        HTTP headers sent with every request.

    Returns
    -------
    pandas.DataFrame
        One row per category, indexed 0..n-1, with the columns ``category``
        (name without surrounding whitespace), ``category_url`` (the href
        exactly as found in the sidebar) and ``books_in_category`` (int).

    Raises
    ------
    requests.HTTPError
        If the home page or a category page answers with an error status.
    ValueError
        If the home page has no ``<ul class="nav nav-list">`` sidebar.
    """
    def fetch_soup(url):
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    sidebar = fetch_soup(home_url).find('ul', class_='nav nav-list')
    if sidebar is None:
        raise ValueError(f'No category sidebar found at {home_url}')

    # The first sidebar link leads back to the home page, not to a category.
    category_links = sidebar.find_all('a')[1:]

    records = []
    for link in category_links:
        category_url = link.get('href')

        # Sidebar hrefs are relative; resolve them against the page they came from.
        category_soup = fetch_soup(urljoin(home_url, category_url))

        # The first <strong> inside the results form holds the number of books.
        count_tag = category_soup.find('form', class_='form-horizontal').find('strong')

        records.append({
            # strip=True removes the line breaks and indentation around the name.
            'category': link.get_text(strip=True),
            'category_url': category_url,
            'books_in_category': int(count_tag.get_text(strip=True)),
        })

    # Build the frame once from the collected records; passing the columns
    # explicitly keeps the schema stable even when no category was found.
    return pd.DataFrame(records, columns=['category', 'category_url', 'books_in_category'])

In [12]:
def cats_wish_to_scrap(df_cats_urls_qtys, cats, pagination):
    """Select the categories to scrape and add the number of listing pages of each.

    Parameters
    ----------
    df_cats_urls_qtys : pandas.DataFrame
        Return value of ``categories_pages_urls``; must contain the columns
        ``category`` and ``books_in_category``.
    cats : list of str
        Names of the categories to keep.
    pagination : int
        Number of books shown per listing page.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df_cats_urls_qtys`` (original index kept) plus an
        integer ``pages`` column equal to ceil(books_in_category / pagination).
        The input frame is left untouched.

    Raises
    ------
    ValueError
        If ``pagination`` is not a positive number.
    """
    if pagination <= 0:
        raise ValueError('pagination must be a positive number of books per page')

    # Filter the frame that was passed in (not a notebook global) and copy it so
    # the new column is written to an independent frame rather than to a slice.
    selected = df_cats_urls_qtys['category'].isin(cats)
    wish = df_cats_urls_qtys[selected].copy()

    # A partially filled last page still counts as a page, hence the ceiling.
    wish['pages'] = (
        wish['books_in_category']
        .apply(lambda books: math.ceil(books / pagination))
        .astype(int)
    )

    return wish

In [289]:
def create_worklist_to_scrap(worklist, headers=None):
    """Collect the URL of every book listed in the selected categories.

    Parameters
    ----------
    worklist : pandas.DataFrame
        Return value of ``cats_wish_to_scrap``; only its ``category_url``
        column (a path relative to the site root) is read.
    headers : dict, optional
        HTTP headers sent with every request. When omitted, the notebook-level
        ``headers`` variable is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``title_url`` (absolute URL of a book page) and ``category``
        (text of the listing page's ``<h1>``), one row per distinct book link,
        in the order the links appear. Empty, with the same columns, when
        ``worklist`` has no rows.

    Raises
    ------
    requests.HTTPError
        If a listing page answers with an error status.
    """
    if headers is None:
        # Backward compatible: fall back to the notebook-level settings cell.
        headers = globals().get('headers')

    base_url = 'https://books.toscrape.com/'
    records = []

    for category_url in worklist['category_url']:
        page_url = urljoin(base_url, category_url)
        visited_pages = set()
        # dict keys de-duplicate the links (cover image and title share one href)
        # while keeping page order, which a set would not.
        book_urls = {}
        category = None

        while page_url and page_url not in visited_pages:
            visited_pages.add(page_url)

            page = requests.get(page_url, headers=headers, timeout=30)
            page.raise_for_status()
            soup = BeautifulSoup(page.text, 'html.parser')

            listing = soup.find('div', class_='col-sm-8 col-md-9')
            if category is None:
                category = listing.find('h1').get_text()

            for anchor in listing.find_all('a'):
                href = anchor.get('href')
                # Book links climb out of the category folder ('../../../<book>/index.html');
                # anything else found in the listing (e.g. pager links) is not a book.
                if href and href.startswith('../'):
                    book_urls[urljoin(page_url, href)] = None

            # NOTE(review): assumes the pager marks the following page with
            # <li class="next"><a href="..."> - confirm against the live markup.
            # Without such an element only the first page is read, as before.
            next_item = listing.find('li', class_='next')
            next_link = next_item.find('a') if next_item else None
            page_url = urljoin(page_url, next_link.get('href')) if next_link else None

        records.extend({'title_url': url, 'category': category} for url in book_urls)

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x, and
    # explicit columns keep the schema stable for an empty worklist.
    return pd.DataFrame(records, columns=['title_url', 'category'])

# 2. Scraping

## 2.1 Website structure scraping

### 2.1.1 Get the categories in the Sidebar, their URLs and sizes for pagination

This keeps the script working even if the layout (i.e. the order of the categories) changes in the future.

In [13]:
# Build the clean dataframe of categories (name, URL, number of books) from the home page sidebar.
df_categories_urls = categories_pages_urls(home_url=home_url, headers=headers)

In [14]:
# Spot-check three random rows of the categories table.
df_categories_urls.sample(3)

Unnamed: 0,category,category_url,books_in_category
15,Sports and Games,catalogue/category/books/sports-and-games_17/i...,5
47,Cultural,catalogue/category/books/cultural_49/index.html,1
16,Add a comment,catalogue/category/books/add-a-comment_18/inde...,67


In [17]:
# Keep only the categories listed in `cats` and add the page count of each,
# computed with pagination=20 books per page.
worklist = cats_wish_to_scrap(cats=cats, df_cats_urls_qtys=df_categories_urls, pagination=20)

Unnamed: 0,category,category_url,books_in_category,pages
33,Business,catalogue/category/books/business_35/index.html,12,1
4,Classics,catalogue/category/books/classics_6/index.html,19,1
14,Science Fiction,catalogue/category/books/science-fiction_16/in...,16,1


### 2.1.2 Get the titles in each category

From these categories, we need to fetch the book titles from their pages.

In [291]:
# Collect the book-page URLs of every category in the worklist.
books_to_scrap = create_worklist_to_scrap(worklist=worklist)
# Spot-check five random rows.
books_to_scrap.sample(5)

Unnamed: 0,title_url,category
50,https://books.toscrape.com/catalogue/the-10-en...,Business
20,https://books.toscrape.com/catalogue/having-th...,Science Fiction
53,https://books.toscrape.com/catalogue/quench-yo...,Business
56,https://books.toscrape.com/catalogue/born-for-...,Business
19,https://books.toscrape.com/catalogue/soft-apoc...,Science Fiction


### 2.1.3 Scrape each book page for the remaining information needed