# 0. Imports and Settings

In [294]:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup

In [295]:
#Browser-like request headers, so the site treats the scraper as a regular visitor
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0',
}

#Entry page of the site being scraped
home_url = 'https://books.toscrape.com/index.html'

# 1. Functions

In [308]:
#This function builds a table of the categories, their URLs and the number of books in each.
#Reading the categories from the page keeps the code working if the layout (i.e. the order) of the webpage changes.

def categories_pages_urls(home_url, headers, timeout=30):
    """Build a table of the sidebar categories, their URLs and their sizes.

    The home page is downloaded and every link found in the sidebar
    (``<ul class="nav nav-list">``) except the first one is treated as a
    category. Each category page is then downloaded to read the number of
    books it reports.

    Args:
        home_url: URL of the page whose sidebar lists the categories.
        headers: HTTP headers sent with every request (may be ``None``).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per category, a default integer
        index, and the columns ``category`` (name), ``category_url`` (path
        relative to ``<site>/catalogue/category``, starting with ``/``) and
        ``books_in_category`` (int).

    Raises:
        requests.HTTPError: If any page answers with an error status.
        ValueError: If the sidebar or a category's book counter is missing.
    """
    #Local import so this cell does not depend on an edit to the imports cell
    from urllib.parse import urljoin

    columns = ['category', 'category_url', 'books_in_category']
    rows = []

    #One session reuses the connection for the home page and all category pages
    with requests.Session() as session:
        session.headers.update(headers or {})

        #Access the home page and parse it using bs4
        home_page = session.get(home_url, timeout=timeout)
        home_page.raise_for_status()
        home_soup = BeautifulSoup(home_page.text, 'html.parser')

        #This is where the list including all the categories lives in the sidebar
        sidebar = home_soup.find('ul', class_='nav nav-list')
        if sidebar is None:
            raise ValueError('No category sidebar found at ' + home_url)

        #The first link is not a category of its own, so it is skipped
        category_links = sidebar.find_all('a', href=True)[1:]

        #Every category page lives under this prefix of the site
        category_root = urljoin(home_page.url, '/catalogue/category')

        for link in category_links:
            #hrefs are relative to the page they were found on, so resolve them
            #against that page instead of slicing off a fixed number of characters
            category_page_url = urljoin(home_page.url, link.get('href'))

            #Keep only the part after the category root; a link pointing
            #anywhere else is stored as the full URL
            category_url = category_page_url
            if category_page_url.startswith(category_root):
                category_url = category_page_url[len(category_root):]

            #The number of books is the first <strong> inside the results form.
            #This is important to define whether pagination is needed for the category
            category_page = session.get(category_page_url, timeout=timeout)
            category_page.raise_for_status()
            category_soup = BeautifulSoup(category_page.text, 'html.parser')

            results_form = category_soup.find('form', class_='form-horizontal')
            counter = results_form.find('strong') if results_form is not None else None
            if counter is None:
                raise ValueError('No book counter found at ' + category_page_url)

            rows.append((
                #strip=True removes the line breaks and padding around the name
                link.get_text(strip=True),
                category_url,
                int(counter.get_text(strip=True)),
            ))

    #return a dataframe with categories, urls and number of books
    return pd.DataFrame(rows, columns=columns)

# 2. Scraping

## 2.1 Website structure scraping

### 2.1.1 Get the categories in the Sidebar, their URLs and sizes

This keeps the script working even if the layout (i.e. the order of the categories) changes in the future.

In [309]:
#Build the table of categories, their URLs and book counts from the home page
df_categories_urls = categories_pages_urls(home_url, headers)

In [310]:
#Show five random rows as a quick sanity check
df_categories_urls.sample(n=5)

Unnamed: 0,category,category_url,books_in_category
26,Parenting,/books/parenting_28/index.html,1
5,Philosophy,/books/philosophy_7/index.html,11
35,Thriller,/books/thriller_37/index.html,11
45,Health,/books/health_47/index.html,4
23,Art,/books/art_25/index.html,8
