# 0. Imports and Settings

In [1]:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import math

pd.options.mode.chained_assignment = None 

In [2]:
#The headers for us to look real
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0'}
#The home page
home_url = 'https://books.toscrape.com/index.html'
#List of categories you wish to scrap
cats = ['Classics', 'Science Fiction', 'Humor', 'Business']

# 1. Functions

In [11]:
#This function is to create a clean data-set of the categories, their URLs and number of books in each.
#This is important to prevent the code from stop working in the case of layout (i.e. order) changes in the webpage.

def categories_pages_urls (home_url, headers):
    
    #Access the page and store it
    page = requests.get( home_url, headers=headers )

    #Parse the homepage using bs4
    soup = BeautifulSoup( page.text, 'html.parser' )

    #This is where I found the list including all the categories in the sidebar
    categories_list = soup.find( 'ul', class_='nav nav-list' ).find_all('a')

    #First, I will create a set with the urls heading to the categories pages.
    categories_urls = [p.get('href') for p in categories_list]

    #Transform this list in a dataframe, excluding line one which goes for the homepage.
    categories_urls_df = pd.DataFrame(categories_urls).iloc[1:]

    #Now, I need to get a list of the categories themselves to concatenate with the previous list.
    #Using split '\n' because there was some weird spacing back there in the html
    categories_titles = [p.get_text().split('\n') for p in categories_list]

    #Transforming it in a dataframe and dropping alien columns
    categories_titles_df = pd.DataFrame(categories_titles).iloc[1:, [False, False, True, False, False]]

    #Move the tables together and drop more alien columns
    categories_location = pd.concat([categories_titles_df, categories_urls_df], axis = 1).reset_index().iloc[:, [False, True, True]]
    categories_location.columns = ['category', 'category_url']

    #remove weird 32 spacing from category names
    for i in range(len(categories_location)):
        categories_location['category'][i] = categories_location['category'][i][32:]

    categories_location['books_in_category'] = 0

    #Now, I will use a loop to get the quantity of books in each category (we are going to need it later)
    for i in range(len(categories_location)):

        url_get_cat_number = 'https://books.toscrape.com/'+categories_location.iloc[i]['category_url']
        page = requests.get( url_get_cat_number, headers=headers )
        soup = BeautifulSoup( page.text, 'html.parser' )
        books_in_cat = soup.find( 'form', class_='form-horizontal' ).find('strong')
        qty = int([p.get_text('strong') for p in books_in_cat][0])
        categories_location.loc[i, 'books_in_category'] = qty
        
    return categories_location

In [12]:
#The input to this function (df_cats_urls_qtys) should be the return of categories_pages_urls.
#cats should be a list of categories to be scrapped
def cats_wish_to_scrap(df_cats_urls_qtys, cats, pagination):
    
    #Filter
    boolean = df_categories_urls.category.isin(cats)
    wish = df_categories_urls[boolean]
    
    #Add pagination info, we will need it later
    wish['pages'] = 0
    wish['pages'] = wish['books_in_category']/pagination
    wish['pages'] = wish['pages'].apply(lambda x: int(math.ceil(x)))
    wish
    
    return wish

# 2. Scrapping

## 2.1 Website structure scrapping

### 2.1.1 Get the categories in the Sidebar, their URLs and sizes for pagination

This is to prevent that the script won't stop working case the layout (i.e. the order of the categories) is changed in the future.

In [13]:
#Use the defined function to get the clean dataframe of categories
df_categories_urls = categories_pages_urls(home_url=home_url, headers=headers)

In [14]:
df_categories_urls.sample(3)

Unnamed: 0,category,category_url,books_in_category
15,Sports and Games,catalogue/category/books/sports-and-games_17/i...,5
47,Cultural,catalogue/category/books/cultural_49/index.html,1
16,Add a comment,catalogue/category/books/add-a-comment_18/inde...,67


In [17]:
#Select only the ones we wish to work on and add pagination info
worklist = cats_wish_to_scrap(cats=cats, df_cats_urls_qtys=df_categories_urls, pagination=20)
worklist.sample(3)

Unnamed: 0,category,category_url,books_in_category,pages
33,Business,catalogue/category/books/business_35/index.html,12,1
4,Classics,catalogue/category/books/classics_6/index.html,19,1
14,Science Fiction,catalogue/category/books/science-fiction_16/in...,16,1


### 2.1.2 Get the titles in each category

From these categories, we need to fetch the books titles from their pages

In [27]:
#let's start with Classics from worklist
url_titles = 'http://books.toscrape.com/'+ worklist.iloc[1, 1]
url_titles

#Access the page and store it
page = requests.get( url_titles, headers=headers )
    
#Parse the catalogue page using the html.parser
soup = BeautifulSoup( page.text, 'html.parser' )



In [47]:
titles = soup.find('div', class_='col-sm-8 col-md-9')

In [108]:
category_is = titles.find_all('h1')
c = category_is[0].get_text()
c

'Science Fiction'

In [159]:
title_is = titles.find_all('a')
titles_titles = set([p.get('title') for p in title_is])
titles_titles

{'Arena',
 'Do Androids Dream of Electric Sheep? (Blade Runner #1)',
 'Dune (Dune #1)',
 'Foundation (Foundation (Publication Order) #1)',
 "Having the Barbarian's Baby (Ice Planet Barbarians #7.5)",
 'Join',
 "Life, the Universe and Everything (Hitchhiker's Guide to the Galaxy #3)",
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 None,
 'Ready Player One',
 'Sleeping Giants (Themis Files #1)',
 'Soft Apocalypse',
 'The Last Girl (The Dominion Trilogy #1)',
 'The Project',
 "The Restaurant at the End of the Universe (Hitchhiker's Guide to the Galaxy #2)",
 'Three Wishes (River of Time: California #1)',
 "William Shakespeare's Star Wars: Verily, A New Hope (William Shakespeare's Star Wars #4)"}

In [157]:
url_is = titles.find_all('a')
titles_url = set([p.get('href') for p in url_is])
titles_url 

{'../../../arena_587/index.html',
 '../../../do-androids-dream-of-electric-sheep-blade-runner-1_149/index.html',
 '../../../dune-dune-1_151/index.html',
 '../../../foundation-foundation-publication-order-1_375/index.html',
 '../../../having-the-barbarians-baby-ice-planet-barbarians-75_23/index.html',
 '../../../join_902/index.html',
 '../../../life-the-universe-and-everything-hitchhikers-guide-to-the-galaxy-3_189/index.html',
 '../../../mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html',
 '../../../ready-player-one_209/index.html',
 '../../../sleeping-giants-themis-files-1_604/index.html',
 '../../../soft-apocalypse_833/index.html',
 '../../../the-last-girl-the-dominion-trilogy-1_70/index.html',
 '../../../the-project_856/index.html',
 '../../../the-restaurant-at-the-end-of-the-universe-hitchhikers-guide-to-the-galaxy-2_275/index.html',
 '../../../three-wishes-river-of-time-california-1_89/index.html',
 '../../../william-shakespeares-star-wars-verily-a-new-hope-willia

In [149]:
#===== WHY IS THIS ORDERED DIFFERENTLY THAN THE LIST?======
a = pd.DataFrame(titles_titles).reset_index()
a.head()

Unnamed: 0,index,0
0,0,Having the Barbarian's Baby (Ice Planet Barbar...
1,1,Sleeping Giants (Themis Files #1)
2,2,The Last Girl (The Dominion Trilogy #1)
3,3,Join
4,4,Dune (Dune #1)


In [151]:
b = pd.DataFrame(titles_url).reset_index()
b.head()

Unnamed: 0,index,0
0,0,../../../soft-apocalypse_833/index.html
1,1,../../../having-the-barbarians-baby-ice-planet...
2,2,../../../arena_587/index.html
3,3,../../../dune-dune-1_151/index.html
4,4,../../../three-wishes-river-of-time-california...
