# 0. Imports and Settings

In [1]:
import math
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
# NOTE(review): this hides the warning, it does not make chained writes safe.
pd.options.mode.chained_assignment = None 

In [2]:
# Browser-like User-Agent header sent with the HTTP requests made below
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:95.0) Gecko/20100101 Firefox/95.0'}

# Home page of the site; the category sidebar is read from here
home_url = 'https://books.toscrape.com/index.html'

# Names of the categories to scrape (matched exactly against the sidebar names)
cats = ['Classics', 'Science Fiction', 'Humor', 'Business']

# 1. Functions

## 1.1 Fetch the Categories Available

In [3]:
def categories_pages_urls(home_url, headers):
    """Build a table of the sidebar categories, their URLs and their book counts.

    Categories are selected by name later on, so the result does not depend on
    the order in which the site happens to list them.

    Parameters
    ----------
    home_url : str
        URL of the home page whose sidebar lists the categories.
    headers : dict
        HTTP headers sent with every request.

    Returns
    -------
    pandas.DataFrame
        One row per category, indexed 0..n-1, with the columns ``category``
        (name without surrounding whitespace), ``category_url`` (the href
        exactly as found in the sidebar, i.e. relative to the home page) and
        ``books_in_category`` (int).

    Raises
    ------
    requests.HTTPError
        If the home page or a category page answers with an error status.
    """
    page = requests.get(home_url, headers=headers)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Every link of the sidebar list. The first link is skipped: it is not a
    # category of its own (the original code dropped it as the "homepage" row).
    category_links = soup.find('ul', class_='nav nav-list').find_all('a')[1:]

    records = []
    for link in category_links:
        href = link.get('href')

        # One request per category: the number of books is shown inside the
        # <strong> element of the results form on the category page.
        category_page = requests.get(urljoin(home_url, href), headers=headers)
        category_page.raise_for_status()
        category_soup = BeautifulSoup(category_page.text, 'html.parser')
        counter = category_soup.find('form', class_='form-horizontal').find('strong')

        records.append({
            # strip=True removes the newlines and indentation that surround the
            # name in the HTML, independent of how many spaces there are.
            'category': link.get_text(strip=True),
            'category_url': href,
            'books_in_category': int(counter.get_text(strip=True)),
        })

    return pd.DataFrame(records, columns=['category', 'category_url', 'books_in_category'])

## 1.2 Select only the desired categories

In [19]:
def cats_wish_to_scrap(df_cats_urls_qtys, cats, pagination):
    """Keep only the requested categories and add their number of listing pages.

    Parameters
    ----------
    df_cats_urls_qtys : pandas.DataFrame
        Category table as returned by ``categories_pages_urls``; it must contain
        the columns ``category`` and ``books_in_category``.
    cats : list of str
        Names of the categories to scrape.
    pagination : int
        Number of books shown on one listing page.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (original index labels kept) with an extra
        integer column ``pages`` = ceil(books_in_category / pagination).
        The input frame is left untouched.

    Raises
    ------
    ValueError
        If ``pagination`` is not positive.
    """
    if pagination <= 0:
        raise ValueError('pagination must be a positive number of books per page')

    # Filter on the frame that was passed in (not on a notebook global) and copy
    # the selection so that adding a column never writes into the caller's data.
    selected = df_cats_urls_qtys['category'].isin(cats)
    wish = df_cats_urls_qtys[selected].copy()

    # Pagination info: how many listing pages each category spans.
    wish['pages'] = (wish['books_in_category'] / pagination).apply(math.ceil).astype(int)

    # TODO (from the original author): category_url only points at the first
    # listing page; further pages would follow the pattern "../page-{i}.html".

    return wish

## 1.3  Fetch the URLs of every single book in the selected categories

In [5]:
def create_worklist_to_scrap(worklist, request_headers=None):
    """Collect the URL of every book listed in the selected categories.

    Parameters
    ----------
    worklist : pandas.DataFrame
        Return value of ``cats_wish_to_scrap``; only its ``category_url``
        column (hrefs relative to the site root) is read.
    request_headers : dict, optional
        HTTP headers for the requests. Defaults to the notebook-level
        ``headers`` defined in the settings cell.

    Returns
    -------
    pandas.DataFrame
        Columns ``title_url`` (absolute book URL) and ``category`` (the <h1>
        heading of the listing page), one row per book, indexed 0..n-1, in the
        order the books appear on the site. Empty (but with both columns) when
        ``worklist`` has no rows.

    Raises
    ------
    requests.HTTPError
        If a listing page answers with an error status.
    """
    if request_headers is None:
        request_headers = headers

    site_root = 'https://books.toscrape.com/'
    records = []

    for category_href in worklist['category_url']:
        page_url = urljoin(site_root, category_href)
        visited = set()  # guards against a pager that links back to a seen page

        while page_url is not None and page_url not in visited:
            visited.add(page_url)

            page = requests.get(page_url, headers=request_headers)
            page.raise_for_status()
            soup = BeautifulSoup(page.text, 'html.parser')

            listing = soup.find('div', class_='col-sm-8 col-md-9')
            category = listing.find('h1').get_text()

            # Only the title link of each product card: taking every <a> of the
            # listing would also pick up the pager link and each book twice
            # (cover image and title share the same href).
            for link in listing.select('article.product_pod h3 a'):
                records.append({
                    # hrefs look like "../../../<book>/index.html"; resolve them
                    # against the page they were found on.
                    'title_url': urljoin(page_url, link.get('href')),
                    'category': category,
                })

            # Follow the "next" pager link, if any, to reach the remaining pages.
            next_link = soup.select_one('li.next a')
            page_url = urljoin(page_url, next_link.get('href')) if next_link is not None else None

    df_worklist = pd.DataFrame(records, columns=['title_url', 'category'])
    return df_worklist.drop_duplicates().reset_index(drop=True)

## 1.4 Scrape the book pages and collect the data in a single dataframe

In [6]:
def books_scrapping(books_to_scrap, request_headers=None):
    """Scrape every book page and return the raw fields as one dataframe.

    Parameters
    ----------
    books_to_scrap : pandas.DataFrame
        Must contain the columns ``title_url`` (absolute book URL) and
        ``category``. Rows are processed in order; the index labels of the
        frame are irrelevant.
    request_headers : dict, optional
        HTTP headers for the requests. Defaults to the notebook-level
        ``headers`` defined in the settings cell.

    Returns
    -------
    pandas.DataFrame
        One row per book, indexed 0..n-1, with the columns ``scrap_time``,
        ``book_title``, ``book_category``, ``book_upc``, ``book_price``,
        ``book_availability`` and ``book_stars``. All values are the raw text
        found on the page; ``scrap_time`` is the same timestamp for every row.

    Raises
    ------
    requests.HTTPError
        If a book page answers with an error status.
    """
    if request_headers is None:
        request_headers = headers

    columns = ['scrap_time','book_title','book_category', 'book_upc', 'book_price', 'book_availability','book_stars']

    # One timestamp for the whole run, taken before the first request.
    scrap_time = datetime.now().strftime( '%Y-%m-%d %H:%M:%S' )

    records = []
    # Iterate over the values directly instead of looking rows up by a counter,
    # so frames with a non-default index (e.g. a filtered subset) work too.
    for book_url, book_category in zip(books_to_scrap['title_url'], books_to_scrap['category']):

        page = requests.get(book_url, headers=request_headers)
        page.raise_for_status()

        # NOTE(review): page.text uses the encoding requests infers; the sample
        # output of this notebook shows the pound sign arriving as 'Â£'. The
        # decoding is left as it was because data_processing cleans the price
        # text that this produces.
        soup = BeautifulSoup(page.text, 'html.parser')

        product_main = soup.find('div', class_="col-sm-6 product_main")

        # Product information table; cells are read by position:
        # 0 = UPC, 2 = price excl. tax, 5 = availability.
        info_cells = soup.find('table', class_="table table-striped").find_all('td')

        # The rating is the second CSS class of the star-rating paragraph
        # (e.g. class="star-rating Three").
        rating_tag = product_main.find('p', class_='star-rating')

        records.append({
            'scrap_time': scrap_time,
            'book_title': product_main.find('h1').get_text(),
            'book_category': book_category,
            'book_upc': info_cells[0].get_text(),
            'book_price': info_cells[2].get_text(),
            'book_availability': info_cells[5].get_text(),
            'book_stars': rating_tag.get('class')[1],
        })

    # Build the frame once from all records instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

## 1.5 Final transformations for a clean delivery

In [7]:
def data_processing(books_scrap):
    """Turn the raw scraped fields into clean columns for delivery.

    Parameters
    ----------
    books_scrap : pandas.DataFrame
        Return value of ``books_scrapping``; the columns ``book_stars``,
        ``book_price`` and ``book_availability`` are read.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified, so the call can be repeated)
        where

        * ``book_stars`` holds 1-5 instead of 'One'-'Five' (other values are
          kept as they are),
        * ``book_price`` is a float, parsed from the first number in the text,
          whatever currency prefix precedes it,
        * ``book_availability`` is replaced by ``book_in_stock`` (text before
          the parenthesis, stripped) and ``nr_available`` (first token inside
          the parenthesis, as text; missing when there is no parenthesis).
    """
    star_words = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}

    # Work on a copy so the caller's frame keeps its raw columns.
    deliver = books_scrap.copy()

    # Change string stars to numbers; assign the result back to the column
    # rather than replacing "in place" on a column selection.
    deliver['book_stars'] = deliver['book_stars'].map(lambda stars: star_words.get(stars, stars))

    # Change price to number. The currency prefix may be '£' or the mis-decoded
    # 'Â£', so the number is extracted instead of cutting a fixed prefix length.
    price_text = deliver['book_price'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    deliver['book_price'] = pd.to_numeric(price_text)

    # Split e.g. 'In stock (14 available)' into 'In stock' and '14'. A regular
    # expression with an optional second group always yields both columns, even
    # when no row contains a parenthesis.
    availability = deliver['book_availability'].str.extract(
        r'^(?P<book_in_stock>[^(]*)(?:\((?P<nr_available>[^ ]*))?'
    )

    deliver = deliver.drop(columns=['book_availability'])
    deliver['book_in_stock'] = availability['book_in_stock'].str.strip()
    deliver['nr_available'] = availability['nr_available']

    return deliver

# 2. Scraping

## 2.1 Website structure scraping

### 2.1.1 Get the categories in the Sidebar, their URLs and sizes for pagination

In [8]:
# Build the category table (name, URL, number of books) from the home page sidebar.
# This sends one HTTP request per category on top of the home page request.
df_categories_urls = categories_pages_urls(home_url=home_url, headers=headers)

In [9]:
# Spot-check five random rows (unseeded, so the rows shown differ between runs)
df_categories_urls.sample(5)

Unnamed: 0,category,category_url,books_in_category
47,Cultural,catalogue/category/books/cultural_49/index.html,1
11,Nonfiction,catalogue/category/books/nonfiction_13/index.html,110
21,Poetry,catalogue/category/books/poetry_23/index.html,19
9,Childrens,catalogue/category/books/childrens_11/index.html,29
48,Erotica,catalogue/category/books/erotica_50/index.html,1


In [10]:
# Keep only the categories listed in `cats` and add the page count,
# assuming 20 books per listing page.
worklist = cats_wish_to_scrap(cats=cats, df_cats_urls_qtys=df_categories_urls, pagination=20)

In [12]:
# Spot-check three random rows of the selection (unseeded sample)
worklist.sample(3)

Unnamed: 0,category,category_url,books_in_category,pages
4,Classics,catalogue/category/books/classics_6/index.html,19,1
28,Humor,catalogue/category/books/humor_30/index.html,10,1
14,Science Fiction,catalogue/category/books/science-fiction_16/in...,16,1


### 2.1.2 Get the titles in each category

In [13]:
# Collect the book URLs found on the listing pages of the selected categories
books_to_scrap = create_worklist_to_scrap(worklist=worklist)

In [14]:
# Spot-check five random book URLs (unseeded sample)
books_to_scrap.sample(5)

Unnamed: 0,title_url,category
1,https://books.toscrape.com/catalogue/animal-fa...,Classics
26,https://books.toscrape.com/catalogue/do-androi...,Science Fiction
18,https://books.toscrape.com/catalogue/little-wo...,Classics
39,https://books.toscrape.com/catalogue/naked_197...,Humor
21,https://books.toscrape.com/catalogue/william-s...,Science Fiction


### 2.1.3 Scrape each book page to retrieve the remaining information and build the dataset

In [15]:
# Request every book page and gather title, UPC, price, availability and rating as raw text
books_scrap = books_scrapping(books_to_scrap=books_to_scrap)

In [16]:
# Spot-check five random scraped books (unseeded sample; values are still raw text)
books_scrap.sample(5)

Unnamed: 0,scrap_time,book_title,book_category,book_upc,book_price,book_availability,book_stars
36,2021-12-27 18:37:06,"Hyperbole and a Half: Unfortunate Situations, ...",Humor,fbb21a2f71f4d981,Â£14.75,In stock (14 available),Five
27,2021-12-27 18:37:06,Three Wishes (River of Time: California #1),Science Fiction,460f97a346b44bac,Â£44.18,In stock (1 available),Two
53,2021-12-27 18:37:06,The Dirty Little Secrets of Getting Your Dream...,Business,2597b5a345f45e1b,Â£33.34,In stock (19 available),Four
2,2021-12-27 18:37:06,Sense and Sensibility,Classics,35e44170702139a4,Â£37.46,In stock (1 available),One
33,2021-12-27 18:37:06,Dune (Dune #1),Science Fiction,36d5884d44484277,Â£54.86,In stock (3 available),One


### 2.1.4 Transform the data and export the final dataset to .csv

In [17]:
# Clean the scraped columns for delivery: numeric stars, price without the
# currency prefix, availability split into stock status and quantity
df_deliver = data_processing(books_scrap=books_scrap)

In [18]:
# Spot-check five random rows of the final dataset (unseeded sample)
df_deliver.sample(5)

Unnamed: 0,scrap_time,book_title,book_category,book_upc,book_price,book_stars,book_in_stock,nr_available
30,2021-12-27 18:37:06,Join,Science Fiction,18ad3ed896c7ce05,35.67,5,In stock,16
27,2021-12-27 18:37:06,Three Wishes (River of Time: California #1),Science Fiction,460f97a346b44bac,44.18,2,In stock,1
8,2021-12-27 18:37:06,The Picture of Dorian Gray,Classics,b9f6305a3d650e2c,29.7,2,In stock,3
50,2021-12-27 18:37:06,The 10% Entrepreneur: Live Your Startup Dream ...,Business,56e4f9eab2e8e674,27.55,3,In stock,15
35,2021-12-27 18:37:06,Toddlers Are A**holes: It's Not Your Fault,Humor,4242cc3d0437db12,25.55,1,In stock,7


In [None]:
# Export the final dataset. The row index is only a positional counter, so it is
# left out; otherwise it ends up in the file as an extra, unnamed first column.
df_deliver.to_csv('CoffeeCookies-dataset.csv', index=False)