In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Exercise

Go to http://books.toscrape.com/. Using what you have learned, create a CSV file that contains all the books found on the website. The CSV file should contain the following:

- Title
- Price
- Description
- Availability

Code guides have been provided to help you create the web scraper. Found below is the `get_title_links_and_next_page` function, which returns two things: the book URLs on a page and the link to the next page. The idea here is to first collect all the book links available on the website and store them in the `title_links` variable **(5 points)**

In [6]:

# Root URL of the website being scraped.
base_url = "http://books.toscrape.com/"

def get_title_links_and_next_page(page_url, timeout=30):
    """Collect the book links on one listing page and the link to the next page.

    Args:
        page_url: Absolute URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(list_links, next_url)``. ``list_links`` holds one absolute
        URL for every ``<article>`` on the page that contains a link.
        ``next_url`` is the ``href`` of the "next" pagination button exactly
        as written in the page (i.e. relative to ``page_url``), or ``None``
        when the page has no such button.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # get the html for the url that was given
    page = requests.get(page_url, timeout=timeout)
    # Fail loudly: an error page parses to zero articles, which would
    # otherwise end the crawl early without any visible problem.
    page.raise_for_status()
    # parse the html file for beautifulsoup to query on
    soup = BeautifulSoup(page.text, 'html.parser')

    # this is where we store our links to the title
    list_links = []
    # inspecting the page we notice that the books are placed under
    # the article tag so we get all articles
    for article in soup.find_all('article'):
        anchor = article.find("a")
        if anchor is None or not anchor.get('href'):
            # an article without a usable link cannot be followed; skip it
            continue
        # hrefs are relative to the page they appear on, so resolve them
        # against page_url instead of guessing from a "catalogue" substring
        # (which breaks for "../" paths or slugs containing that word)
        list_links.append(urljoin(page_url, anchor['href']))

    # check if a next button is in the page; if none we return None
    next_item = soup.find('li', attrs={'class': 'next'})
    next_anchor = next_item.find("a") if next_item is not None else None
    next_url = next_anchor.get('href') if next_anchor is not None else None

    return (list_links, next_url)


# Crawl every listing page, starting from the site's front page, and collect
# the links to all book pages in `title_links`.
page_url = urljoin(base_url, "index.html")
title_links = []

# keep crawling for book links for as long as a page offers a "next" link
while page_url:
    page_links, next_href = get_title_links_and_next_page(page_url)
    title_links.extend(page_links)
    print(f"Next Page Loaded: {page_url}")
    print(f"title_links Update: {len(title_links)}")
    # The "next" href is relative to the page it was found on (with or
    # without a leading "catalogue/"), so resolve it against that page
    # rather than patching the path by hand.
    page_url = urljoin(page_url, next_href) if next_href else None

print(f"All Pages Scraped {len(title_links)} title links available.")

Next Page Loaded: http://books.toscrape.com/index.html
title_links Update: 20
Next Page Loaded: http://books.toscrape.com/catalogue/page-2.html
title_links Update: 40
Next Page Loaded: http://books.toscrape.com/catalogue/page-3.html
title_links Update: 60
Next Page Loaded: http://books.toscrape.com/catalogue/page-4.html
title_links Update: 80
Next Page Loaded: http://books.toscrape.com/catalogue/page-5.html
title_links Update: 100
Next Page Loaded: http://books.toscrape.com/catalogue/page-6.html
title_links Update: 120
Next Page Loaded: http://books.toscrape.com/catalogue/page-7.html
title_links Update: 140
Next Page Loaded: http://books.toscrape.com/catalogue/page-8.html
title_links Update: 160
Next Page Loaded: http://books.toscrape.com/catalogue/page-9.html
title_links Update: 180
Next Page Loaded: http://books.toscrape.com/catalogue/page-10.html
title_links Update: 200
Next Page Loaded: http://books.toscrape.com/catalogue/page-11.html
title_links Update: 220
Next Page Loaded: http:

Once you have a list of all the available book links, loop through the links and use the 4 functions `get_title`, `get_price`, `get_description`, and `get_availability` to retrieve the book information. **(10 points)**

In [None]:
def get_title(soup):
    """Return the book title from a parsed product page.

    The title is the text of the first ``<h1>`` found inside the first
    ``<div>`` whose class is ``product_main``.
    """
    product_main = soup.find('div', attrs={'class': 'product_main'})
    heading = product_main.find("h1")
    return heading.text

def get_price(soup):
    """Return the price text from a parsed product page.

    The price is the text of the first ``<p class="price_color">`` inside
    the first ``<div>`` whose class is ``product_main``, with every "Â"
    character removed.
    """
    product_main = soup.find('div', attrs={'class': 'product_main'})
    price_tag = product_main.find('p', attrs={'class': 'price_color'})
    # NOTE(review): the "Â" is presumably a charset-decoding artifact in
    # front of the currency sign -- confirm against the page encoding.
    return price_tag.text.replace("Â", "")

def get_description(soup):
    """Return the book description from a parsed product page.

    The description is the text of the first ``<p>`` that follows the
    ``<div id="product_description">`` marker in the document.

    Returns:
        The description text, or ``""`` when the page has no description
        marker or no paragraph follows it.
    """
    product_desc_marker = soup.find('div', attrs={'id': 'product_description'})
    if product_desc_marker is None:
        return ""
    # find_next is the current name of the deprecated camelCase findNext
    description = product_desc_marker.find_next('p')
    return description.text if description is not None else ""
    
def get_availability(soup):
    """Return the availability text from a parsed product page.

    Looks only at the direct children of the first ``<div>`` whose class is
    ``product_main`` for a ``<p class="availability">`` and returns its text
    as-is (surrounding whitespace is not stripped here).
    """
    product_main = soup.find('div', attrs={'class': 'product_main'})
    availability_tag = product_main.find(
        'p', attrs={'class': 'availability'}, recursive=False
    )
    return availability_tag.text

# Download every book page and collect one
# [title, price, description, availability] row per book in `book_data`.
book_data = []
# One Session reuses the HTTP connection across the many requests that all
# go to the same host.
with requests.Session() as session:
    for title_link in title_links:
        page = session.get(title_link, timeout=30)
        # Stop with a clear HTTP error instead of failing later on a
        # missing tag when the server returns an error page.
        page.raise_for_status()
        # Parse the raw bytes so BeautifulSoup works out the document's
        # encoding itself; page.text relies on requests' guess, which is
        # presumably what produced the stray "Â" characters in the text.
        soup = BeautifulSoup(page.content, 'html.parser')

        title = get_title(soup)
        price = get_price(soup)
        description = get_description(soup)
        availability = get_availability(soup)

        book_data.append([title, price, description, availability.strip()])

print(f"Book Information Scraping Done: {len(book_data)} books scraped")

In [None]:
# Build the table in one step. Naming the columns in the constructor also
# works when book_data is empty, where assigning df.columns afterwards raises.
df = pd.DataFrame(
    data=book_data,
    columns=['title', 'price', 'description', 'availability'],
)
display(df.head())

# save to csv file; index=False keeps pandas' row index out of the file so it
# holds only the four requested columns
df.to_csv("book-information.csv", index=False)

In [None]:
# Hi, I don't know why the last two cells won't run :( They ran a while ago.
