# Challenge Web Scraping
Explore the site: http://books.toscrape.com 

Task: 

Make a dataframe consisting of each book title, price, and availability

Export the data as a csv



## Make request

In [1]:
import requests

url = 'http://books.toscrape.com'
# requests has no default timeout; without one this cell would wait forever
# if the site never answers. 10 seconds is generous for a single page.
response = requests.get(url, timeout=10)

In [2]:
# Check status code
response.status_code # Every response includes a status code.


200

In [3]:
# Preview the content of the response
response.content[:1000]

b'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->\n    <head>\n        <title>\n    All products | Books to Scrape - Sandbox\n</title>\n\n        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />\n        <meta name="created" content="24th Jun 2016 09:29" />\n        <meta name="description" content="" />\n        <meta name="viewport" content="width=device-width" />\n        <meta name="robots" content="NOARCHIVE,NOCACHE" />\n\n        <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->\n        <!--[if lt IE 9]>\n        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>\n        <![endif]-->\n\n        \n            <link rel="shortcut icon" hre

## Make Soup Object - Parsing HTML with BeautifulSoup


In [4]:
from bs4 import BeautifulSoup

# Name the parser explicitly. Without it, BeautifulSoup emits a warning and
# picks whichever parser happens to be installed, so the parsed tree can
# differ from one machine to another. 'html.parser' ships with Python.
soup = BeautifulSoup(response.content, 'html.parser')

# .prettify() returns the parsed document as an indented string;
# print it so the line breaks are rendered instead of shown as "\n".
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [5]:
# Printing renders the line breaks, which makes the HTML easier to read.
# Slice the string so that only the first 1000 characters are shown.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

## Navigate the soup

In [6]:
# Find the title
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

In [7]:
# We can get just the content of the title without the tags by adding .text
soup.title.text

'\n    All products | Books to Scrape - Sandbox\n'

In [8]:
# Find the body
soup.body

<body class="default" id="default">
<header class="header container-fluid">
<div class="page_inner">
<div class="row">
<div class="col-sm-8 h1"><a href="index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
</div>
</div>
</header>
<div class="container-fluid page">
<div class="page_inner">
<ul class="breadcrumb">
<li>
<a href="index.html">Home</a>
</li>
<li class="active">All products</li>
</ul>
<div class="row">
<aside class="sidebar col-sm-4 col-md-3">
<div id="promotions_left">
</div>
<div class="side_categories">
<ul class="nav nav-list">
<li>
<a href="catalogue/category/books_1/index.html">
                            
                                Books
                            
                        </a>
<ul>
<li>
<a href="catalogue/category/books/travel_2/index.html">
                            
                                Travel
                            
                        </a>
</li>
<li>
<a href="catalogue/category/books/mystery_3/in

In [9]:
# Find all h1 tags
h1_headers = soup.find_all('h1')
len(h1_headers)

1

In [10]:
h1_headers

[<h1>All products</h1>]

In [11]:
# Find every <article> tag on the page and count them
book_containers = soup.find_all('article')
len(book_containers)

20

In [12]:
# Keep only the <article> tags whose class is "product_pod" and count them
book_containers = soup.find_all('article', class_='product_pod')
len(book_containers)

20

In [13]:
# Take the first product_pod container and display it, so we can see
# where each piece of information sits inside it
product_div = book_containers[0]
product_div

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [14]:
# Render the container's HTML inline in the notebook instead of showing its source
from IPython.display import HTML
HTML(str(product_div))

Let's extract each piece of information separately, except for the image of the book.

### Extract the Book title

In [15]:
book = product_div.find('h3')
book

<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>

In [16]:
# The link text is truncated ("A Light in the ..."), so read the full title
# from the <a> tag's title attribute instead
book_title = book.a['title']

In [17]:
book_title

'A Light in the Attic'

In [18]:
title = product_div.h3.a['title']
title

'A Light in the Attic'

### Extract the book price

In [19]:
# Locate the <div class="product_price"> inside the container and display it
product = product_div.find('div', attrs = {'class' : 'product_price'})
product

<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>

In [20]:
# .select() takes a CSS selector and returns a list of the matching tags
price = product.select('p.price_color')
price

[<p class="price_color">£51.77</p>]

In [21]:
price[0].text

'£51.77'

Now let's see how we can get the price in one step.

In [22]:
price = product_div.select('div p.price_color')[0].text
price

'£51.77'

### Extract the availability

In [23]:
availability = product.select('p.instock.availability')

In [24]:
availability

[<p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>]

In [25]:
# The tag's text is padded with newlines and spaces; .strip() removes them
availability[0].text.strip()

'In stock'

In [26]:
availability = product_div.select('div p.instock.availability')[0].text.strip()
availability

'In stock'

## Make a dictionary with all of the info

In [27]:
# Collect the title, price, and availability extracted above into one record

book_dict = {'title': title,
             'price' : price,
             'availability': availability}
            
book_dict

{'title': 'A Light in the Attic',
 'price': '£51.77',
 'availability': 'In stock'}

## Loop through all of the books

Create a list of dictionaries

In [28]:
# Create an empty list that will hold one dictionary per book
book_data_list = []

# Loop through each book container and build its dictionary.
# The loop uses its own names (book_container, book_record) so that
# product_div, title, price, availability and book_dict from the single-book
# example keep pointing at the first book if those cells are re-run.
for book_container in book_containers:
    book_record = {
        # full title is stored in the link's title attribute
        # (the visible link text is truncated with "...")
        'title': book_container.h3.a['title'],
        # price text, including the currency symbol
        'price': book_container.select('div p.price_color')[0].text,
        # availability text with the surrounding whitespace removed
        'availability': book_container.select('div p.instock.availability')[0].text.strip(),
    }
    print(book_record)

    book_data_list.append(book_record)

{'title': 'A Light in the Attic', 'price': '£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': '£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': '£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': '£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': '£54.23', 'availability': 'In stock'}
{'title': 'The Requiem Red', 'price': '£22.65', 'availability': 'In stock'}
{'title': 'The Dirty Little Secrets of Getting Your Dream Job', 'price': '£33.34', 'availability': 'In stock'}
{'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'price': '£17.93', 'availability': 'In stock'}
{'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'price': '£22.60', 'availability': 'In stock'}
{'title': 'The Black Maria', 'price': '£52.15', 'availability': 'In stock'}
{'title': 'Starving Hearts (Tri

In [29]:
# Confirm all books are included (should match the number of containers found)
len(book_data_list)

20

In [30]:
# Inspect a particular book (index 2 is the third entry)
book_data_list[2]

{'title': 'Soumission', 'price': '£50.10', 'availability': 'In stock'}

In [31]:
book_data_list

[{'title': 'A Light in the Attic',
  'price': '£51.77',
  'availability': 'In stock'},
 {'title': 'Tipping the Velvet',
  'price': '£53.74',
  'availability': 'In stock'},
 {'title': 'Soumission', 'price': '£50.10', 'availability': 'In stock'},
 {'title': 'Sharp Objects', 'price': '£47.82', 'availability': 'In stock'},
 {'title': 'Sapiens: A Brief History of Humankind',
  'price': '£54.23',
  'availability': 'In stock'},
 {'title': 'The Requiem Red', 'price': '£22.65', 'availability': 'In stock'},
 {'title': 'The Dirty Little Secrets of Getting Your Dream Job',
  'price': '£33.34',
  'availability': 'In stock'},
 {'title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'price': '£17.93',
  'availability': 'In stock'},
 {'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'price': '£22.60',
  'availability': 'In stock'},
 {'title': 'The Black Maria', 'price': '£52.15', 'availability': 

## Convert List of dictionaries to a dataframe

In [32]:
import pandas as pd

In [33]:
book_df = pd.DataFrame(book_data_list)
book_df.head()

Unnamed: 0,title,price,availability
0,A Light in the Attic,£51.77,In stock
1,Tipping the Velvet,£53.74,In stock
2,Soumission,£50.10,In stock
3,Sharp Objects,£47.82,In stock
4,Sapiens: A Brief History of Humankind,£54.23,In stock


In [34]:
# Export the df as a csv; index=False leaves the row index out of the file.
# NOTE(review): the relative path assumes a "Data" folder already exists one
# level above the working directory — confirm before running.
book_df.to_csv("../Data/book_data_scraped.csv", index=False)