#### Scraping the https://news.ycombinator.com/news front page, using requests and Beautiful Soup

In [3]:
import requests
import re
from bs4 import BeautifulSoup
articles = []
url = 'https://news.ycombinator.com/news'
r = requests.get(url)
html_soup = BeautifulSoup(r.text, 'html.parser')
for item in html_soup.find_all('tr', class_='athing'):
    item_a = item.find('a', class_='storylink')
    item_link = item_a.get('href') if item_a else None
    item_text = item_a.get_text(strip=True) if item_a else None
    next_row = item.find_next_sibling('tr')
    item_score = next_row.find('span', class_='score')
    item_score = item_score.get_text(strip=True) if item_score else '0 points'
    
    # We use regex here to find the correct element
    item_comments = next_row.find('a', string=re.compile('\d+(&nbsp;|\s)comment(s?)'))
    item_comments = item_comments.get_text(strip=True).replace('\xa0', ' ') \
    if item_comments else '0 comments'
    articles.append({'link' : item_link,'title' : item_text,'score' : item_score,'comments' : item_comments})

    
for article in articles:
    print(article)

{'link': 'https://pgdash.io/blog/postgres-12-generated-columns.html', 'title': 'PostgreSQL 12 – Generated Columns', 'score': '102 points', 'comments': '36 comments'}
{'link': 'https://weinholt.se/articles/new-r6rs-compiler/', 'title': 'A New R6RS Scheme Compiler', 'score': '38 points', 'comments': '6 comments'}
{'link': 'https://thistothat.com', 'title': 'This To That', 'score': '89 points', 'comments': '10 comments'}
{'link': 'https://arstechnica.com/tech-policy/2019/09/isps-worry-a-new-chrome-feature-will-stop-them-from-spying-on-you/', 'title': 'Big ISPs aren’t happy about Google’s plans for encrypted DNS', 'score': '447 points', 'comments': '315 comments'}
{'link': 'http://www.digitaljournal.com/science/40-000-year-old-bracelet-from-extinct-human-species-discovered/article/432798', 'title': '40k-year-old bracelet made by extinct human species found (2015)', 'score': '89 points', 'comments': '10 comments'}
{'link': 'https://techcrunch.com/2019/10/01/instagram-vs-tiktok/', 'title': '

#### Scraping the https://news.ycombinator.com/news front page, using requests and Beautiful Soup Using the Hacker News API

In [4]:
import requests
articles = []
url = 'https://hacker-news.firebaseio.com/v0'
top_stories = requests.get(url + '/topstories.json').json()

for story_id in top_stories:
    story_url = url + '/item/{}.json'.format(story_id)
    print('Fetching:', story_url)
    r = requests.get(story_url)
    story_dict = r.json()
    articles.append(story_dict)

for article in articles:
    print(article)

Fetching: https://hacker-news.firebaseio.com/v0/item/21134339.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21134540.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21110712.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21124900.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21109642.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21134016.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21132752.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21133839.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21119214.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21132770.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21131704.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21111711.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21130343.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21122112.json
Fetching: https://hacker-news.firebaseio.com/v0/item/21134545.

KeyboardInterrupt: 

####  We’re going to scrape http://quotes.toscrape.com, using requests and Beautiful Soup.

We’re going to scrape http://quotes.toscrape.com, using requests and Beautiful Soup.
We’ll store this information in a SQLite database. Instead of using the “records” library and writing manual SQL statements, we’re going to use the “dataset” library (see https://dataset.readthedocs.io/en/latest/).

##### Installing a dataset can be done easily through pip:
pip install -U dataset

In [5]:
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
db = dataset.connect('sqlite:///quotes.db')
authors_seen = set()

base_url = 'http://quotes.toscrape.com/'
def clean_url(url):
    # Clean '/author/Steve-Martin' to 'Steve-Martin'
    # Use urljoin to make an absolute URL
    url = urljoin(base_url, url)
    # Use urlparse to get out the path part
    path = urlparse(url).path
    # Now split the path by '/' and get the second part
    # E.g. '/author/Steve-Martin' -> ['', 'author', 'Steve-Martin']
    return path.split('/')[2]

def scrape_quotes(html_soup):
    for quote in html_soup.select('div.quote'):
        quote_text = quote.find(class_='text').get_text(strip=True)
        quote_author_url = clean_url(quote.find(class_='author').find_next_sibling('a').get('href'))
        quote_tag_urls = [clean_url(a.get('href'))
                          for a in quote.find_all('a', class_='tag')]
        authors_seen.add(quote_author_url)
        # Store this quote and its tags
        quote_id = db['quotes'].insert({ 'text' : quote_text,'author' : quote_author_url })
        db['quote_tags'].insert_many([{'quote_id' : quote_id, 'tag_id' : tag} for tag in quote_tag_urls])

def scrape_author(html_soup, author_id):
    author_name = html_soup.find(class_='author-title').get_text(strip=True)
    author_born_date = html_soup.find(class_='author-born-date').get_text(strip=True)
    author_born_loc = html_soup.find(class_='author-born-location').get_text(strip=True)
    author_desc = html_soup.find(class_='author-description').get_text(strip=True)
    db['authors'].insert({ 'author_id' : author_id,'name' : author_name,'born_date' : author_born_date,'born_location' : author_born_loc,'description' : author_desc})

# Start by scraping all the quote pages
url = base_url
while True:
    print('Now scraping page:', url)
    r = requests.get(url)
    html_soup = BeautifulSoup(r.text, 'html.parser')
    # Scrape the quotes
    scrape_quotes(html_soup)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    
    if not next_a or not next_a[0].get('href'):
        break
    url = urljoin(url, next_a[0].get('href'))

# Now fetch out the author information
for author_id in authors_seen:
    url = urljoin(base_url, '/author/' + author_id)
    print('Now scraping author:', url)
    r = requests.get(url)
    html_soup = BeautifulSoup(r.text, 'html.parser')
    # Scrape the author information
    scrape_author(html_soup, author_id)

Now scraping page: http://quotes.toscrape.com/
Now scraping page: http://quotes.toscrape.com/page/2/
Now scraping page: http://quotes.toscrape.com/page/3/
Now scraping page: http://quotes.toscrape.com/page/4/
Now scraping page: http://quotes.toscrape.com/page/5/
Now scraping page: http://quotes.toscrape.com/page/6/
Now scraping page: http://quotes.toscrape.com/page/7/
Now scraping page: http://quotes.toscrape.com/page/8/
Now scraping page: http://quotes.toscrape.com/page/9/
Now scraping page: http://quotes.toscrape.com/page/10/
Now scraping author: http://quotes.toscrape.com/author/Madeleine-LEngle
Now scraping author: http://quotes.toscrape.com/author/Haruki-Murakami
Now scraping author: http://quotes.toscrape.com/author/Marilyn-Monroe
Now scraping author: http://quotes.toscrape.com/author/Alexandre-Dumas-fils
Now scraping author: http://quotes.toscrape.com/author/Bob-Marley
Now scraping author: http://quotes.toscrape.com/author/George-Eliot
Now scraping author: http://quotes.toscrape

We’re going to scrape http://books.toscrape.com, using requests and Beautiful Soup.
This page is provided by Scrapinghub as a more realistic scraping playground. Take some time to explore the page. We’ll scrape out all the information, that is, for every book, we’ll obtain:
    • Its title;
    • Its image;
    • Its price and stock availability;
    • Its rating;
    • Its product description;
    • Other product information.
We’re going to store this information in an SQLite database, again using the “dataset” library. However, this time we’re going to write our program in such a way that it takes into account updates — so that we can run it multiple times without inserting duplicate records in the database.

In [10]:
import requests
import dataset
import re
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
db = dataset.connect('sqlite:///books.db')
base_url = 'http://books.toscrape.com/'

def scrape_books(html_soup, url):
    for book in html_soup.select('article.product_pod'):
        # For now, we'll only store the books url
        book_url = book.find('h3').find('a').get('href')
        book_url = urljoin(url, book_url)
        path = urlparse(book_url).path
        book_id = path.split('/')[2]
        # Upsert tries to update first and then insert instead
        db['books'].upsert({'book_id' : book_id,'last_seen' : datetime.now()}, ['book_id'])

def scrape_book(html_soup, book_id):
    main = html_soup.find(class_='product_main')
    book = {}
    book['book_id'] = book_id
    book['title'] = main.find('h1').get_text(strip=True)
    book['price'] = main.find(class_='price_color').get_text(strip=True)
    book['stock'] = main.find(class_='availability').get_text(strip=True)
    book['rating'] = ' '.join(main.find(class_='star-rating').get('class')).replace('star-rating', '').strip()
    book['img'] = html_soup.find(class_='thumbnail').find('img').get('src')
    
    desc = html_soup.find(id='product_description')
    book['description'] = ''

    if desc:
        book['description'] = desc.find_next_sibling('p').get_text(strip=True)
    info_table = html_soup.find(string='Product Information').find_next('table')
    for row in info_table.find_all('tr'):
        header = row.find('th').get_text(strip=True)
        # Since we'll use the header as a column, clean it a bit
        # to make sure SQLite will accept it
        header = re.sub('[^a-zA-Z]+', '_', header)
        value = row.find('td').get_text(strip=True)
        book[header] = value
        db['book_info'].upsert(book, ['book_id'])
        
# Scrape the pages in the catalogue
url = base_url
inp = input('Do you wish to re-scrape the catalogue (y/n)? ')
while True and inp == 'y':
    print('Now scraping page:', url)
    r = requests.get(url)
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_books(html_soup, url)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    if not next_a or not next_a[0].get('href'):
        break
    url = urljoin(url, next_a[0].get('href'))
                
# Now scrape book by book, oldest first
books = db['books'].find(order_by=['last_seen'])
for book in books:
    book_id = book['book_id']
    book_url = base_url + 'catalogue/{}'.format(book_id)
    print('Now scraping book:', book_url)
    r = requests.get(book_url)
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_book(html_soup, book_id)
    # Update the last seen timestamp
    db['books'].upsert({'book_id' : book_id,'last_seen' : datetime.now()}, ['book_id'])

Do you wish to re-scrape the catalogue (y/n)? y
Now scraping page: http://books.toscrape.com/
Now scraping page: http://books.toscrape.com/catalogue/page-2.html
Now scraping page: http://books.toscrape.com/catalogue/page-3.html
Now scraping page: http://books.toscrape.com/catalogue/page-4.html
Now scraping page: http://books.toscrape.com/catalogue/page-5.html
Now scraping page: http://books.toscrape.com/catalogue/page-6.html
Now scraping page: http://books.toscrape.com/catalogue/page-7.html
Now scraping page: http://books.toscrape.com/catalogue/page-8.html
Now scraping page: http://books.toscrape.com/catalogue/page-9.html
Now scraping page: http://books.toscrape.com/catalogue/page-10.html
Now scraping page: http://books.toscrape.com/catalogue/page-11.html
Now scraping page: http://books.toscrape.com/catalogue/page-12.html
Now scraping page: http://books.toscrape.com/catalogue/page-13.html
Now scraping page: http://books.toscrape.com/catalogue/page-14.html
Now scraping page: http://book

Now scraping book: http://books.toscrape.com/catalogue/unbound-how-eight-technologies-made-us-human-transformed-society-and-brought-our-world-to-the-brink_950
Now scraping book: http://books.toscrape.com/catalogue/tsubasa-world-chronicle-2-tsubasa-world-chronicle-2_949
Now scraping book: http://books.toscrape.com/catalogue/throwing-rocks-at-the-google-bus-how-growth-became-the-enemy-of-prosperity_948
Now scraping book: http://books.toscrape.com/catalogue/this-one-summer_947
Now scraping book: http://books.toscrape.com/catalogue/thirst_946
Now scraping book: http://books.toscrape.com/catalogue/the-torch-is-passed-a-harding-family-story_945
Now scraping book: http://books.toscrape.com/catalogue/the-secret-of-dreadwillow-carse_944
Now scraping book: http://books.toscrape.com/catalogue/the-pioneer-woman-cooks-dinnertime-comfort-classics-freezer-food-16-minute-meals-and-other-delicious-ways-to-solve-supper_943
Now scraping book: http://books.toscrape.com/catalogue/the-past-never-ends_942
No

Now scraping book: http://books.toscrape.com/catalogue/a-fierce-and-subtle-poison_875
Now scraping book: http://books.toscrape.com/catalogue/a-court-of-thorns-and-roses-a-court-of-thorns-and-roses-1_874
Now scraping book: http://books.toscrape.com/catalogue/unqualified-how-god-uses-broken-people-to-do-big-things_873
Now scraping book: http://books.toscrape.com/catalogue/you-are-what-you-love-the-spiritual-power-of-habit_872
Now scraping book: http://books.toscrape.com/catalogue/william-shakespeares-star-wars-verily-a-new-hope-william-shakespeares-star-wars-4_871
Now scraping book: http://books.toscrape.com/catalogue/tuesday-nights-in-1980_870
Now scraping book: http://books.toscrape.com/catalogue/tracing-numbers-on-a-train_869
Now scraping book: http://books.toscrape.com/catalogue/throne-of-glass-throne-of-glass-1_868
Now scraping book: http://books.toscrape.com/catalogue/thomas-jefferson-and-the-tripoli-pirates-the-forgotten-war-that-changed-american-history_867
Now scraping book: htt

Now scraping book: http://books.toscrape.com/catalogue/batman-the-long-halloween-batman_793
Now scraping book: http://books.toscrape.com/catalogue/batman-the-dark-knight-returns-batman_792
Now scraping book: http://books.toscrape.com/catalogue/ayumis-violin_791
Now scraping book: http://books.toscrape.com/catalogue/anonymous_790
Now scraping book: http://books.toscrape.com/catalogue/amy-meets-the-saints-and-sages_789
Now scraping book: http://books.toscrape.com/catalogue/amid-the-chaos_788
Now scraping book: http://books.toscrape.com/catalogue/amatus_787
Now scraping book: http://books.toscrape.com/catalogue/agnostic-a-spirited-manifesto_786
Now scraping book: http://books.toscrape.com/catalogue/zealot-the-life-and-times-of-jesus-of-nazareth_785
Now scraping book: http://books.toscrape.com/catalogue/you-you-1_784
Now scraping book: http://books.toscrape.com/catalogue/wonder-woman-earth-one-volume-one-wonder-woman-earth-one-1_783
Now scraping book: http://books.toscrape.com/catalogue/wi

Now scraping book: http://books.toscrape.com/catalogue/lady-midnight-the-dark-artifices-1_707
Now scraping book: http://books.toscrape.com/catalogue/its-all-easy-healthy-delicious-weeknight-meals-in-under-30-minutes_706
Now scraping book: http://books.toscrape.com/catalogue/island-of-dragons-unwanteds-7_705
Now scraping book: http://books.toscrape.com/catalogue/i-know-what-im-doing-and-other-lies-i-tell-myself-dispatches-from-a-life-under-construction_704
Now scraping book: http://books.toscrape.com/catalogue/i-am-pilgrim-pilgrim-1_703
Now scraping book: http://books.toscrape.com/catalogue/hyperbole-and-a-half-unfortunate-situations-flawed-coping-mechanisms-mayhem-and-other-things-that-happened_702
Now scraping book: http://books.toscrape.com/catalogue/hush-hush-hush-hush-1_701
Now scraping book: http://books.toscrape.com/catalogue/hold-your-breath-search-and-rescue-1_700
Now scraping book: http://books.toscrape.com/catalogue/hamilton-the-revolution_699
Now scraping book: http://books.

Now scraping book: http://books.toscrape.com/catalogue/murder-at-the-42nd-street-library-raymond-ambler-1_624
Now scraping book: http://books.toscrape.com/catalogue/most-wanted_623
Now scraping book: http://books.toscrape.com/catalogue/love-lies-and-spies_622
Now scraping book: http://books.toscrape.com/catalogue/how-to-speak-golf-an-illustrated-guide-to-links-lingo_621
Now scraping book: http://books.toscrape.com/catalogue/hide-away-eve-duncan-20_620
Now scraping book: http://books.toscrape.com/catalogue/furiously-happy-a-funny-book-about-horrible-things_619
Now scraping book: http://books.toscrape.com/catalogue/everyday-italian-125-simple-and-delicious-recipes_618
Now scraping book: http://books.toscrape.com/catalogue/equal-is-unfair-americas-misguided-fight-against-income-inequality_617
Now scraping book: http://books.toscrape.com/catalogue/eleanor-park_616
Now scraping book: http://books.toscrape.com/catalogue/dirty-dive-bar-1_615
Now scraping book: http://books.toscrape.com/catalo

Now scraping book: http://books.toscrape.com/catalogue/roller-girl_540
Now scraping book: http://books.toscrape.com/catalogue/rising-strong_539


KeyboardInterrupt: 