## Scraping non-tabular, multipage sites
Scrape the top 500 <a href="https://bestsellingalbums.org/decade/2010">best-selling albums of the 2010's</a>. Your data must include the following datapoints:

- Name of album
- Name of artist
- Number of albums sold 
- The link to the page that breaks down sales by country (found by clicking album title)



In [1]:
## create cells as needed
import requests ## what pulls down all the html from the server
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
url = "https://bestsellingalbums.org/decade/2010"
##scrape url website
response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
type(response)

requests.models.Response

In [5]:
response

<Response [200]>

In [6]:
response.text

'\r\n<!DOCTYPE html>\n<html lang="en-US" class="no-js">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width">\n    <link rel="profile" href="https://gmpg.org/xfn/11">\n    <link rel="pingback" href="https://bestsellingalbums.org/xmlrpc.php">\n    <!--[if lt IE 9]>\n    <script src="https://bestsellingalbums.org/wp-content/themes/twentyfifteen/js/html5.js?ver=3.7.0"></script>\n    <![endif]-->\n    <script>(function(html){html.className = html.className.replace(/\\bno-js\\b/,\'js\')})(document.documentElement);</script>\n\n    <!-- This site is optimized with the Yoast SEO plugin v14.5 - https://yoast.com/wordpress/plugins/seo/ -->\n    <title>Best-selling albums of 2010\'s</title>\n    <meta name="description" content="Best-selling albums of 2010\'s" />\n    <meta name="robots" content="index, follow" />\n    <meta name="googlebot" content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" />\n    <meta name="bingbot" c

In [7]:
type(response.text)

str

In [8]:
type(response.content)

bytes

In [9]:
soup = BeautifulSoup(response.text, "html.parser")

In [10]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <link href="https://gmpg.org/xfn/11" rel="profile"/>
  <link href="https://bestsellingalbums.org/xmlrpc.php" rel="pingback"/>
  <!--[if lt IE 9]>
    <script src="https://bestsellingalbums.org/wp-content/themes/twentyfifteen/js/html5.js?ver=3.7.0"></script>
    <![endif]-->
  <script>
   (function(html){html.className = html.className.replace(/\bno-js\b/,'js')})(document.documentElement);
  </script>
  <!-- This site is optimized with the Yoast SEO plugin v14.5 - https://yoast.com/wordpress/plugins/seo/ -->
  <title>
   Best-selling albums of 2010's
  </title>
  <meta content="Best-selling albums of 2010's" name="description">
   <meta content="index, follow" name="robots">
    <meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="googlebot"/>
    <meta content="index, follow, max-snippet:-1, max-image-

In [11]:
type(soup)

bs4.BeautifulSoup

In [12]:
soup.title

<title>Best-selling albums of 2010's</title>

In [13]:
type(soup.p)

bs4.element.Tag

In [14]:
soup.find("p")

<p class="site-title"><a href="https://bestsellingalbums.org/" rel="home">BestSellingAlbums.org</a></p>

In [46]:
albums = soup.find_all("div", class_="album")

In [50]:
artists = soup.find_all("div", class_="artist")

In [60]:
sales = soup.find_all("div", class_="sales")
type(sales)

bs4.element.ResultSet

In [54]:
albums

[<div class="album"><a href="https://bestsellingalbums.org/album/1034">21</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/1035">25</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/30524">CHRISTMAS</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/45488">1989</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/23318">PURPOSE</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/12876">DIVIDE</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/42961">FROZEN</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/23977">TEENAGE DREAM</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/12880">X</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/6777">DOO-WOPS &amp; HOOLIGANS</a></div>,
 <div class="album"><a href="https://bestsellingalbums.org/album/13756">RECOVERY</a></div

In [91]:
country_sales = soup.find_all("div", class_="album")
country_sales_lc = []
for country_sale in country_sales:
    link = country_sale.a.get("href")
    country_sales_lc.append(link)

country_sales_lc

['https://bestsellingalbums.org/album/1034',
 'https://bestsellingalbums.org/album/1035',
 'https://bestsellingalbums.org/album/30524',
 'https://bestsellingalbums.org/album/45488',
 'https://bestsellingalbums.org/album/23318',
 'https://bestsellingalbums.org/album/12876',
 'https://bestsellingalbums.org/album/42961',
 'https://bestsellingalbums.org/album/23977',
 'https://bestsellingalbums.org/album/12880',
 'https://bestsellingalbums.org/album/6777',
 'https://bestsellingalbums.org/album/13756',
 'https://bestsellingalbums.org/album/19810',
 'https://bestsellingalbums.org/album/39978',
 'https://bestsellingalbums.org/album/6778',
 'https://bestsellingalbums.org/album/45494',
 'https://bestsellingalbums.org/album/12875',
 'https://bestsellingalbums.org/album/12457',
 'https://bestsellingalbums.org/album/47839',
 'https://bestsellingalbums.org/album/5207',
 'https://bestsellingalbums.org/album/25786',
 'https://bestsellingalbums.org/album/6859',
 'https://bestsellingalbums.org/album/36

In [59]:
country_sales = []

for link in albums:
    country_sales.append(link.a.get("href"))

country_sales

['https://bestsellingalbums.org/album/1034',
 'https://bestsellingalbums.org/album/1035',
 'https://bestsellingalbums.org/album/30524',
 'https://bestsellingalbums.org/album/45488',
 'https://bestsellingalbums.org/album/23318',
 'https://bestsellingalbums.org/album/12876',
 'https://bestsellingalbums.org/album/42961',
 'https://bestsellingalbums.org/album/23977',
 'https://bestsellingalbums.org/album/12880',
 'https://bestsellingalbums.org/album/6777',
 'https://bestsellingalbums.org/album/13756',
 'https://bestsellingalbums.org/album/19810',
 'https://bestsellingalbums.org/album/39978',
 'https://bestsellingalbums.org/album/6778',
 'https://bestsellingalbums.org/album/45494',
 'https://bestsellingalbums.org/album/12875',
 'https://bestsellingalbums.org/album/12457',
 'https://bestsellingalbums.org/album/47839',
 'https://bestsellingalbums.org/album/5207',
 'https://bestsellingalbums.org/album/25786',
 'https://bestsellingalbums.org/album/6859',
 'https://bestsellingalbums.org/album/36

In [61]:
albums_lc = [album.get_text() for album in albums]
albums_lc

['21',
 '25',
 'CHRISTMAS',
 '1989',
 'PURPOSE',
 'DIVIDE',
 'FROZEN',
 'TEENAGE DREAM',
 'X',
 'DOO-WOPS & HOOLIGANS',
 'RECOVERY',
 'NIGHT VISIONS',
 'IN THE LONELY HOUR',
 'UNORTHODOX JUKEBOX',
 'RED',
 '+',
 'VIEWS',
 'BEAUTY BEHIND THE MADNESS',
 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?',
 'BORN THIS WAY',
 'MAP OF THE SOUL: 7',
 'BEERBONGS & BENTLEYS',
 'TAKE CARE',
 'SPEAK NOW',
 'PRISM',
 'BORN TO DIE',
 'LOUD',
 'ANTI',
 'BLURRYFACE',
 "HOLLYWOOD'S BLEEDING",
 'SCORPION',
 'STONEY',
 'TAKE ME HOME',
 'THE GREATEST SHOWMAN',
 'BEYONCÉ',
 'THE TRUTH ABOUT LOVE',
 'REPUTATION',
 '?',
 'TRAVELLER',
 'STARBOY',
 'UP ALL NIGHT',
 'MIDNIGHT MEMORIES',
 'MAP OF THE SOUL: PERSONA',
 'GOODBYE & GOOD RIDDANCE',
 'A HEAD FULL OF DREAMS',
 'THE HEIST',
 'THE MARSHALL MATHERS LP 2',
 'LOVER',
 'WATCH THE THRONE',
 "THIS ONE'S FOR YOU"]

In [62]:
artists_lc = [artist.get_text() for artist in artists]
artists_lc

['ADELE',
 'ADELE',
 'MICHAEL BUBLÉ',
 'TAYLOR SWIFT',
 'JUSTIN BIEBER',
 'ED SHEERAN',
 'SOUNDTRACK',
 'KATY PERRY',
 'ED SHEERAN',
 'BRUNO MARS',
 'EMINEM',
 'IMAGINE DRAGONS',
 'SAM SMITH',
 'BRUNO MARS',
 'TAYLOR SWIFT',
 'ED SHEERAN',
 'DRAKE',
 'THE WEEKND',
 'BILLIE EILISH',
 'LADY GAGA',
 'BTS (방탄소년단)',
 'POST MALONE',
 'DRAKE',
 'TAYLOR SWIFT',
 'KATY PERRY',
 'LANA DEL REY',
 'RIHANNA',
 'RIHANNA',
 'TWENTY ONE PILOTS',
 'POST MALONE',
 'DRAKE',
 'POST MALONE',
 'ONE DIRECTION',
 'SOUNDTRACK',
 'BEYONCÉ',
 'P!NK',
 'TAYLOR SWIFT',
 'XXXTENTACION',
 'CHRIS STAPLETON',
 'THE WEEKND',
 'ONE DIRECTION',
 'ONE DIRECTION',
 'BTS (방탄소년단)',
 'JUICE WRLD',
 'COLDPLAY',
 'MACKLEMORE & RYAN LEWIS',
 'EMINEM',
 'TAYLOR SWIFT',
 'JAY-Z & KANYE WEST',
 'LUKE COMBS']

In [72]:
sales_lc = [sale.get_text().replace("Sales: ","") for sale in sales]

['30,000,000',
 '23,000,000',
 '15,000,000',
 '14,748,116',
 '14,000,000',
 '13,787,460',
 '12,632,083',
 '12,134,000',
 '11,879,785',
 '11,270,000',
 '10,873,795',
 '9,616,263',
 '9,321,352',
 '8,976,749',
 '8,889,124',
 '7,705,000',
 '7,687,247',
 '7,584,588',
 '7,256,516',
 '7,166,944',
 '7,130,621',
 '7,116,118',
 '6,920,000',
 '6,917,500',
 '6,692,500',
 '6,674,983',
 '6,673,000',
 '6,537,235',
 '6,500,000',
 '6,461,665',
 '6,433,983',
 '6,371,355',
 '6,334,619',
 '6,318,119',
 '6,290,833',
 '6,231,084',
 '6,186,524',
 '6,182,852',
 '6,157,000',
 '6,070,666',
 '6,046,188',
 '6,020,087',
 '6,010,031',
 '6,002,713',
 '6,000,000',
 '5,858,500',
 '5,790,318',
 '5,686,733',
 '5,550,000',
 '5,490,000']

In [90]:
albums_lc

['21',
 '25',
 'CHRISTMAS',
 '1989',
 'PURPOSE',
 'DIVIDE',
 'FROZEN',
 'TEENAGE DREAM',
 'X',
 'DOO-WOPS & HOOLIGANS',
 'RECOVERY',
 'NIGHT VISIONS',
 'IN THE LONELY HOUR',
 'UNORTHODOX JUKEBOX',
 'RED',
 '+',
 'VIEWS',
 'BEAUTY BEHIND THE MADNESS',
 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?',
 'BORN THIS WAY',
 'MAP OF THE SOUL: 7',
 'BEERBONGS & BENTLEYS',
 'TAKE CARE',
 'SPEAK NOW',
 'PRISM',
 'BORN TO DIE',
 'LOUD',
 'ANTI',
 'BLURRYFACE',
 "HOLLYWOOD'S BLEEDING",
 'SCORPION',
 'STONEY',
 'TAKE ME HOME',
 'THE GREATEST SHOWMAN',
 'BEYONCÉ',
 'THE TRUTH ABOUT LOVE',
 'REPUTATION',
 '?',
 'TRAVELLER',
 'STARBOY',
 'UP ALL NIGHT',
 'MIDNIGHT MEMORIES',
 'MAP OF THE SOUL: PERSONA',
 'GOODBYE & GOOD RIDDANCE',
 'A HEAD FULL OF DREAMS',
 'THE HEIST',
 'THE MARSHALL MATHERS LP 2',
 'LOVER',
 'WATCH THE THRONE',
 "THIS ONE'S FOR YOU"]

In [92]:
albums_dict = []
for (albums, artists, sales, country_sales) in zip (albums_lc, artists_lc, sales_lc, country_sales_lc):
    albums_dict.append({
        "Album": albums,
        "Artist": artists,
        "Sales": sales,
        "Sale breakdown": country_sales
    })

albums_dict

[{'Album': '21',
  'Artist': 'ADELE',
  'Sales': '30,000,000',
  'Sale breakdown': 'https://bestsellingalbums.org/album/1034'},
 {'Album': '25',
  'Artist': 'ADELE',
  'Sales': '23,000,000',
  'Sale breakdown': 'https://bestsellingalbums.org/album/1035'},
 {'Album': 'CHRISTMAS',
  'Artist': 'MICHAEL BUBLÉ',
  'Sales': '15,000,000',
  'Sale breakdown': 'https://bestsellingalbums.org/album/30524'},
 {'Album': '1989',
  'Artist': 'TAYLOR SWIFT',
  'Sales': '14,748,116',
  'Sale breakdown': 'https://bestsellingalbums.org/album/45488'},
 {'Album': 'PURPOSE',
  'Artist': 'JUSTIN BIEBER',
  'Sales': '14,000,000',
  'Sale breakdown': 'https://bestsellingalbums.org/album/23318'},
 {'Album': 'DIVIDE',
  'Artist': 'ED SHEERAN',
  'Sales': '13,787,460',
  'Sale breakdown': 'https://bestsellingalbums.org/album/12876'},
 {'Album': 'FROZEN',
  'Artist': 'SOUNDTRACK',
  'Sales': '12,632,083',
  'Sale breakdown': 'https://bestsellingalbums.org/album/42961'},
 {'Album': 'TEENAGE DREAM',
  'Artist': 'KAT

In [93]:
pd.DataFrame(albums_dict)

Unnamed: 0,Album,Artist,Sales,Sale breakdown
0,21,ADELE,30000000,https://bestsellingalbums.org/album/1034
1,25,ADELE,23000000,https://bestsellingalbums.org/album/1035
2,CHRISTMAS,MICHAEL BUBLÉ,15000000,https://bestsellingalbums.org/album/30524
3,1989,TAYLOR SWIFT,14748116,https://bestsellingalbums.org/album/45488
4,PURPOSE,JUSTIN BIEBER,14000000,https://bestsellingalbums.org/album/23318
5,DIVIDE,ED SHEERAN,13787460,https://bestsellingalbums.org/album/12876
6,FROZEN,SOUNDTRACK,12632083,https://bestsellingalbums.org/album/42961
7,TEENAGE DREAM,KATY PERRY,12134000,https://bestsellingalbums.org/album/23977
8,X,ED SHEERAN,11879785,https://bestsellingalbums.org/album/12880
9,DOO-WOPS & HOOLIGANS,BRUNO MARS,11270000,https://bestsellingalbums.org/album/6777


In [94]:
import time
from random import randrange

In [97]:
url = "https://bestsellingalbums.org/decade/2010"

In [106]:
pageurl = "https://bestsellingalbums.org/decade/2010"

In [107]:
df_list = pd.read_html(pageurl)
df_list

ImportError: Missing optional dependency 'html5lib'.  Use pip or conda to install html5lib.

In [102]:
base_url = "https://bestsellingalbums.org/decade/2010-"

In [103]:
counter = 1
total_pages = 10
df_all = []
broken_links = []
for url_number in range (2, total_pages):
    print(f"Scraping link {counter} of {total_pages}")
    counter+=1
    link = f"{base_url}{url_number}"

    try:
        df_list = pd.read.html(link)
        df_all.append(df_list[2])

    except:
        print(f"Something is wrong with {link}!")
        broken_links.append(link)

    finally:
        snooze = randrange (5,7)
        print(f"snoozing for {snooze} seconds before scraping next link.")
        time.sleep(snooze)

    print("Done scraping")

Scraping link 1 of 10
Something is wrong with https://bestsellingalbums.org/decade/2010-2!
snoozing for 6 seconds before scraping next link.
Done scraping
Scraping link 2 of 10
Something is wrong with https://bestsellingalbums.org/decade/2010-3!
snoozing for 5 seconds before scraping next link.


KeyboardInterrupt: 