## Scraping non-tabular, multipage sites
Scrape the top 500 <a href="https://bestsellingalbums.org/decade/2010">best-selling albums of the 2010s</a>. Your data must include the following data points:

- Name of album
- Name of artist
- Number of albums sold 
- The link to the page that breaks down sales by country (found by clicking album title)



### My Approach

- verify that the data is present on the page
- find the target tags and classes using the browser's Inspect Element tool
- scrape a single page to verify the approach
- iterate through all pages

In [1]:
## import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randrange

In [2]:
## Scrape one page of data points first (a later cell iterates through all pages)

## URL of the first results page for the 2010s decade chart

url = "https://bestsellingalbums.org/decade/2010"


In [3]:
## request site; the timeout keeps a stalled connection from hanging the cell forever
response = requests.get(url, timeout=30)
## last expression of the cell, so the notebook displays the response status
response

<Response [200]>

In [4]:
## Parse the response body (an HTML string) into a navigable BeautifulSoup tree,
## using Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=response.text, features="html.parser")


In [5]:
## Collect every album card on the page: each one is a <div> whose class is "album_card".
all_albums = soup.find_all("div", attrs={"class": "album_card"})

In [6]:
## Look at the raw HTML of the first album card to find the tags and classes to target
all_albums[0]

<div class="album_card"><div class="rank">1</div><div class="cover"><img class="pic" onerror="this.src='../includes/default.png';this.onerror='';" src="../covers/1034.jpg"/></div><div class="data_col"><div class="album"><a href="https://bestsellingalbums.org/album/1034">21</a></div><div class="artist"><a href="https://bestsellingalbums.org/artist/218" title="ADELE album sales">ADELE</a></div><div class="sales">Sales: 30,000,000</div><div class="rank_mobile">Rankings:</div><div class="ranks_row"><div class="ranks"><a href="https://bestsellingalbums.org/year/2011" title="Best-selling albums of 2011"><span class="ranks_desc_art">Rank in </span>2011</a> : 1</div><div class="ranks"><a href="https://bestsellingalbums.org/decade/2010" title="Best-selling albums of 2010's"><span class="ranks_desc_art">Rank in </span>2010's</a>: 1</div><div class="ranks"><a href="https://bestsellingalbums.org/overall" title="Best-selling albums of all time">Overall<span class="ranks_desc_art"> rank</span></a> :

In [7]:
## Titles and links live in the same card,
## so a single for loop collects both.

albums_list = []
albums_url_list = []
for card in all_albums:
    ## title: text of the <div class="album"> element
    title_div = card.find("div", class_="album")
    albums_list.append(title_div.get_text())
    ## link: href of the first anchor found in the card
    first_anchor = card.find("a")
    albums_url_list.append(first_anchor.get("href"))

In [8]:
## Print the collected album titles to eyeball the result
print(albums_list)

['21', '25', 'CHRISTMAS', '1989', 'PURPOSE', 'DIVIDE', 'FROZEN', 'TEENAGE DREAM', 'X', 'DOO-WOPS & HOOLIGANS', 'RECOVERY', 'NIGHT VISIONS', 'IN THE LONELY HOUR', 'UNORTHODOX JUKEBOX', 'RED', '+', 'VIEWS', 'BEAUTY BEHIND THE MADNESS', 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?', 'BORN THIS WAY', 'MAP OF THE SOUL: 7', 'BEERBONGS & BENTLEYS', 'TAKE CARE', 'SPEAK NOW', 'PRISM', 'BORN TO DIE', 'LOUD', 'ANTI', 'BLURRYFACE', "HOLLYWOOD'S BLEEDING", 'SCORPION', 'STONEY', 'TAKE ME HOME', 'THE GREATEST SHOWMAN', 'BEYONCÉ', 'THE TRUTH ABOUT LOVE', 'REPUTATION', '?', 'TRAVELLER', 'STARBOY', 'UP ALL NIGHT', 'MIDNIGHT MEMORIES', 'MAP OF THE SOUL: PERSONA', 'GOODBYE & GOOD RIDDANCE', 'A HEAD FULL OF DREAMS', 'THE HEIST', 'THE MARSHALL MATHERS LP 2', 'LOVER', 'WATCH THE THRONE', "THIS ONE'S FOR YOU"]


In [9]:
## Print the collected album links to eyeball the result
print(albums_url_list)

['https://bestsellingalbums.org/album/1034', 'https://bestsellingalbums.org/album/1035', 'https://bestsellingalbums.org/album/30524', 'https://bestsellingalbums.org/album/45488', 'https://bestsellingalbums.org/album/23318', 'https://bestsellingalbums.org/album/12876', 'https://bestsellingalbums.org/album/42961', 'https://bestsellingalbums.org/album/23977', 'https://bestsellingalbums.org/album/12880', 'https://bestsellingalbums.org/album/6777', 'https://bestsellingalbums.org/album/13756', 'https://bestsellingalbums.org/album/19810', 'https://bestsellingalbums.org/album/39978', 'https://bestsellingalbums.org/album/6778', 'https://bestsellingalbums.org/album/45494', 'https://bestsellingalbums.org/album/12875', 'https://bestsellingalbums.org/album/12457', 'https://bestsellingalbums.org/album/47839', 'https://bestsellingalbums.org/album/5207', 'https://bestsellingalbums.org/album/25786', 'https://bestsellingalbums.org/album/6859', 'https://bestsellingalbums.org/album/36763', 'https://bestse

In [10]:
## Artist names, this time built with a list comprehension
## instead of an explicit for loop.
artists_list = [
    card.find("div", class_="artist").get_text()
    for card in all_albums
]
print(artists_list)

['ADELE', 'ADELE', 'MICHAEL BUBLÉ', 'TAYLOR SWIFT', 'JUSTIN BIEBER', 'ED SHEERAN', 'SOUNDTRACK', 'KATY PERRY', 'ED SHEERAN', 'BRUNO MARS', 'EMINEM', 'IMAGINE DRAGONS', 'SAM SMITH', 'BRUNO MARS', 'TAYLOR SWIFT', 'ED SHEERAN', 'DRAKE', 'THE WEEKND', 'BILLIE EILISH', 'LADY GAGA', 'BTS (방탄소년단)', 'POST MALONE', 'DRAKE', 'TAYLOR SWIFT', 'KATY PERRY', 'LANA DEL REY', 'RIHANNA', 'RIHANNA', 'TWENTY ONE PILOTS', 'POST MALONE', 'DRAKE', 'POST MALONE', 'ONE DIRECTION', 'SOUNDTRACK', 'BEYONCÉ', 'P!NK', 'TAYLOR SWIFT', 'XXXTENTACION', 'CHRIS STAPLETON', 'THE WEEKND', 'ONE DIRECTION', 'ONE DIRECTION', 'BTS (방탄소년단)', 'JUICE WRLD', 'COLDPLAY', 'MACKLEMORE & RYAN LEWIS', 'EMINEM', 'TAYLOR SWIFT', 'JAY-Z & KANYE WEST', 'LUKE COMBS']


In [11]:
## Sales figures: read each card's sales text and store it as an integer
sales_list = []

for card in all_albums:
    raw_sales = card.find("div", class_="sales").get_text()  ## e.g. "Sales: 30,000,000"
    digits_only = raw_sales.replace("Sales: ", "").replace(",", "")  ## drop the label and the commas
    sales_list.append(int(digits_only))

In [12]:
## Print the sales figures to confirm they were converted to integers
print(sales_list)

[30000000, 23000000, 15000000, 14748116, 14000000, 13787460, 12632083, 12134000, 11879785, 11270000, 10873795, 9616263, 9321352, 8976749, 8889124, 7705000, 7687247, 7584588, 7256516, 7166944, 7130621, 7116118, 6920000, 6917500, 6692500, 6674983, 6673000, 6537235, 6500000, 6461665, 6433983, 6371355, 6334619, 6318119, 6290833, 6231084, 6186524, 6182852, 6157000, 6070666, 6046188, 6020087, 6010031, 6002713, 6000000, 5858500, 5790318, 5686733, 5550000, 5490000]


In [13]:
## Zip the four parallel lists so each album becomes one
## (artist, title, sales, link) tuple.
album_data = list(zip(artists_list, albums_list, sales_list, albums_url_list))

print(album_data)

[('ADELE', '21', 30000000, 'https://bestsellingalbums.org/album/1034'), ('ADELE', '25', 23000000, 'https://bestsellingalbums.org/album/1035'), ('MICHAEL BUBLÉ', 'CHRISTMAS', 15000000, 'https://bestsellingalbums.org/album/30524'), ('TAYLOR SWIFT', '1989', 14748116, 'https://bestsellingalbums.org/album/45488'), ('JUSTIN BIEBER', 'PURPOSE', 14000000, 'https://bestsellingalbums.org/album/23318'), ('ED SHEERAN', 'DIVIDE', 13787460, 'https://bestsellingalbums.org/album/12876'), ('SOUNDTRACK', 'FROZEN', 12632083, 'https://bestsellingalbums.org/album/42961'), ('KATY PERRY', 'TEENAGE DREAM', 12134000, 'https://bestsellingalbums.org/album/23977'), ('ED SHEERAN', 'X', 11879785, 'https://bestsellingalbums.org/album/12880'), ('BRUNO MARS', 'DOO-WOPS & HOOLIGANS', 11270000, 'https://bestsellingalbums.org/album/6777'), ('EMINEM', 'RECOVERY', 10873795, 'https://bestsellingalbums.org/album/13756'), ('IMAGINE DRAGONS', 'NIGHT VISIONS', 9616263, 'https://bestsellingalbums.org/album/19810'), ('SAM SMITH',

In [17]:
## Load the row tuples into a DataFrame, then label its four columns
column_names = ["artist", "title", "sales", "more_info"]
df = pd.DataFrame(album_data)
df.columns = column_names

## Notice that you don't have to run multiple for loops, because everything you want can be found through the same temporary loop variable

In [None]:
## All in one: a single pass over the cards fills every list
artists_list, albums_list, albums_url_list, sales_list = [], [], [], []

for card in all_albums:
    ## artist name
    artist_div = card.find("div", class_="artist")
    artists_list.append(artist_div.get_text())
    ## album title
    title_div = card.find("div", class_="album")
    albums_list.append(title_div.get_text())
    ## album link: href of the first anchor found in the card
    albums_url_list.append(card.find("a").get("href"))
    ## sales: strip the "Sales: " label and the commas, then convert to an integer
    sales_text = card.find("div", class_="sales").get_text()
    sales_list.append(int(sales_text.replace("Sales: ", "").replace(",", "")))

In [None]:
## Zip the parallel lists into row tuples, then load them into a DataFrame
album_data = list(zip(artists_list, albums_list, sales_list, albums_url_list))

df = pd.DataFrame(album_data)
df.columns = ["artist", "title", "sales", "more_info"]
## last expression of the cell, so the notebook renders the DataFrame
df

## iterate to capture the first 500

In [None]:

## Scrape every results page and keep one DataFrame per page in all_dfs.
BASE_URL = "https://bestsellingalbums.org/decade/2010" ## page 1; later pages are "<BASE_URL>-<page number>"
LAST_PAGE = 10 ## page 1 lists 50 albums, so 10 pages should give the top 500
COLUMNS = ["artist", "title", "sales", "more_info"]

all_dfs = [] ## hold all dfs
url = BASE_URL

count = 1 ## number of the page being scraped
while count <= LAST_PAGE:
    print(f"Scraping {url}")
    ## get response; the timeout keeps a stalled connection from hanging the cell forever
    response = requests.get(url, timeout=30)
    print(response)
    ## stop on an HTTP error status instead of silently parsing an error page into zero rows
    response.raise_for_status()
    ## turn response into soup (navigable html from string)
    soup = BeautifulSoup(response.text, "html.parser")
    print("converted to soup")
    ## grab ALL albums data and store in variable
    all_albums = soup.find_all("div", class_="album_card")
    print("got all album data")

    ## build one (artist, title, sales, link) tuple per album card
    album_data = []
    for target in all_albums:
        ## the album div holds both the title text and the link to the album page
        album_div = target.find("div", class_="album")
        ## sales text looks like "Sales: 30,000,000": remove the label and commas, then convert to int
        sales = target.find("div", class_="sales").get_text()
        sales = int(sales.replace("Sales: ","").replace(",",""))
        album_data.append((
            target.find("div", class_="artist").get_text(),
            album_div.get_text(),
            sales,
            ## take the anchor inside the album div rather than the first anchor anywhere in the card
            album_div.find("a").get("href"),
        ))

    ## convert to df; passing columns here also keeps an empty page from raising a length-mismatch error
    df = pd.DataFrame(album_data, columns=COLUMNS)
    all_dfs.append(df)
    print("dataframe in list of dataframes")

    ## increment the page counter; only build the next url and pause when there is a next page
    count += 1
    if count <= LAST_PAGE:
        url = f"{BASE_URL}-{count}"
        ## random pause between requests
        snoozer = randrange(5,12)
        print(f"snoozing for {snoozer} seconds before next scrape")
        time.sleep(snoozer)

print("done scraping all links")
        
    
    
    

In [None]:
## Check that the list holds the expected number of DataFrames (the loop above appends one per page)
len(all_dfs)

In [None]:
## Count the rows of a single page's DataFrame (the second one) to verify the number of data points
all_dfs[1].shape[0]

In [None]:
## Stack the per-page DataFrames into a single DataFrame, renumbering the rows from 0
df = pd.concat(objs=all_dfs, ignore_index=True)
df

In [None]:
## Display the combined DataFrame, which should hold all 500 albums
df