In [1]:
# Spotify Charts scraper: downloads the daily US Top 200 chart for a range of dates
import pandas as pd
from time import time, sleep
import requests
from datetime import timedelta, date, datetime
from pprint import PrettyPrinter
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import pickle
# Shared pretty-printer with a 2-space indent (not used by the cells shown below).
pp = PrettyPrinter(indent = 2)


In [2]:
class Scraper():
    """
    Scrapes the daily US chart pages of spotifycharts.com, one request per date.

    Typical use: ``df, errors = Scraper(start_date, end_date).run()``.
    """

    # Page for a given day is `base` + "YYYY-MM-DD".
    base = "https://spotifycharts.com/regional/us/daily/"
    # Oldest date the scraper accepts; earlier start dates are clamped to it.
    earliest_date = date(2017, 1, 1)
    # Seconds to wait for a response before a date is recorded as failed,
    # so that one stalled connection cannot hang a long run.
    timeout = 30

    def __init__(self, start_date = date(2017,1,1), end_date = date(2020, 4, 13)):
        """
        input: start_date: date object, first date to consider (oldest possible is 2017-1-1)
               end_date: date object, end of the range; clamped to today.
                         NOTE: run() scrapes start_date .. end_date - 1 day
                         (end_date itself is not requested).
        """
        self.start_date = max(start_date, self.earliest_date)
        self.end_date = min(end_date, date.today())
        # Count days between the *clamped* dates so run() never walks outside
        # the supported window; an inverted range yields 0 rather than a negative count.
        self.days_count = max((self.end_date - self.start_date).days, 0)

    @staticmethod
    def _clean_artist(raw):
        """
        [Private function]
        Strips surrounding whitespace and the leading "by " byline marker from an artist cell.

        Only the prefix is removed, so names that contain "by " themselves
        (e.g. "Baby Keem") are left intact.
        """
        name = raw.strip()
        if name.startswith("by "):
            name = name[len("by "):].strip()
        return name

    def _get_table(self, d):
        """
        [Private function]
        Gets html page of songs on Top 200 Chart for given date d and outputs html table with relevant data

        input: d = date object
        output: body of table in html form if valid request, else None
                (None is returned for network errors, non-200 responses and
                pages that do not contain the chart table; the reason is printed)
        """
        url = self.base + d.strftime("%Y-%m-%d")
        try:
            rq = requests.get(url, timeout=self.timeout)
        except requests.RequestException as err:
            # Treat connection problems/timeouts like any other failed date
            # instead of aborting the whole run.
            print(f"{d}: {err}")
            return None

        if rq.status_code != 200:
            print(f"{d}: {rq.status_code}")
            return None

        s = BeautifulSoup(rq.text, "html.parser")
        topchart = s.find("table", {"class": "chart-table"})
        if topchart is None:
            # A 200 response without the chart (e.g. an error/placeholder page).
            print(f"{d}: chart table not found")
            return None
        # May itself be None if the table has no <tbody>; run() records that as an error.
        return topchart.find("tbody")

    def _get_data(self, table, d):
        """
        [Private function]
        Scrapes table to create the records of songs on top 200 chart for given date d

        input: table = body of table in html form
               d = date object
        output: list of dicts (keys: rank, artist, title, streams, date), one per table row;
                rank and streams are kept as the text found in the page
        """
        data = []
        for tr in table.find_all("tr"):
            # Artist and title live in the same cell, so look it up once.
            track = tr.find("td", {"class": "chart-table-track"})
            song_info = dict()
            song_info["rank"] = tr.find("td", {"class": "chart-table-position"}).text
            song_info["artist"] = self._clean_artist(track.find("span").text)
            song_info["title"] = track.find("strong").text
            song_info["streams"] = tr.find("td", {"class": "chart-table-streams"}).text
            song_info["date"] = d
            data.append(song_info)
        return data

    def run(self):
        """
        [Public function]
        Iterates through the days from start_date up to (but not including) end_date
        and scrapes top 200 chart for each date. Prints the elapsed time when done.

        output: dataframe of songs in top 200 charts from start_date to end_date
                list of dates where data is not retrievable
        """
        start_time = datetime.now()
        errors = []
        data = []
        for d in tqdm([self.start_date + timedelta(n) for n in range(self.days_count)]):
            table = self._get_table(d)
            if table is None:
                errors.append(d)
            else:
                data.extend(self._get_data(table, d))

        df = pd.DataFrame(data, columns =  ["rank", "date", "artist", "title", "streams"])
        end_time = datetime.now()
        print(f"Finished in {str(end_time - start_time)}")
        return df, errors
            
        

In [6]:
# Build a scraper with the default date range (start 2017-01-01, end 2020-04-13).
s = Scraper()

In [7]:
# Scrape every date in the range: `df` holds the chart rows, `e` the dates that
# could not be retrieved. Long-running cell (the recorded run took ~21 minutes).
df, e = s.run()

HBox(children=(FloatProgress(value=0.0, max=1198.0), HTML(value='')))

2017-05-30: 404
2017-05-31: 404
2017-06-02: 404

Finished in 0:20:47.482751


In [8]:
# Preview the first rows of the scraped chart data.
df.head()

Unnamed: 0,rank,date,artist,title,streams
0,1,2017-01-01,Migos,Bad and Boujee (feat. Lil Uzi Vert),1371493
1,2,2017-01-01,Drake,Fake Love,1180074
2,3,2017-01-01,The Weeknd,Starboy,1064351
3,4,2017-01-01,The Chainsmokers,Closer,1010492
4,5,2017-01-01,Rae Sremmurd,Black Beatles,874289


In [9]:
e # dates for which the chart could not be retrieved (no rows for them in df)

[datetime.date(2017, 5, 30),
 datetime.date(2017, 5, 31),
 datetime.date(2017, 6, 2)]

In [11]:
# Replace the index with a fresh 0..n-1 range, discarding the old index.
# The frame returned by run() is built from a list of records, so it already
# has a default index and this acts only as a safeguard.
df = df.reset_index(drop = True)

In [12]:
# Re-check the first rows after resetting the index.
df.head()

Unnamed: 0,rank,date,artist,title,streams
0,1,2017-01-01,Migos,Bad and Boujee (feat. Lil Uzi Vert),1371493
1,2,2017-01-01,Drake,Fake Love,1180074
2,3,2017-01-01,The Weeknd,Starboy,1064351
3,4,2017-01-01,The Chainsmokers,Closer,1010492
4,5,2017-01-01,Rae Sremmurd,Black Beatles,874289


In [13]:
# Write the scraped rows to CSV without the index column.
# NOTE(review): "incomplete" presumably refers to the dates in `e` that are
# missing from the data — confirm.
df.to_csv("TopCharts_incomplete.csv", index = False)