In [32]:
# creating spotify charts scraper
import pandas as pd
from time import time, sleep
import requests
from datetime import timedelta, date, datetime
from pprint import PrettyPrinter
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import pickle
# Module-level pretty-printer configured with a 2-space indent.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
pp = PrettyPrinter(indent = 2)


In [2]:
class Scraper():
    """Scrape the daily US chart pages from spotifycharts.com into a DataFrame.

    One page is requested per calendar day in the half-open range
    ``[start_date, end_date)``: ``end_date`` itself is not fetched.

    Attributes:
        start_date: First day to fetch (inclusive).
        end_date: Day at which fetching stops (exclusive).
        days_count: Number of days that ``run`` will request.
    """

    base = "https://spotifycharts.com/regional/us/daily/"
    # Seconds to wait for a single response before treating the day as failed.
    timeout = 30
    # Column order of every DataFrame produced by this class.
    _columns = ["rank", "artist", "title", "streams", "date"]

    def __init__(self, start_date = date(2017,1,1), end_date = date(2020, 4, 13)):
        self.start_date = start_date
        self.end_date = end_date
        self.days_count = (end_date - start_date).days

    def _get_table(self, d):
        """Fetch the chart page for day ``d`` and return its ``<tbody>`` element.

        Returns ``None`` (after printing a one-line diagnostic) when the request
        fails, the status code is not 200, or the page has no chart table, so
        that ``run`` can record the day as an error instead of aborting.
        """
        url = self.base + d.strftime("%Y-%m-%d")
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            rq = requests.get(url, timeout=self.timeout)
        except requests.RequestException as exc:
            print(f"{d}: {exc}")
            return None

        if rq.status_code != 200:
            print(f"{d}: {rq.status_code}")
            return None

        s = BeautifulSoup(rq.text, "html.parser")
        topchart = s.find("table", {"class": "chart-table"})
        body = topchart.find("tbody") if topchart is not None else None
        if body is None:
            # A 200 response can still lack the table (e.g. a placeholder page).
            print(f"{d}: chart table not found")
            return None
        return body

    def _get_data(self, table, d):
        """Convert one chart ``<tbody>`` into a DataFrame with one row per track.

        Args:
            table: Parsed ``<tbody>`` element whose ``<tr>`` rows hold the chart.
            d: Date the chart belongs to; stored unchanged in the ``date`` column.

        Returns:
            DataFrame with columns rank, artist, title, streams, date. The cell
            texts are kept as strings exactly as they appear on the page.
        """
        dt = {name: [] for name in self._columns}
        for tr in table.find_all("tr"):
            track = tr.find("td", {"class": "chart-table-track"})

            # The artist span reads "by <artist>". Remove only that leading
            # prefix: a blanket replace("by ", "") also mangles names that
            # contain "by " themselves (e.g. "Lil Baby & Gunna").
            artist = track.find("span").text.strip()
            if artist.startswith("by "):
                artist = artist[3:].strip()

            dt["rank"].append(tr.find("td", {"class": "chart-table-position"}).text)
            dt["artist"].append(artist)
            dt["title"].append(track.find("strong").text)
            dt["streams"].append(tr.find("td", {"class": "chart-table-streams"}).text)
            dt["date"].append(d)
        return pd.DataFrame(dt)

    def run(self):
        """Scrape every day in the configured range.

        Returns:
            A ``(df, errors)`` tuple. ``df`` holds all scraped rows; its index
            restarts at 0 for each day, so reset it before label-based access.
            ``errors`` lists the dates for which no chart could be retrieved.
        """
        start_time = datetime.now()
        errors = []
        frames = []
        days = [self.start_date + timedelta(n) for n in range(self.days_count)]
        for d in tqdm(days):
            table = self._get_table(d)
            if table is None:
                errors.append(d)
            else:
                frames.append(self._get_data(table, d))

        # Concatenate once at the end; concatenating inside the loop copies the
        # accumulated frame on every iteration (quadratic in the number of days).
        if frames:
            df = pd.concat(frames)
        else:
            df = pd.DataFrame(columns=self._columns)

        end_time = datetime.now()
        print(f"Finished in {str(end_time - start_time)}")
        return df, errors
            
        

In [3]:
# Scraper over the default range: 2017-01-01 up to, but not including, 2020-04-13.
s = Scraper()

In [6]:
# Run the full scrape: `df` receives the chart rows, `e` the dates that could not be fetched.
# The recorded run below took roughly half an hour.
df, e = s.run()

HBox(children=(FloatProgress(value=0.0, max=1198.0), HTML(value='')))

2017-05-30: 404
2017-05-31: 404
2017-06-02: 404

Finished in 0:29:25.179703


In [13]:
df.head()

Unnamed: 0,rank,artist,title,streams,date
0,1,Migos,Bad and Boujee (feat. Lil Uzi Vert),1371493,2017-01-01
1,2,Drake,Fake Love,1180074,2017-01-01
2,3,The Weeknd,Starboy,1064351,2017-01-01
3,4,The Chainsmokers,Closer,1010492,2017-01-01
4,5,Rae Sremmurd,Black Beatles,874289,2017-01-01


In [7]:
# Number of distinct track titles (a missing title, if any, counts as one value).
df["title"].nunique(dropna=False)

4934

In [8]:
e

[datetime.date(2017, 5, 30),
 datetime.date(2017, 5, 31),
 datetime.date(2017, 6, 2)]

In [22]:
# Replace the per-day repeating index with a unique 0..n-1 index.
# reset_index is used because set_axis(..., inplace=True) was removed in pandas 2.0.
df = df.reset_index(drop=True)

In [26]:
df.head()

Unnamed: 0,rank,artist,title,streams,date
0,1,Migos,Bad and Boujee (feat. Lil Uzi Vert),1371493,2017-01-01
1,2,Drake,Fake Love,1180074,2017-01-01
2,3,The Weeknd,Starboy,1064351,2017-01-01
3,4,The Chainsmokers,Closer,1010492,2017-01-01
4,5,Rae Sremmurd,Black Beatles,874289,2017-01-01


In [27]:
# Persist the scraped chart rows; the index is omitted from the file.
df.to_csv("TopCharts.csv", index = False)

In [29]:
def create_unique(df):
    """Map each artist to the set of distinct titles listed for them.

    Args:
        df: DataFrame with ``artist`` and ``title`` columns. Its index is not
            used, so duplicate or non-sequential indexes are fine.

    Returns:
        dict mapping ``str(artist)`` to a set of ``str(title)`` values.
    """
    d = dict()
    # Iterate the two columns positionally. Label lookups via df.loc[i, ...]
    # are slow and return a Series (not a scalar) when index labels repeat.
    for a, t in zip(df["artist"], df["title"]):
        d.setdefault(str(a), set()).add(str(t))
    return d

In [30]:
# Build the artist -> set-of-titles lookup from the scraped chart rows.
d = create_unique(df)

HBox(children=(FloatProgress(value=0.0, max=239000.0), HTML(value='')))




In [34]:
# Serialize the artist -> titles mapping to disk in binary pickle format.
# Only unpickle this file from a trusted location: loading a pickle can execute code.
with open("unique.data", "wb") as out_file:
    pickle.dump(d, out_file)