# Spotify Charts Scraper

In [8]:
import requests
from bs4 import BeautifulSoup as bs

---

## Function Definitions

In [None]:
import sys

def line_refresher(printed_string, variables):
    """Build a callable that rewrites a single status line on stdout.

    The template is formatted in two passes.  ``variables`` fills
    ``printed_string`` immediately; any value that is itself the text
    ``'{}'`` survives as a placeholder and is filled in later, each time
    the returned function is called.

    Args:
        printed_string: ``str.format`` template for the status line.
        variables: Positional values for the first formatting pass.

    Returns:
        A function taking the remaining positional values; it writes a
        carriage return followed by the finished line and flushes stdout.
    """
    template = printed_string.format(*variables)

    def line_refresher_function(*variable):
        rendered = template.format(*variable)
        # The leading "\r" moves the cursor back to the start of the line,
        # so the new text overwrites the previous status.
        sys.stdout.write("\r" + rendered)
        sys.stdout.flush()

    return line_refresher_function

# b = "the man"
# a = line_refresher("Is '{}' working for {}?", ['{}', b])
# a("Elliott")
# sleep(2)
# a("Frankie")
# sleep(2)
# a("the Man")

In [None]:
from math import log10, floor

def sigfigs(number, sf):
    """Round ``number`` to ``sf`` significant figures, with a floor of 1.

    Args:
        number: The value to round.
        sf: How many significant figures to keep.

    Returns:
        The rounded value, or 1 whenever the rounded value would be
        smaller than 1 (this includes zero and negative inputs).
    """
    if number == 0:
        # Zero has no order of magnitude (log10(0) is undefined).  It rounds
        # to zero, which the floor of 1 below would replace anyway.
        return 1
    # Position of the leading digit: 0 for 1-9, 1 for 10-99, -1 for 0.1-0.9.
    magnitude = int(floor(log10(abs(number))))
    return max(round(number, sf - magnitude - 1), 1)

In [None]:
def diff_timer(seconds, reset=False):
    """Placeholder for an elapsed-time check; not implemented yet.

    Planned behaviour: check whether ``seconds`` have passed since the
    last check and, if they have or ``reset`` is true, reset the timer.
    For now the function ignores its arguments and returns ``None``.
    """
    return None

In [2]:
def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
import multiprocessing as mp
import os

log_folder = "data/logs"

def mp_download_files(chart_type, country_list, date_list, queue, name):
    """Download daily chart CSVs for a group of countries, reporting progress.

    One request is made per (country, date) pair.  A failed request does not
    stop the run: the exception text is written to a log file in
    ``log_folder`` and the error counter is incremented.

    Args:
        chart_type: Chart name used in the URL and in output paths
            (the notebook passes ``'regional'``).
        country_list: Sequence of ``(country_abbrev, country_name)`` pairs.
        date_list: Sequence of date strings as used in the download URL.
        queue: Object with a ``put`` method; receives progress tuples
            ``(name, total_count, current_count, error_count)``.
        name: Identifier echoed back as the first item of every progress tuple.

    Progress is put on the queue once at the start, again every 50 files,
    and once at the end (where ``current_count == total_count``).
    """
    total_count = len(country_list) * len(date_list)
    current_count = 0
    error_count = 0
    queue.put( (name, total_count, current_count, error_count) )

    output_base = os.path.join("data", chart_type)

    # The error handler below writes into this folder; create it up front so
    # that logging a failure can never raise and end the whole run.
    os.makedirs(log_folder, exist_ok=True)

    for country_abbrev, country_name in country_list:

        # makedirs also creates "data/<chart_type>" when it is missing, which
        # a plain os.mkdir would not.
        # NOTE(review): the CSVs below are written to "data/<chart_type>/",
        # not into this per-country folder - confirm which layout is intended.
        output_folder = os.path.join(output_base, country_abbrev)
        os.makedirs(output_folder, exist_ok=True)

        for date in date_list:

            # Add updated information to the queue every 50 files
            if current_count % 50 == 0:
                queue.put( (name, total_count, current_count, error_count) )

            # Get the data
            try:
                # The timeout stops one stalled connection from hanging the run.
                r_temp = requests.get("https://spotifycharts.com/{}/{}/daily/{}/download".format(
                    chart_type, country_abbrev, date), timeout=30)
                # Raise on 4xx/5xx so the log records the actual HTTP error.
                r_temp.raise_for_status()

                output_text = "# {} chart for {} on {}\n#".format(chart_type, country_name, date) + r_temp.text

                with open("data/{}/{}_{}_{}.csv".format(chart_type, chart_type, country_abbrev, date), "wb") as f:
                    f.write(output_text.encode('utf-8'))

            except Exception as e:
                # Deliberately broad: any failure for one file is logged and
                # counted so the remaining files are still attempted.
                log_text = "EXCEPTION: {}".format(e)
                log_name = os.path.join(
                    log_folder, "{}_{}_{}.txt".format(chart_type, country_abbrev, date))
                with open(log_name, "w") as log:
                    log.write(log_text)
                error_count += 1

            current_count += 1

    queue.put( (name, total_count, current_count, error_count) )

---

## Regional Daily Top 200 Songs

#### Get a list of all possible countries and dates to scrape

In [9]:
# Get a list of all possible countries and dates to scrape
r_regional = requests.get("https://spotifycharts.com/regional/", timeout=30)
# Stop here on an HTTP error; otherwise an error page would be parsed and the
# failure would only surface later, when the expected <div> elements are missing.
r_regional.raise_for_status()
soup = bs(r_regional.text, "html.parser")

#### Parse the country and date lists and save them to file

In [10]:
import json

# Extract the available countries from the page as (code, display name) pairs
country_div = soup.find("div", attrs={"class":"responsive-select","data-type":"country"})
country_list = [(li.attrs['data-value'], li.text) for li in country_div.find_all("li")]
# "with" closes the file, so the JSON is fully flushed to disk before the cell ends
with open("data/regional_countries.txt", "w") as countries_file:
    json.dump(country_list, countries_file)

# Extract the available dates from the page
date_div = soup.find("div", attrs={"class":"responsive-select", "data-type":"date"})
date_list = [li.attrs['data-value'] for li in date_div.find_all("li")]
with open("data/regional_dates.txt", "w") as dates_file:
    json.dump(date_list, dates_file)

#### Run parallel processes to download the data

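The worker function must be imported from a separate module (`download_workers`) rather than defined in this notebook, because of the way `multiprocessing` interacts with the interactive environment of Jupyter notebooks: child processes may be unable to locate a function that exists only in the interactive session.
For more details, see: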

https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560

https://stackoverflow.com/questions/20222534/python-multiprocessing-on-windows-if-name-main

In [11]:
import time
from IPython.display import display, clear_output
import multiprocessing as mp
import download_workers

queue = mp.Queue()

# Split the countries into groups of six; each group gets its own worker process.
country_chunks = list(chunks(country_list, 6))

# Each worker's `name` argument is its index in `processes`, so the progress
# tuples it sends can be routed back to the matching slot in the lists below.
processes = [
    mp.Process(target=download_workers.mp_download_files,
               args=('regional', country_chunk, date_list, queue, i))
    for i, country_chunk in enumerate(country_chunks)
]

print("Processes Created.")

active_processes = [True] * len(processes)
process_text = ["process_{}: starting...".format(i) for i in range(len(processes))]

print("Activating processes...")

for p in processes:
    p.start()

for text in process_text:
    display(text)

while any(active_processes):
    text_changed = False

    # Note which still-active workers have already exited BEFORE draining the
    # queue.  A worker flushes the items it has put on the queue before it
    # terminates, so anything such a worker sent is read by the drain below.
    exited = [i for i, p in enumerate(processes)
              if active_processes[i] and not p.is_alive()]

    while not queue.empty():
        text_changed = True
        (p_name, total_count, current_count, error_count) = queue.get()
        if current_count == total_count:
            # Final message from this worker: it is done, so reap it.
            processes[p_name].join()
            active_processes[p_name] = False
        process_text[p_name] = "process_{}: {} / {} files downloaded ({} errors).".format(
            p_name, current_count, total_count, error_count)

    # A worker that exited without sending its final message has crashed.
    # Mark it inactive so this loop cannot wait for it forever.
    for i in exited:
        if active_processes[i]:
            active_processes[i] = False
            process_text[i] = "process_{}: stopped unexpectedly (exit code {}).".format(
                i, processes[i].exitcode)
            text_changed = True

    if text_changed:
        clear_output(True)
        for text in process_text:
            display(text)

    time.sleep(1)

print("\nDownload complete!")






'process_0: 6912 / 6912 files downloaded (1152 errors).'

'process_1: 6912 / 6912 files downloaded (654 errors).'

'process_2: 6912 / 6912 files downloaded (956 errors).'

'process_3: 6912 / 6912 files downloaded (245 errors).'

'process_4: 6912 / 6912 files downloaded (0 errors).'

'process_5: 6912 / 6912 files downloaded (1217 errors).'

'process_6: 6912 / 6912 files downloaded (2275 errors).'

'process_7: 6912 / 6912 files downloaded (1604 errors).'

'process_8: 6912 / 6912 files downloaded (0 errors).'

'process_9: 6912 / 6912 files downloaded (437 errors).'

'process_10: 6912 / 6912 files downloaded (1180 errors).'


Download complete!


In [10]:
# Stop any worker that is still running, then join every worker so that the
# finished child processes are reaped instead of lingering as zombies.
for p in processes:
    if p.is_alive():
        p.terminate()
    p.join()

In [None]:
# # Downloading files through parallel processes
# import time
# from IPython.display import display, clear_output

# output_queue = mp.Queue()

# print("Creating processes...")

# country_chunks = list(chunks(country_list, 3))

# processes = [mp.Process(target=mp_download_files, args=('regional', country_chunks[i], date_list, output_queue, i)) for \
#                 i in range(len(country_chunks))]

# print("Processes created.")

# active_processes = [True for i in range(len(processes))]

# process_text = ["process_{}: starting...".format(i) for i in range(len(processes))]

# for text in process_text:
#     display(text)
# # display("\n".join(process_text))

# print("Activating processes...")

# for p in processes:
#     p.start()

# while any(active_processes):
#     text_changed = False
#     while not output_queue.empty():
#         text_changed = True
#         (p_name, total_count, current_count, error_count) = output.get()
#         if current_count == total_count:
#             processes[p_name].join()
#             active_processes[p_name] = False
#             process_text[p_name] = "process_{}: {} / {} files downloaded ({} errors).".format(
#                 p_name, current_count, total_count, error_count)
    
#     if text_changed:
#         clear_output(True)
#         display("\n".join(process_text))
        
# print("\nDownload complete!")
        
        
        


In [None]:
from timeit import default_timer as dtimer

# Serial download of every regional chart: one request per (country, date) pair.
total_files = len(date_list) * len(country_list)
# Refresh the progress line about every 1% of the files.  sigfigs() is skipped
# when there is nothing to download, since zero has no order of magnitude.
base_increment = sigfigs(total_files/100, 1) if total_files else 1

update_line = line_refresher("{} / {} files downloaded...", ['{}', total_files])

file_count = 0
next_file_marker = base_increment
last_refresh_time = dtimer()
update_line(file_count)
start_time = last_refresh_time

for country_pair in country_list:
    country_abbrev = country_pair[0]
    country_name = country_pair[1]

    for date in date_list:

        # After the first ten files, extrapolate the total run time (seconds).
        if file_count == 10:
            print("Predicted time: ", total_files/10 * (dtimer() - start_time))

        # Redraw the progress line at each file marker, or at least every 30 seconds.
        if file_count >= next_file_marker or dtimer() - 30 > last_refresh_time:
            last_refresh_time = dtimer()
            if file_count >= next_file_marker:
                next_file_marker += base_increment
            update_line(file_count)

        try:
            r_temp = requests.get(
                "https://spotifycharts.com/regional/{}/daily/{}/download".format(country_abbrev, date),
                timeout=30)
        except requests.RequestException as e:
            # A dropped connection or timeout affects this one file only;
            # report it and carry on with the remaining downloads.
            print("ERROR: Request raised {!r} - {} ({}), {}".format(
                e, country_name, country_abbrev, date))
        else:
            if r_temp:
                with open("data/top-200_{}_{}.csv".format(country_abbrev, date), "wb") as f:
                    f.write("# Top 200 list for {} on {}\n#".format(country_name, date).encode('utf-8'))
                    f.write(r_temp.text.encode('utf-8'))
            else:
                print("ERROR: Request failed with code '{}' - {} ({}), {}".format(
                    r_temp.status_code, country_name, country_abbrev, date))

        file_count += 1
print("\nComplete.")

---

## Regional Daily Viral 50 Songs

In [None]:
# Get a list of all possible countries and dates to scrape
r_regional = requests.get("https://spotifycharts.com/viral/", timeout=30)
# Stop here on an HTTP error; otherwise an error page would be parsed and the
# failure would only surface later, when the expected <div> elements are missing.
r_regional.raise_for_status()
soup = bs(r_regional.text, "html.parser")

In [None]:
# Country drop-down of the viral charts page -> list of (code, display name) pairs.
v_country_div = soup.find(
    "div",
    attrs={"class": "responsive-select", "data-type": "country"},
)
v_country_list = [
    (entry.attrs['data-value'], entry.text)
    for entry in v_country_div.find_all("li")
]

In [None]:
# Date drop-down of the viral charts page -> list of date strings.
v_date_div = soup.find(
    "div",
    attrs={"class": "responsive-select", "data-type": "date"},
)
v_date_list = [entry.attrs['data-value'] for entry in v_date_div.find_all("li")]
v_sorted_date_list = sorted(v_date_list)

# Report how much history is available: the number of daily charts expressed
# in years, then the earliest and latest dates.
print("Time span of charts (years):", len(v_date_list)/365)
print("From {} to {}".format(v_sorted_date_list[0], v_sorted_date_list[-1]))

In [None]:
from timeit import default_timer as dtimer

# Serial download of every viral chart: one request per (country, date) pair.
v_total_files = len(v_date_list) * len(v_country_list)
# Refresh the progress line about every 1% of the files.  sigfigs() is skipped
# when there is nothing to download, since zero has no order of magnitude.
base_increment = sigfigs(v_total_files/100, 1) if v_total_files else 1

update_line = line_refresher("{} / {} files downloaded...", ['{}', v_total_files])

file_count = 0
next_file_marker = base_increment
last_refresh_time = dtimer()
update_line(file_count)

for country_pair in v_country_list:
    country_abbrev = country_pair[0]
    country_name = country_pair[1]

    for date in v_date_list:

        # Redraw the progress line at each file marker, or at least every 30 seconds.
        if file_count >= next_file_marker or dtimer() - 30 > last_refresh_time:
            last_refresh_time = dtimer()
            if file_count >= next_file_marker:
                next_file_marker += base_increment
            update_line(file_count)

        try:
            r_temp = requests.get(
                "https://spotifycharts.com/viral/{}/daily/{}/download".format(country_abbrev, date),
                timeout=30)
        except requests.RequestException as e:
            # A dropped connection or timeout affects this one file only;
            # report it and carry on with the remaining downloads.
            print("ERROR: Request raised {!r} - {} ({}), {}".format(
                e, country_name, country_abbrev, date))
        else:
            if r_temp:
                with open("data/viral-50_{}_{}.csv".format(country_abbrev, date), "wb") as f:
                    # Header names the chart actually stored in this file (Viral 50, not Top 200).
                    f.write("# Viral 50 list for {} on {}\n#".format(country_name, date).encode('utf-8'))
                    f.write(r_temp.text.encode('utf-8'))
            else:
                print("ERROR: Request failed with code '{}' - {} ({}), {}".format(
                    r_temp.status_code, country_name, country_abbrev, date))

        file_count += 1
print("\nComplete.")

---

## Importing into a DataFrame

In [None]:
import pandas as pd
# Load one downloaded chart as a smoke test.  The download loop writes these
# files as UTF-8, so they are decoded as UTF-8 here; latin-1 would garble any
# non-ASCII characters.  comment='#' skips the header lines added on download.
# NOTE(review): comment='#' presumably also cuts off any data line that
# contains a '#' (e.g. in a track title) - confirm against the pandas docs.
test_df = pd.read_csv("data/top-200_global_2020-02-29.csv", comment='#', encoding="utf-8")
test_df.head()

In [2]:
import os
import pandas
# Display the file names currently present in the data/logs folder.
os.listdir("data/logs")

['regional_ad_2017-01-01.txt',
 'regional_ad_2017-01-02.txt',
 'regional_ad_2017-01-03.txt',
 'regional_ad_2017-01-04.txt',
 'regional_ad_2017-01-05.txt',
 'regional_ad_2017-01-06.txt',
 'regional_ad_2017-01-07.txt',
 'regional_ad_2017-01-08.txt',
 'regional_ad_2017-01-09.txt',
 'regional_ad_2017-01-10.txt',
 'regional_ad_2017-01-11.txt',
 'regional_ad_2017-01-12.txt',
 'regional_ad_2017-01-13.txt',
 'regional_ad_2017-01-14.txt',
 'regional_ad_2017-01-15.txt',
 'regional_ad_2017-01-16.txt',
 'regional_ad_2017-01-17.txt',
 'regional_ad_2017-01-18.txt',
 'regional_ad_2017-01-19.txt',
 'regional_ad_2017-01-20.txt',
 'regional_ad_2017-01-21.txt',
 'regional_ad_2017-01-22.txt',
 'regional_ad_2017-01-23.txt',
 'regional_ad_2017-01-24.txt',
 'regional_ad_2017-01-25.txt',
 'regional_ad_2017-01-26.txt',
 'regional_ad_2017-01-27.txt',
 'regional_ad_2017-01-28.txt',
 'regional_ad_2017-01-29.txt',
 'regional_ad_2017-01-30.txt',
 'regional_ad_2017-01-31.txt',
 'regional_ad_2017-02-01.txt',
 'region

In [3]:
# Same listing wrapped in a DataFrame so the notebook renders it as a table.
pandas.DataFrame(os.listdir("data/logs"))

Unnamed: 0,0
0,regional_ad_2017-01-01.txt
1,regional_ad_2017-01-02.txt
2,regional_ad_2017-01-03.txt
3,regional_ad_2017-01-04.txt
4,regional_ad_2017-01-05.txt
...,...
9718,regional_za_2018-06-24.txt
9719,regional_za_2018-07-08.txt
9720,regional_za_2018-07-15.txt
9721,regional_za_2018-07-22.txt


In [4]:
import re
def find_country(text):
    """Return the country code embedded in a regional log file name.

    Names are expected to look like ``regional_<code>_<date>.txt``.  The code
    is the run of letters/digits between the first two underscores, so both
    two-letter codes (``"ad"``) and longer ones (``"global"``) are accepted.

    Args:
        text: The file name to inspect.

    Returns:
        The country code as a string.

    Raises:
        IndexError: If ``text`` does not start with ``regional_<code>_``.
            The exception type is the one the previous ``findall(...)[0]``
            implementation raised for a non-matching name.
    """
    # [^\W_] is "a word character other than underscore", so the code stops
    # at the underscore that separates it from the date.
    match = re.match(r"regional_([^\W_]+)_", text)
    if match is None:
        raise IndexError("no country code found in {!r}".format(text))
    return match.group(1)

In [6]:
# Collect the distinct country codes that occur in the log file names.
country_set = {find_country(log_name) for log_name in os.listdir("data/logs")}

In [7]:
# Display the country codes that appear in at least one log file name.
country_set

{'ad',
 'bg',
 'cy',
 'ee',
 'il',
 'in',
 'lt',
 'lu',
 'lv',
 'mc',
 'mt',
 'my',
 'ni',
 'ro',
 'sk',
 'th',
 'vn',
 'za'}

In [11]:
# Display the (code, name) pairs scraped from the regional charts page.
country_list

[('global', 'Global'),
 ('us', 'United States'),
 ('gb', 'United Kingdom'),
 ('ad', 'Andorra'),
 ('ar', 'Argentina'),
 ('at', 'Austria'),
 ('au', 'Australia'),
 ('be', 'Belgium'),
 ('bg', 'Bulgaria'),
 ('bo', 'Bolivia'),
 ('br', 'Brazil'),
 ('ca', 'Canada'),
 ('ch', 'Switzerland'),
 ('cl', 'Chile'),
 ('co', 'Colombia'),
 ('cr', 'Costa Rica'),
 ('cy', 'Cyprus'),
 ('cz', 'Czech Republic'),
 ('de', 'Germany'),
 ('dk', 'Denmark'),
 ('do', 'Dominican Republic'),
 ('ec', 'Ecuador'),
 ('ee', 'Estonia'),
 ('es', 'Spain'),
 ('fi', 'Finland'),
 ('fr', 'France'),
 ('gr', 'Greece'),
 ('gt', 'Guatemala'),
 ('hk', 'Hong Kong'),
 ('hn', 'Honduras'),
 ('hu', 'Hungary'),
 ('id', 'Indonesia'),
 ('ie', 'Ireland'),
 ('il', 'Israel'),
 ('in', 'India'),
 ('is', 'Iceland'),
 ('it', 'Italy'),
 ('jp', 'Japan'),
 ('lt', 'Lithuania'),
 ('lu', 'Luxembourg'),
 ('lv', 'Latvia'),
 ('mc', 'Monaco'),
 ('mt', 'Malta'),
 ('mx', 'Mexico'),
 ('my', 'Malaysia'),
 ('ni', 'Nicaragua'),
 ('nl', 'Netherlands'),
 ('no', 'Norway

In [12]:
# Codes of all scraped countries: the first item of every country_list entry.
full_cl = [pair[0] for pair in country_list]

In [13]:
# Countries with no log file, i.e. those whose codes are absent from country_set.
whole_cl = [code for code in full_cl if code not in country_set]

In [15]:
# Only the value of the last expression in a cell is displayed, so this cell
# shows the number of countries in whole_cl, not the list itself.
whole_cl
len(whole_cl)

48

In [19]:
from datetime import timedelta, date

def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Nothing is yielded when ``end_date`` is not after ``start_date``.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 1

# Print every calendar day in the range for which the site lists no chart.
start_date = date(2017, 6, 9)
end_date = date(2020, 3, 4)
# Build the lookup set once; testing membership in the list itself would
# rescan the whole list for every day in the range.
available_dates = set(date_list)
# daterange() stops the day before end_date, so end_date itself is not checked.
for single_date in daterange(start_date, end_date):
    date_string = single_date.strftime("%Y-%m-%d")
    if date_string not in available_dates:
        print(date_string)

In [18]:
# Display the scraped list of chart dates.
date_list

['2020-03-04',
 '2020-03-03',
 '2020-03-02',
 '2020-03-01',
 '2020-02-29',
 '2020-02-28',
 '2020-02-27',
 '2020-02-26',
 '2020-02-25',
 '2020-02-24',
 '2020-02-23',
 '2020-02-22',
 '2020-02-21',
 '2020-02-20',
 '2020-02-19',
 '2020-02-18',
 '2020-02-17',
 '2020-02-16',
 '2020-02-15',
 '2020-02-14',
 '2020-02-13',
 '2020-02-12',
 '2020-02-11',
 '2020-02-10',
 '2020-02-09',
 '2020-02-08',
 '2020-02-07',
 '2020-02-06',
 '2020-02-05',
 '2020-02-04',
 '2020-02-03',
 '2020-02-02',
 '2020-02-01',
 '2020-01-31',
 '2020-01-30',
 '2020-01-29',
 '2020-01-28',
 '2020-01-27',
 '2020-01-26',
 '2020-01-25',
 '2020-01-24',
 '2020-01-23',
 '2020-01-22',
 '2020-01-21',
 '2020-01-20',
 '2020-01-19',
 '2020-01-18',
 '2020-01-17',
 '2020-01-16',
 '2020-01-15',
 '2020-01-14',
 '2020-01-13',
 '2020-01-12',
 '2020-01-11',
 '2020-01-10',
 '2020-01-09',
 '2020-01-08',
 '2020-01-07',
 '2020-01-06',
 '2020-01-05',
 '2020-01-04',
 '2020-01-03',
 '2020-01-02',
 '2020-01-01',
 '2019-12-31',
 '2019-12-30',
 '2019-12-