# Spotify Charts Scraper

#### Run parallel processes to download the data

The processes must be entirely imported from another file due to the way that `multiprocessing` interacts with the Interactive Environment of Jupyter Notebooks.
For more details, see:

https://stackoverflow.com/questions/23641475/multiprocessing-working-in-python-but-not-in-ipython/23641560#23641560

https://stackoverflow.com/questions/20222534/python-multiprocessing-on-windows-if-name-main


In [3]:
import requests
from bs4 import BeautifulSoup as bs

import time
from IPython.display import display, clear_output
import multiprocessing as mp
import mp_workers
import os

from timeit import default_timer as dtimer

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas
import numpy
from ast import literal_eval

In [111]:
import mp_workers

---

## Regional Daily Top 200 Songs

In [7]:
def diff_timer(seconds_since_reset, reset=False):
    global g_diff_time
    
    current_time = dtimer()
    timed_out = False
    
    if (current_time - g_diff_time) > seconds_since_reset:
        reset = True
        timed_out = True
        
    if reset:
        g_diff_time = current_time
    
    return timed_out
        

def check_folder(folder_path):
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)

# Split the main part up into two functions. This will allow me to use functions, but still have the processes as a 
# global variable, which I can then pass to the kill function if I need to.
        
def setup_download_processes(chart_type):
        
    # Make the necessary data folders
    check_folder("data")

    chart_data = os.path.join("data", chart_type)
    check_folder(chart_data)

    # Make necessary logs folders
    log_folder = os.path.join(chart_data, "logs")
    check_folder(log_folder)

    # Get initial page data to scrape possible countries and dates
    r_base = requests.get("https://spotifycharts.com/{}".format(chart_type))
    soup = bs(r_base.text, "html.parser")

    # Get a list of all possible dates
    # NOTE: This works on the assumption that the Global charts exist for all dates
    date_div = soup.find("div", attrs={"class":"responsive-select", "data-type": "date"})
    date_list = [li.attrs['data-value'] for li in date_div.find_all("li")]

    # Get a list of all possible countries
    country_div = soup.find("div", attrs={"class":"responsive-select", "data-type":"country"})
    country_list = [li.attrs['data-value'] for li in country_div.find_all('li')]

    # Split the country_list into equal-sized chunks for parallel processing
    chunk_size = int( len(country_list) / os.cpu_count() )
    country_chunks = [country_list[i : i+chunk_size] for i in range(0, len(country_list), chunk_size)]

    # Create the Queue used for communication during parallel processing
    queue = mp.Queue()

    # Create the processes
    processes = [mp.Process(target=mp_workers.download_chart_files,
                            args=(chart_type, country_chunks[i], date_list, queue, i)) for \
                            i in range(len(country_chunks))]

    print("Processes Created.")
    
    return (processes, queue)

def start_processes(processes, queue):

    # Set up display text and activation check arrays
    active_processes = [True for i in range(len(processes))]
    process_text = ["process_{}: starting...".format(i) for i in range(len(processes))]

    print("Activating processes...")

    # Start the processes
    for i in range(len(processes)):
        processes[i].start()
        print("Activated process {}".format(i))

    # Display initial text
    for text in process_text:
        display(text)

    while any(active_processes):

        text_changed = False

        while not queue.empty():
            text_changed = True

            (p_name, total_count, current_count, error_count) = queue.get()
            if current_count == total_count:
                # The specified process has finished their run. Wait for them to die and set their status to dead
                processes[p_name].join()
                active_processes[p_name] = False

            # Update text information about specified process
            process_text[p_name] = "process_{}: {} / {} files downloaded ({} errors).".format(
                p_name, current_count, total_count, error_count)

        if text_changed:
            clear_output(True)
            for text in process_text:
                display(text)

        time.sleep(1)
        
    queue.close()
    print("\nDownload complete!")
    
def kill_processes(processes, queue):
    for p in processes:
        p.terminate()
    queue.close()
    

## Top 200

In [8]:
processes, queue = setup_download_processes("regional")
start_processes(processes, queue)

'process_0: 9264 / 9264 files downloaded (2 errors).'

'process_1: 9264 / 9264 files downloaded (2 errors).'

'process_2: 9264 / 9264 files downloaded (2 errors).'

'process_3: 9264 / 9264 files downloaded (1 errors).'

'process_4: 9264 / 9264 files downloaded (1 errors).'

'process_5: 9264 / 9264 files downloaded (3 errors).'

'process_6: 9264 / 9264 files downloaded (3 errors).'

'process_7: 9264 / 9264 files downloaded (2 errors).'

'process_8: 2316 / 2316 files downloaded (0 errors).'


Download complete!


## Viral 50

In [9]:
processes, queue = setup_download_processes("viral")
start_processes(processes, queue)

'process_0: 9072 / 9072 files downloaded (1 errors).'

'process_1: 9072 / 9072 files downloaded (2 errors).'

'process_2: 9072 / 9072 files downloaded (3 errors).'

'process_3: 9072 / 9072 files downloaded (1 errors).'

'process_4: 9072 / 9072 files downloaded (2 errors).'

'process_5: 9072 / 9072 files downloaded (1 errors).'

'process_6: 9072 / 9072 files downloaded (2 errors).'

'process_7: 9072 / 9072 files downloaded (2 errors).'

'process_8: 3402 / 3402 files downloaded (0 errors).'


Download complete!


### Kill Remaining Processes

In [5]:
kill_processes(processes, queue)

## Compile Master CSV Files

In [None]:
# Create a CSV for each chart type for each country
# Create a SongInfo.csv file that contains information about each song
# Create an ArtistInfo.csv file that contains information about each artist

---

# Downloading Track Info

In [113]:
with open("data/regional/track_ids") as f:
    data = f.read()
track_ids = data.split()
print(len(track_ids))
print(track_ids[0])

69688
1Y2CGJS28CtJHMVj06vZx8


In [114]:
# Chunk the track_ids
track_id_chunks = [track_ids[i:i+50] for i in range(0, len(track_ids), 50)]

In [11]:
# Create Spotipy Login
cid ="37bf09c4a866456ebea69781382fb291" 
secret = "04a9cd8f2f0e42c3b7072d58932d03a5"

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [32]:
input_data_list = [(i, 'regional', track_id_chunks[i], sp) for i in range(len(track_id_chunks))]

In [50]:
pool = mp.Pool()
pool.map(mp_workers.get_track_info, input_data_list)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [55]:
df = pandas.DataFrame()
track_info_folder = "data/regional/_track_info"
file_list = os.listdir(track_info_folder)
for file in file_list:
    filepath = os.path.join(track_info_folder, file)
    df2 = pandas.read_csv(filepath)
    df = pandas.concat([df, df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [63]:
text_csv = df.to_csv("data/regional/_track_info/master_file.csv", index=False)
# with open("data/regional/_track_info/master_file.csv", "w") as f:
#     f.write(df.to_csv(index=False, encoding="latin-1"))

In [62]:
text_csv[500:601]

'0.084,-6.858,1.0,Freedom,day,0.382,81.95200000000001,4.0,9,0.754\r\n0.159,,7blqqbp3IJHtpsJT5sdtnC,Aşık,'

## Download Artist Info

In [4]:
df = pandas.read_csv("data/regional/_track_info/master_file.csv")

In [5]:
df.loc[ df['additional_artists'].isnull(), 'additional_artists'] = "()"

In [6]:
df['additional_artists']

0                   (('Nemir', '5f6nz3iqzrfiUfKOIKvLvd'),)
1                                                       ()
2                                                       ()
3                                                       ()
4                                                       ()
                               ...                        
69683                                                   ()
69684          (('Anne-Marie', '1zNqDE7qDGCsyzJwohVaoX'),)
69685                                                   ()
69686    (('roselilah', '2ZyAiHtAnET9VY5LsnhUrr'), ('Fr...
69687                                                   ()
Name: additional_artists, Length: 69688, dtype: object

In [70]:
df.columns

Index(['acousticness', 'additional_artists', 'album_id', 'album_name',
       'album_release_date', 'album_tracks', 'artist_id', 'artist_name',
       'available_markets', 'current_popularity', 'danceability', 'duration',
       'energy', 'explicit', 'id', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'name', 'release_date_precision', 'speechiness',
       'tempo', 'time_signature', 'track_number', 'valence'],
      dtype='object')

In [71]:
artist_set = set()
for artist in df['artist_id'].tolist():
    artist_set.add(artist)

In [119]:
artist_data_file = "data/regional/artist_ids"
with open(artist_data_file, "w") as f:
    for artist_id in artist_set:
        text = str(artist_id) + "\n"
        f.write(text)

In [101]:
len(artist_set)

13813

In [1]:
with open("data/regional/artist_ids") as f:
    data = f.read()
    artist_id_list = data.split()

In [7]:
df['additional_artists'] = [literal_eval(x) for x in df['additional_artists']]

In [102]:
for row in df['additional_artists']:
    for artist in row:
        artist_id = artist[1]
        artist_set.add(artist_id)

In [9]:
artist_id_chunks = [artist_id_list[i:i+50] for i in range(0, len(artist_id_list), 50)]

In [12]:
input_data_list = [(i, 'regional', artist_id_chunks[i], sp) for i in range(len(artist_id_chunks))]

In [15]:
pool = mp.Pool()
pool.map(mp_workers.get_artist_info, input_data_list)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [16]:
df = pandas.DataFrame()
artist_info_folder = "data/regional/_artist_info"
file_list = os.listdir(artist_info_folder)
for file in file_list:
    filepath = os.path.join(artist_info_folder, file)
    df2 = pandas.read_csv(filepath)
    df = pandas.concat([df, df2])

In [20]:
df['current_followers'] = [ literal_eval(x)['total'] for x in df['current_followers'] ]

In [27]:
df.to_csv("data/regional/artist_info.csv", index=False)

---

## TESTING

In [9]:
r = requests.get("https://spotifycharts.com/regional/global/daily/2018-08-23/download")

In [10]:
r.reason

'OK'

In [11]:
r.reason == "Not Found"

False

--

In [12]:
text = r.text

In [14]:
tup = text.partition("\n")

In [15]:
print(tup[0])
print(tup[1])

,,,"Note that these figures are generated using a formula that protects against any artificial inflation of chart positions.",




In [17]:
import os
os.cpu_count()

8

In [25]:
CHART_TYPE = "regional"

# Get initial page data to scrape possible countries and dates
r_base = requests.get("https://spotifycharts.com/{}".format(CHART_TYPE))
soup = bs(r_base.text, "html.parser")

# print(soup)

# Get a list of all countries 
country_div = soup.find("div", attrs={"class":"responsive-select","data-type":"country"})
country_list = [li.attrs['data-value'] for li in country_div.find_all("li")]

# def chunks(lst, n):
#     for i in range(0, len(lst), n):
#         yield lst[i : i+n]

print(len(country_list))
print(os.cpu_count())
chunk_size = int(len(country_list)/os.cpu_count())
country_chunks = [country_list[i : i+chunk_size] for i in range(0, len(country_list), chunk_size)]

for c in country_chunks:
    print(c)

66
8
['global', 'us', 'gb', 'ad', 'ar', 'at', 'au', 'be']
['bg', 'bo', 'br', 'ca', 'ch', 'cl', 'co', 'cr']
['cy', 'cz', 'de', 'dk', 'do', 'ec', 'ee', 'es']
['fi', 'fr', 'gr', 'gt', 'hk', 'hn', 'hu', 'id']
['ie', 'il', 'in', 'is', 'it', 'jp', 'lt', 'lu']
['lv', 'mc', 'mt', 'mx', 'my', 'ni', 'nl', 'no']
['nz', 'pa', 'pe', 'ph', 'pl', 'pt', 'py', 'ro']
['se', 'sg', 'sk', 'sv', 'th', 'tr', 'tw', 'uy']
['vn', 'za']


---

## Function Definitions

In [None]:
import sys

def line_refresher(printed_string, variables):
    function_line = printed_string.format(*variables)
    def line_refresher_function(*variable):
        sys.stdout.write("\r" + function_line.format(*variable))
        sys.stdout.flush()
    return line_refresher_function

# b = "the man"
# a = line_refresher("Is '{}' working for {}?", ['{}', b])
# a("Elliott")
# sleep(2)
# a("Frankie")
# sleep(2)
# a("the Man")

In [None]:
from math import log10, floor

def sigfigs(number, sf):
    return max(
        round(number, sf - int(floor(log10(abs(number)))) - 1),
        1)

In [None]:
def diff_timer(seconds, reset=False):
    pass
    # Checks if the number of seconds have passed since the last check. If they have, or reset=True, reset the timer.

In [2]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i+n]

In [None]:
import multiprocessing as mp
import os

log_folder = "data/logs"

def mp_download_files(chart_type, country_list, date_list, queue, name):
    total_count = len(country_list) * len(date_list)
    current_count = 0
    error_count = 0
    queue.put( (name, total_count, current_count, error_count) )
    
    output_base = os.path.join("data", chart_type)
    
    for country_abbrev, country_name in country_list:
        
        output_folder = os.path.join(output_base, country_abbrev)
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)
            
        for date in date_list:
            
            # Add updated information to the queue every ten files
            if file_count % 50 == 0:
                queue.put( (name, total_count, current_count, error_count) )
            
            # Get the data
            try:
                # Attempt to get the data
                r_temp = requests.get("https://spotifycharts.com/{}/{}/daily/{}/download".format(
                    chart_type, country_abbrev, date))
                assert r_temp
                
                output_text = "# {} chart for {} on {}\n#".format(chart_type, country_name, date) + r_temp.text
                
                with open("data/{}/{}_{}_{}.csv".format(chart_type, chart_type, country_abbrev, date), "wb") as f:
                    f.write(output_text.encode('utf-8'))
            
            except Exception as e:
                log_text = "EXCEPTION: {}".format(e)
                log_name = "data/logs/{}_{}_{}.txt".format(chart_type, country_abbrev, date)
                with open(log_name, "w") as log:
                    log.write(log_text)
                error_count += 1
            
            current_count += 1
    
    queue.put( (name, total_count, current_count, error_count) )

---

#### Save the output to file

In [10]:
import json

# Download the list of available countries

json.dump(country_list, open("data/regional_countries.txt", "w"))
          
# Download list of available dates
date_div = soup.find("div", attrs={"class":"responsive-select", "data-type":"date"})
date_list = [li.attrs['data-value'] for li in date_div.find_all("li")]
json.dump(date_list, open("data/regional_dates.txt", "w"))

'process_0: 6912 / 6912 files downloaded (1152 errors).'

'process_1: 6912 / 6912 files downloaded (654 errors).'

'process_2: 6912 / 6912 files downloaded (956 errors).'

'process_3: 6912 / 6912 files downloaded (245 errors).'

'process_4: 6912 / 6912 files downloaded (0 errors).'

'process_5: 6912 / 6912 files downloaded (1217 errors).'

'process_6: 6912 / 6912 files downloaded (2275 errors).'

'process_7: 6912 / 6912 files downloaded (1604 errors).'

'process_8: 6912 / 6912 files downloaded (0 errors).'

'process_9: 6912 / 6912 files downloaded (437 errors).'

'process_10: 6912 / 6912 files downloaded (1180 errors).'


Download complete!


In [None]:
# # Downloading files through parallel processes
# import time
# from IPython.display import display, clear_output

# output_queue = mp.Queue()

# print("Creating processes...")

# country_chunks = list(chunks(country_list, 3))

# processes = [mp.Process(target=mp_download_files, args=('regional', country_chunks[i], date_list, output_queue, i)) for \
#                 i in range(len(country_chunks))]

# print("Processes created.")

# active_processes = [True for i in range(len(processes))]

# process_text = ["process_{}: starting...".format(i) for i in range(len(processes))]

# for text in process_text:
#     display(text)
# # display("\n".join(process_text))

# print("Activating processes...")

# for p in processes:
#     p.start()

# while any(active_processes):
#     text_changed = False
#     while not output_queue.empty():
#         text_changed = True
#         (p_name, total_count, current_count, error_count) = output.get()
#         if current_count == total_count:
#             processes[p_name].join()
#             active_processes[p_name] = False
#             process_text[p_name] = "process_{}: {} / {} files downloaded ({} errors).".format(
#                 p_name, current_count, total_count, error_count)
    
#     if text_changed:
#         clear_output(True)
#         display("\n".join(process_text))
        
# print("\nDownload complete!")
        
        
        


In [None]:
from timeit import default_timer as dtimer

total_files = len(date_list) * len(country_list)
base_increment = sigfigs(total_files/100, 1)

update_line = line_refresher("{} / {} files downloaded...", ['{}', total_files])

file_count = 0
next_file_marker = base_increment
last_refresh_time = dtimer()
update_line(file_count)
start_time = last_refresh_time

for country_pair in country_list:
    for date in date_list:
        
        if file_count == 10:
            print("Predicted time: ", total_files/10 * (dtimer() - start_time))
        
        if file_count >= next_file_marker or dtimer() - 30 > last_refresh_time:
            last_refresh_time = dtimer()
            if file_count >= next_file_marker:
                next_file_marker += base_increment
            update_line(file_count)
        
        country_abbrev = country_pair[0]
        country_name = country_pair[1]
        r_temp = requests.get("https://spotifycharts.com/regional/{}/daily/{}/download".format(country_abbrev, date))
        if r_temp:
            with open("data/top-200_{}_{}.csv".format(country_abbrev, date), "wb") as f:
                f.write("# Top 200 list for {} on {}\n#".format(country_name, date).encode('utf-8'))
                f.write(r_temp.text.encode('utf-8'))
        else:
            print("ERROR: Request failed with code '{}' - {} ({}), {}".format(
                r_temp.status_code, country_name, country_abbrev, date))

        file_count += 1
print("\nComplete.")

---

## Regional Daily Viral 50 Songs

In [None]:
# Get a list of all possible countries and dates to scrape
r_regional = requests.get("https://spotifycharts.com/viral/")
soup = bs(r_regional.text, "html.parser")

In [None]:
v_country_div = soup.find("div", attrs={"class":"responsive-select","data-type":"country"})
v_country_list = [(li.attrs['data-value'], li.text) for li in v_country_div.find_all("li")]
v_date_div = soup.find("div", attrs={"class":"responsive-select", "data-type":"date"})
v_date_list = [li.attrs['data-value'] for li in v_date_div.find_all("li")]
print("Time span of charts (years):", len(v_date_list)/365)
v_sorted_date_list = sorted(v_date_list)
print("From {} to {}".format(v_sorted_date_list[0], v_sorted_date_list[-1]))

In [None]:
v_date_div = soup.find("div", attrs={"class":"responsive-select", "data-type":"date"})
v_date_list = [li.attrs['data-value'] for li in v_date_div.find_all("li")]
print("Time span of charts (years):", len(v_date_list)/365)
v_sorted_date_list = sorted(v_date_list)
print("From {} to {}".format(v_sorted_date_list[0], v_sorted_date_list[-1]))

In [None]:
from timeit import default_timer as dtimer

v_total_files = len(v_date_list) * len(v_country_list)
base_increment = sigfigs(v_total_files/100, 1)

update_line = line_refresher("{} / {} files downloaded...", ['{}', v_total_files])

file_count = 0
next_file_marker = base_increment
last_refresh_time = dtimer()
update_line(file_count)

for country_pair in v_country_list:
    for date in v_date_list:
        
        if file_count >= next_file_marker or dtimer() - 30 > last_refresh_time:
            last_refresh_time = dtimer()
            if file_count >= next_file_marker:
                next_file_marker += base_increment
            update_line(file_count)
        
        country_abbrev = country_pair[0]
        country_name = country_pair[1]
        r_temp = requests.get("https://spotifycharts.com/viral/{}/daily/{}/download".format(country_abbrev, date))
        if r_temp:
            with open("data/viral-50_{}_{}.csv".format(country_abbrev, date), "wb") as f:
                f.write("# Top 200 list for {} on {}\n#".format(country_name, date).encode('utf-8'))
                f.write(r_temp.text.encode('utf-8'))
        else:
            print("ERROR: Request failed with code '{}' - {} ({}), {}".format(
                r_temp.status_code, country_name, country_abbrev, date))

        file_count += 1
print("\nComplete.")

---

## Importing into a DataFrame

In [None]:
import pandas as pd
test_df = pd.read_csv("data/top-200_global_2020-02-29.csv", comment='#', encoding="latin-1")
test_df.head()

In [2]:
import os
import pandas
os.listdir("data/logs")

['regional_ad_2017-01-01.txt',
 'regional_ad_2017-01-02.txt',
 'regional_ad_2017-01-03.txt',
 'regional_ad_2017-01-04.txt',
 'regional_ad_2017-01-05.txt',
 'regional_ad_2017-01-06.txt',
 'regional_ad_2017-01-07.txt',
 'regional_ad_2017-01-08.txt',
 'regional_ad_2017-01-09.txt',
 'regional_ad_2017-01-10.txt',
 'regional_ad_2017-01-11.txt',
 'regional_ad_2017-01-12.txt',
 'regional_ad_2017-01-13.txt',
 'regional_ad_2017-01-14.txt',
 'regional_ad_2017-01-15.txt',
 'regional_ad_2017-01-16.txt',
 'regional_ad_2017-01-17.txt',
 'regional_ad_2017-01-18.txt',
 'regional_ad_2017-01-19.txt',
 'regional_ad_2017-01-20.txt',
 'regional_ad_2017-01-21.txt',
 'regional_ad_2017-01-22.txt',
 'regional_ad_2017-01-23.txt',
 'regional_ad_2017-01-24.txt',
 'regional_ad_2017-01-25.txt',
 'regional_ad_2017-01-26.txt',
 'regional_ad_2017-01-27.txt',
 'regional_ad_2017-01-28.txt',
 'regional_ad_2017-01-29.txt',
 'regional_ad_2017-01-30.txt',
 'regional_ad_2017-01-31.txt',
 'regional_ad_2017-02-01.txt',
 'region

In [3]:
pandas.DataFrame(os.listdir("data/logs"))

Unnamed: 0,0
0,regional_ad_2017-01-01.txt
1,regional_ad_2017-01-02.txt
2,regional_ad_2017-01-03.txt
3,regional_ad_2017-01-04.txt
4,regional_ad_2017-01-05.txt
...,...
9718,regional_za_2018-06-24.txt
9719,regional_za_2018-07-08.txt
9720,regional_za_2018-07-15.txt
9721,regional_za_2018-07-22.txt


In [4]:
import re
def find_country(text):
    return re.findall("^regional_(\w\w)_", text)[0]

In [6]:
country_set = set()
for file in os.listdir("data/logs"):
    country = find_country(file)
    country_set.add(country)

In [7]:
country_set

{'ad',
 'bg',
 'cy',
 'ee',
 'il',
 'in',
 'lt',
 'lu',
 'lv',
 'mc',
 'mt',
 'my',
 'ni',
 'ro',
 'sk',
 'th',
 'vn',
 'za'}

In [11]:
country_list

[('global', 'Global'),
 ('us', 'United States'),
 ('gb', 'United Kingdom'),
 ('ad', 'Andorra'),
 ('ar', 'Argentina'),
 ('at', 'Austria'),
 ('au', 'Australia'),
 ('be', 'Belgium'),
 ('bg', 'Bulgaria'),
 ('bo', 'Bolivia'),
 ('br', 'Brazil'),
 ('ca', 'Canada'),
 ('ch', 'Switzerland'),
 ('cl', 'Chile'),
 ('co', 'Colombia'),
 ('cr', 'Costa Rica'),
 ('cy', 'Cyprus'),
 ('cz', 'Czech Republic'),
 ('de', 'Germany'),
 ('dk', 'Denmark'),
 ('do', 'Dominican Republic'),
 ('ec', 'Ecuador'),
 ('ee', 'Estonia'),
 ('es', 'Spain'),
 ('fi', 'Finland'),
 ('fr', 'France'),
 ('gr', 'Greece'),
 ('gt', 'Guatemala'),
 ('hk', 'Hong Kong'),
 ('hn', 'Honduras'),
 ('hu', 'Hungary'),
 ('id', 'Indonesia'),
 ('ie', 'Ireland'),
 ('il', 'Israel'),
 ('in', 'India'),
 ('is', 'Iceland'),
 ('it', 'Italy'),
 ('jp', 'Japan'),
 ('lt', 'Lithuania'),
 ('lu', 'Luxembourg'),
 ('lv', 'Latvia'),
 ('mc', 'Monaco'),
 ('mt', 'Malta'),
 ('mx', 'Mexico'),
 ('my', 'Malaysia'),
 ('ni', 'Nicaragua'),
 ('nl', 'Netherlands'),
 ('no', 'Norway

In [12]:
full_cl = [x[0] for x in country_list]

In [13]:
whole_cl = [x for x in full_cl if x not in country_set]

In [15]:
whole_cl
len(whole_cl)

48

In [19]:
from datetime import timedelta, date

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2017, 6, 9)
end_date = date(2020, 3, 4)
for single_date in daterange(start_date, end_date):
    date_string = single_date.strftime("%Y-%m-%d")
    if date_string not in date_list:
        print(date_string)

In [18]:
date_list

['2020-03-04',
 '2020-03-03',
 '2020-03-02',
 '2020-03-01',
 '2020-02-29',
 '2020-02-28',
 '2020-02-27',
 '2020-02-26',
 '2020-02-25',
 '2020-02-24',
 '2020-02-23',
 '2020-02-22',
 '2020-02-21',
 '2020-02-20',
 '2020-02-19',
 '2020-02-18',
 '2020-02-17',
 '2020-02-16',
 '2020-02-15',
 '2020-02-14',
 '2020-02-13',
 '2020-02-12',
 '2020-02-11',
 '2020-02-10',
 '2020-02-09',
 '2020-02-08',
 '2020-02-07',
 '2020-02-06',
 '2020-02-05',
 '2020-02-04',
 '2020-02-03',
 '2020-02-02',
 '2020-02-01',
 '2020-01-31',
 '2020-01-30',
 '2020-01-29',
 '2020-01-28',
 '2020-01-27',
 '2020-01-26',
 '2020-01-25',
 '2020-01-24',
 '2020-01-23',
 '2020-01-22',
 '2020-01-21',
 '2020-01-20',
 '2020-01-19',
 '2020-01-18',
 '2020-01-17',
 '2020-01-16',
 '2020-01-15',
 '2020-01-14',
 '2020-01-13',
 '2020-01-12',
 '2020-01-11',
 '2020-01-10',
 '2020-01-09',
 '2020-01-08',
 '2020-01-07',
 '2020-01-06',
 '2020-01-05',
 '2020-01-04',
 '2020-01-03',
 '2020-01-02',
 '2020-01-01',
 '2019-12-31',
 '2019-12-30',
 '2019-12-