In [1]:
import requests
import os
import pandas as pd
import datetime
import json

In [2]:
# URL template for one weekly chart; filled in with str.format using the
# `chart`, `country` and `date` fields.
endpoint = (
    'https://charts-spotify-com-service.spotify.com'
    '/auth/v0/charts/{chart}-{country}-weekly/{date}'
)

In [3]:
# Directory that holds the per-chart CSV files; created if it is not there yet.
database_dir = 'data'
os.makedirs(name=database_dir, exist_ok=True)

In [4]:
# Markets to download charts for: code placed in the request URL -> display name.
# "GLOBAL" is the worldwide chart; the other keys are two-letter country codes.
# The insertion order here is the order in which countries are fetched.
countries = {
    "GLOBAL": "Global",
    "AR": "Argentina",
    "AU": "Australia",
    "AT": "Austria",
    "BY": "Belarus",
    "BE": "Belgium",
    "BO": "Bolivia",
    "BR": "Brazil",
    "BG": "Bulgaria",
    "CA": "Canada",
    "CL": "Chile",
    "CO": "Colombia",
    "CR": "Costa Rica",
    "CY": "Cyprus",
    "CZ": "Czech Republic",
    "DK": "Denmark",
    "DO": "Dominican Republic",
    "EC": "Ecuador",
    "EG": "Egypt",
    "SV": "El Salvador",
    "EE": "Estonia",
    "FI": "Finland",
    "FR": "France",
    "DE": "Germany",
    "GR": "Greece",
    "GT": "Guatemala",
    "HN": "Honduras",
    "HK": "Hong Kong",
    "HU": "Hungary",
    "IS": "Iceland",
    "IN": "India",
    "ID": "Indonesia",
    "IE": "Ireland",
    "IL": "Israel",
    "IT": "Italy",
    "JP": "Japan",
    "KZ": "Kazakhstan",
    "LV": "Latvia",
    "LT": "Lithuania",
    "LU": "Luxembourg",
    "MY": "Malaysia",
    "MX": "Mexico",
    "MA": "Morocco",
    "NL": "Netherlands",
    "NZ": "New Zealand",
    "NI": "Nicaragua",
    "NG": "Nigeria",
    "NO": "Norway",
    "PK": "Pakistan",
    "PA": "Panama",
    "PY": "Paraguay",
    "PE": "Peru",
    "PH": "Philippines",
    "PL": "Poland",
    "PT": "Portugal",
    "RO": "Romania",
    "SA": "Saudi Arabia",
    "SG": "Singapore",
    "SK": "Slovakia",
    "ZA": "South Africa",
    "KR": "South Korea",
    "ES": "Spain",
    "SE": "Sweden",
    "CH": "Switzerland",
    "TW": "Taiwan",
    "TH": "Thailand",
    "TR": "Turkey",
    "AE": "UAE",
    "UA": "Ukraine",
    "GB": "United Kingdom",
    "UY": "Uruguay",
    "US": "USA",
    "VE": "Venezuela",
    "VN": "Vietnam"
}

In [19]:
# Every market code, in the order the `countries` mapping declares them.
all_countries = [code for code in countries]

In [72]:
# Bearer token for the charts endpoint. A real token must never be saved in the
# notebook: export SPOTIFY_CHARTS_TOKEN before starting the kernel instead.
# If the variable is unset the token is empty; the download and retry loops
# prompt for a new token whenever the API answers 401.
token = os.environ.get("SPOTIFY_CHARTS_TOKEN", "")

In [73]:
# Headers sent with every chart request. The Authorization value is replaced
# in place when a request is rejected with 401 and a new token is entered.
headers = dict(Authorization=f'Bearer {token}')

In [9]:
# Chart kinds handled by this notebook, mapped to the chart name used in the
# endpoint URL (the "song" chart is requested as "regional").
chartTypesMap = dict(
    album="album",
    artist="artist",
    song="regional",
)

# Notebook-side chart kind names, in declaration order.
chartTypes = [*chartTypesMap]

In [10]:
def get_weekly_song_rankings(start_date, country_code, chartType, timeout=30):
    """Fetch one weekly chart from the charts endpoint.

    Args:
        start_date: Chart date string placed in the URL (callers pass a
            ``YYYY-MM-DD`` date or ``'latest'``).
        country_code: Market code placed in the URL.
        chartType: Chart name as the endpoint spells it, i.e. a value of
            ``chartTypesMap`` such as ``'regional'``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the download loop forever.

    Returns:
        ``(data, 200)`` with the decoded JSON body on success,
        ``(None, status_code)`` when the server answers with any other status,
        and ``(None, None)`` when the request itself fails (timeout, connection
        error, ...), so callers can record the failure and carry on.
    """
    url = endpoint.format(country=country_code, date=start_date, chart=chartType)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Transport-level failure: report it like any other failed fetch
        # instead of aborting a long-running download.
        print(f"Error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None, response.status_code

    return response.json(), 200

In [11]:
# strftime pattern (year-month-day) used to render chart dates for requests.
date_format = "%Y-%m-%d"

In [12]:
def get_entry(entry, chartType):
    """Flatten one chart entry into a single-level dict.

    The result starts as a copy of ``entry['chartEntryData']`` and is extended
    with the id (last ``:``-separated part of the URI), name and image link
    taken from the metadata section that matches ``chartType``. Album and song
    rows also get a comma-separated ``artists`` string; song rows additionally
    get ``release_date`` (``'Unknown'`` when the entry has none).

    Args:
        entry: One element of a chart response's ``entries`` list.
        chartType: ``"album"``, ``"artist"`` or ``"song"``.

    Returns:
        A new dict; ``entry`` itself is not modified.

    Raises:
        ValueError: If ``chartType`` is not one of the three supported kinds.
    """
    row = dict(entry['chartEntryData'])

    if chartType == "album":
        meta = entry['albumMetadata']
        row['album_id'] = meta['albumUri'].split(':')[-1]
        row['album_name'] = meta['albumName']
        row['image_link'] = meta['displayImageUri']
        row['artists'] = ', '.join(artist['name'] for artist in meta['artists'])
        return row

    if chartType == "artist":
        meta = entry['artistMetadata']
        row['artist_id'] = meta['artistUri'].split(':')[-1]
        row['artist_name'] = meta['artistName']
        row['image_link'] = meta['displayImageUri']
        return row

    if chartType == "song":
        meta = entry['trackMetadata']
        row['track_id'] = meta['trackUri'].split(':')[-1]
        row['track_name'] = meta['trackName']
        row['image_link'] = meta['displayImageUri']
        row['artists'] = ', '.join(artist['name'] for artist in meta['artists'])
        row['release_date'] = meta.get('releaseDate', 'Unknown')
        return row

    raise ValueError("Invalid chart type")

In [13]:
def weekly_to_lines(weekly_json, chartType):
    """Convert one weekly chart response into a list of flat row dicts.

    Each element of ``weekly_json['entries']`` is flattened with ``get_entry``
    and tagged with the chart's ``country``, ``date`` and ``chartType``, all
    read from ``weekly_json['displayChart']``.

    Args:
        weekly_json: Decoded chart response.
        chartType: Chart kind passed through to ``get_entry``.

    Returns:
        A list with one dict per chart entry, in response order.
    """
    rows = []
    for item in weekly_json['entries']:
        row = get_entry(item, chartType)
        chart = weekly_json['displayChart']
        dimensions = chart['chartMetadata']['dimensions']
        row['country'] = dimensions['country']
        row['date'] = chart['date']
        # The stored chartType comes from the response, not from the argument.
        row['chartType'] = dimensions['chartType']
        rows.append(dict(row))
    return rows

In [44]:
# Ask for each country's latest song chart and remember the earliest chart
# date it reports; countries whose request fails are reported and left out.
country_earliest_dates = {}
song_chart = chartTypesMap['song']
for country in all_countries:
    json_data, status = get_weekly_song_rankings('latest', country, song_chart)
    if status != 200:
        print(f"Failed to fetch data for {country}: {status}")
        continue
    dimensions = json_data['displayChart']['chartMetadata']['dimensions']
    earliest_date = dimensions['earliestDate']
    country_earliest_dates[country] = earliest_date

In [None]:
start_date = datetime.datetime(2016, 12, 29)
date = start_date

# Rows fetched during this run: a plain list with one dict per chart entry.
rankings_db = []
chartType = 'song'

# Walk backwards one week at a time for as long as the date stays after 2015.
while date.year > 2015:
    # The formatted date is the same for every country in a given week.
    date_str = date.strftime(date_format)

    for country_code in all_countries:
        # Skip weeks before the country's first chart (year-month-day strings
        # compare chronologically). A country whose earliest date could not be
        # looked up has no known lower bound, so the request is attempted and
        # any failure is recorded below instead of raising KeyError.
        earliest_known = country_earliest_dates.get(country_code)
        if earliest_known is not None and date_str < earliest_known:
            continue
        country_lower = country_code.lower()
        json_data, status_code = get_weekly_song_rankings(date_str, country_lower, chartTypesMap[chartType])

        if json_data is None:
            # Remember the failed request so it can be retried later.
            with open("failed.txt", "a") as f:
                f.write(f"{chartType},{date_str},{country_code}\n")

            if status_code == 401:
                # Token rejected: ask for a fresh one and use it from now on.
                token = input("Please enter a new token: ")
                headers['Authorization'] = f'Bearer {token}'
            continue

        ranking = weekly_to_lines(json_data, chartType)
        rankings_db.extend(ranking)

    # Move date to previous week
    date -= datetime.timedelta(weeks=1)


In [75]:
# Show the current value of `date` in the notebook's date format.
date_label = date.strftime(date_format)
print(date_label)

2016-12-29


In [76]:
# Load the rows already stored for this chart type and report how many there are.
current_path = os.path.join(database_dir, f'{chartType}.csv')
current_df = pd.read_csv(current_path)
print(len(current_df))


5346078


In [77]:
# Turn the rows fetched in this session into a DataFrame and report its size.
new_df = pd.DataFrame(data=rankings_db)
print(len(new_df))


583899


In [78]:
# Append the new rows after the stored ones, renumbering the index from zero.
frames = [current_df, new_df]
final_df = pd.concat(frames, ignore_index=True)
print(len(final_df))


5929977


In [None]:
# Write the merged table over the chart's CSV file, without the row index.
final_path = os.path.join(database_dir, f'{chartType}.csv')
final_df.to_csv(final_path, index=False)

In [15]:
# Save databases
chart_df = pd.DataFrame(rankings_db)
chart_df.to_csv(os.path.join(database_dir, f'{chartType}.csv'), index=False)
print(len(chart_df))

295800


In [62]:
# Rows recovered by the retry pass, grouped by notebook-side chart kind.
fetched = {
    "album": [],
    "artist": [],
    "song": []
}

# Retry every request recorded in failed.txt, one "chart,date,country" line each.
with open("failed.txt", "r") as f:
    for line in f:
        fields = [field.strip() for field in line.split(",")]
        if len(fields) < 3:
            # Blank or truncated line: nothing to retry.
            continue
        chart = fields[0]
        # Distinct names keep the notebook-level `date` (a datetime) intact.
        failed_date = fields[1]
        failed_country = fields[2].lower()

        json_data, sc = get_weekly_song_rankings(failed_date, failed_country, chartTypesMap[chart])
        if json_data is None:
            print(f"Error at {failed_date}, {failed_country} due to error.")
            if sc == 401:
                # Token rejected: ask for a fresh one and use it from now on.
                token = input("Please enter a new token: ")
                headers['Authorization'] = f'Bearer {token}'
            continue

        # Parse with the chart kind of this line, not the notebook-wide
        # `chartType`, so album/artist entries are read from the right section.
        rankings = weekly_to_lines(json_data, chart)

        fetched[chart].extend(rankings)

Error: 503
Error at 2023-10-19, nl due to error.
Error: 503
Error at 2023-10-19, nl due to error.


KeyboardInterrupt: 

In [None]:
# Append the re-fetched rows to each chart's stored table, then rewrite the
# CSV file and a pickle copy next to it.
for chartType, refetched_rows in fetched.items():
    csv_path = os.path.join(database_dir, f'{chartType}.csv')
    pkl_path = os.path.join(database_dir, f'{chartType}.pkl')

    original_df = pd.read_csv(csv_path)
    refetched_df = pd.DataFrame(refetched_rows)
    chart_df = pd.concat([original_df, refetched_df], ignore_index=True)
    print(len(chart_df))

    chart_df.to_csv(csv_path, index=False)
    # Save to pickle as well
    chart_df.to_pickle(pkl_path)