In [5]:
# Imports
import pandas as pd
import requests
import os
import argparse
import sys
from urllib.parse import urlparse

### Cover download method

In [6]:
# From https://github.com/opiumozor/spotify-cover-downloader

# Spotify API credentials.
# Read from the environment so secrets do not have to be typed into (and
# committed with) the notebook. When a variable is unset the value falls back
# to the empty string, which is what the placeholder used to be.
CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "")


def get_access_token(client_id, client_secret):
    """
    Request an access token from the Spotify accounts service.

    Posts a ``client_credentials`` grant, authenticated with the given
    client id / secret pair, and returns the ``access_token`` field of the
    JSON reply. Any status code other than 200 ends the program through
    ``sys.exit`` (i.e. raises ``SystemExit``).
    """
    token_response = requests.post(
        'https://accounts.spotify.com/api/token',
        data={'grant_type': "client_credentials"},
        auth=(client_id, client_secret),
    )

    # Guard clause: bail out on anything that is not a plain success.
    if token_response.status_code != 200:
        sys.exit("Failed to get access token. Is your client_id and client_secret correct?")

    return token_response.json()['access_token']


def get_api_url(url):
    """
    Get the api url from the song link or the Spotify URI

    Example:
    - https://open.spotify.com/track/7H9sqtNVPp6eoxnJRMUmm4?si=jtQGu_1MQGOF-2WscCvbnA
    - https://open.spotify.com/intl-fr/track/7H9sqtNVPp6eoxnJRMUmm4
    - spotify:track:7H9sqtNVPp6eoxnJRMUmm4

    :param url (str): an http(s) share link or a ``spotify:`` URI
    :return (str): ``https://api.spotify.com/v1/<type>s/<id>``

    Ends the program through ``sys.exit("Failed to build api url.")`` when the
    scheme is neither http(s) nor spotify, or when the input does not contain
    both a resource type and an id.
    """
    parsed_url = urlparse(url)

    if parsed_url.scheme in ('http', 'https'):
        # Drop empty segments (leading / trailing / doubled slashes).
        parts = [segment for segment in parsed_url.path.split('/') if segment]
        # Localized share links carry a leading "intl-xx" segment before the
        # resource type; skip it so the type is not mistaken for the locale.
        if parts and parts[0].startswith('intl-'):
            parts = parts[1:]
    elif parsed_url.scheme == 'spotify':
        parts = [segment for segment in parsed_url.path.split(':') if segment]
    else:
        sys.exit("Failed to build api url.")

    # A usable link needs at least "<type>" and "<id>"; report anything shorter
    # the same way as an unsupported scheme instead of raising IndexError.
    if len(parts) < 2:
        sys.exit("Failed to build api url.")

    resource_type, spotify_id = parts[0], parts[1]

    return "https://api.spotify.com/v1/%ss/%s" % (resource_type, spotify_id)  # add an 's' after the type


def spotify_cover_downloader(url, client_id, client_secret, filename):
    """
    Download a cover image from Spotify and save it as ./covers/<filename>.jpg

    :param url (str): song/album link or Spotify URI, resolved with get_api_url
    :param client_id (str): Spotify client id used to obtain the access token
    :param client_secret (str): Spotify client secret used to obtain the access token
    :param filename (str): base name (without extension) of the saved image

    The first entry of the ``images`` list is the one downloaded. The
    ``./covers`` directory is created when it does not exist yet.
    ``SystemExit`` raised by get_access_token / get_api_url propagates to the
    caller unchanged.
    """
    headers = {"Authorization": "Bearer %s" % get_access_token(client_id, client_secret)}
    api_url = get_api_url(url)

    metadata = requests.get(api_url, headers=headers).json()

    # Track responses nest the artwork under 'album'. get_api_url also accepts
    # album links, whose response carries 'images' at the top level, so fall
    # back to the response itself when there is no 'album' key.
    images = metadata.get('album', metadata)['images']
    cover_url = images[0]['url']

    # open() below does not create missing directories.
    os.makedirs("./covers", exist_ok=True)
    file_name = "./covers/" + filename + ".jpg"

    img_data = requests.get(cover_url).content
    with open(file_name, 'wb') as handler:
        handler.write(img_data)

    print("Cover saved! (%s)" % file_name)

### Parser

In [7]:
# From https://github.com/un33k/python-slugify/blob/master/slugify/slugify.py

import re
import sys
import unicodedata
from html.entities import name2codepoint
try:
    import text_unidecode as unidecode
except ImportError:
    import unidecode

# Named HTML character references: '&<name>;' for every name in name2codepoint.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal character references: '&#<digits>;' (the digits are captured).
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character references: '&#x<hex digits>;' (the digits are captured).
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of anything that is not a dash, an ASCII letter or an ASCII digit.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
# Runs of non-word characters or underscores (used when unicode is allowed).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
# Two or more dashes in a row.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma that sits directly between two digits (e.g. the one in "1,000").
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while building the slug, and the default for slugify().
DEFAULT_SEPARATOR = '-'


def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=(), allow_unicode=False):
    """
    Make a slug from the given text.
    :param text (str): initial text (non-str input is decoded as UTF-8, ignoring errors)
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str):
    """

    # ensure text is unicode -- done before the replacements so that the
    # str.replace calls below also work when the caller passes bytes
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    if not allow_unicode:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    # Best effort: a reference whose number is not a valid code point makes
    # chr()/int() raise; in that case the text is left untouched.
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            pass

    # hexadecimal character reference (same best-effort handling as above)
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            pass

    # translate
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        if lowercase:
            stopwords_lower = [s.lower() for s in stopwords]
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
        else:
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    # NOTE(review): smart_truncate is not defined in the visible part of this
    # notebook -- confirm it is available before calling with max_length > 0.
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # everything above works with DEFAULT_SEPARATOR; swap in the caller's
    # separator only at the very end
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text

## Updating song ids for valid file naming

In [None]:
# Load the data set; the first CSV column is read as the index and then
# immediately turned back into a regular column.
df = pd.read_csv("df_data_proper_name.csv", index_col=0).reset_index()

# Rewrite every id as a case-preserving, underscore-separated slug so that it
# can be used in a file name.
df['id'] = [slugify(raw_id, lowercase=False, separator='_') for raw_id in df['id'].values]

# The cleaned table overwrites the file it was read from.
df.to_csv("df_data_proper_name.csv", index=False)

## Finding missing URIs and other features

In [8]:
# Value of the Authorization header sent with the search / audio-features requests.
# The access token needs an update every hour; it is read from the
# SPOTIFY_ACCESS_TOKEN environment variable so it does not have to be pasted
# into the notebook. When the variable is unset this is just "Bearer ".
access = "Bearer " + os.environ.get("SPOTIFY_ACCESS_TOKEN", "")

def find_uri(artist, track):
    """
    Look up a track on Spotify and return the URI of the best match.

    :param artist (str): artist name to search for
    :param track (str): track title to search for
    :return (str): the ``uri`` of the first search hit

    Sends the module-level ``access`` value as the Authorization header.
    Raises the HTTP error reported by ``raise_for_status`` when the request is
    rejected (e.g. the access token has expired) and ``IndexError`` when the
    search returns no track.
    """
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': access,
    }

    params = {
        # Spotify field filters are written "artist:<name> track:<title>";
        # any other spelling is searched as plain keywords.
        'q': "artist:" + artist + " track:" + track,
        'type': 'track',
        'limit': '1',
    }

    response = requests.get('https://api.spotify.com/v1/search', params=params, headers=headers)
    # Surface HTTP failures as such instead of as a KeyError on the error payload.
    response.raise_for_status()

    items = response.json()['tracks']['items']
    if not items:
        raise IndexError("No Spotify track found for artist %r, track %r" % (artist, track))
    return items[0]['uri']

In [None]:
def get_features(uri):
    """
    Fetch the audio features of a single track.

    The track id is whatever follows the last ':' of ``uri``. It is sent as the
    ``ids`` query parameter, and the first entry of the ``audio_features``
    list in the JSON reply is returned. The module-level ``access`` value is
    used as the Authorization header.
    """
    track_id = uri.rpartition(":")[2]

    reply = requests.get(
        'https://api.spotify.com/v1/audio-features',
        params={'ids': track_id},
        headers={
            'Accept': 'application/json',
            'Content-Type': 'application/json',
            'Authorization': access,
        },
    )

    payload = reply.json()
    return payload['audio_features'][0]

In [None]:
df = df.reset_index()

# Keys read from the get_features result. Each one is written to the column of
# the same name prefixed with an underscore, in this order.
feature_names = (
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
)

for i, row in df.iterrows():
    # Only rows whose uri is still the "Unknown" placeholder are looked up.
    if row['_uri'] != "Unknown":
        continue

    found_uri = find_uri(row["_artist"], row["_song"])
    df.at[i, '_uri'] = found_uri

    track_features = get_features(found_uri)
    # The track name is copied over from the song column.
    df.at[i, '_track'] = df.at[i, '_song']
    for feature_name in feature_names:
        df.at[i, '_' + feature_name] = track_features[feature_name]

## Scraping

In [None]:
for record in df.itertuples():
    fields = list(record)
    song_uri = fields[7]    # position 7 = spotify song uri (per the original note)
    song_id = fields[-2]    # second-to-last position = song id (per the original note)

    # Rows without a known uri have no cover to fetch.
    if song_uri == 'Unknown':
        continue

    try:
        spotify_cover_downloader(song_uri, CLIENT_ID, CLIENT_SECRET, song_id)
    except SystemExit:
        # The downloader's helpers end with sys.exit on failure; run the beep
        # command and carry on with the next row instead of stopping.
        os.system("ffplay.exe " + 'beep.mp3')