### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import time 

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  # me muestre todas las columnas
#pd.set_option('display.max_rows', None)  # me muestre todas las filas

### What albums am I interested on?

#### Rolling Stone magazine Top 500

In [2]:
topalbums = pd.read_csv('../data/clean/DB_Kaggle_top500albums_clean.csv', encoding='latin1')

topalbums.head(3)

Unnamed: 0,toprankingorder,album,artist,year
0,1,Sgt. Pepper's Lonely Hearts Club Band,The Beatles,1967
1,2,Pet Sounds,The Beach Boys,1966
2,3,Revolver,The Beatles,1966


#### Rolling Stone article Horrible 50

In [3]:
horriblealbums = pd.read_csv('../data/clean/WS_50horriblealbums_clean.csv', encoding='latin1')

horriblealbums.head(3)

Unnamed: 0,horriblerankingorder,album,artist,rationale,year
0,50,Its Hard,The Who,"In the early Eighties, Pete Townshend was jugg...",1982
1,49,The Bridge,Billy Joel,Billy Joel had nearly a solid decade of succes...,1986
2,48,Van Halen III,Van Halen,When original Van Halen singer David Lee Roth ...,1998


#### Full album list 

In [4]:
#need to be conscious of the fact that some albums have the same name 

topalbums['album'].value_counts().head(3)

album
Greatest Hits                            3
Let It Be                                2
Sgt. Pepper's Lonely Hearts Club Band    1
Name: count, dtype: int64

In [5]:
#let us try to get unique values for album_artist 

topalbums['album_artist'] = topalbums['album'] + ' - ' + topalbums['artist'] 

In [6]:
topalbums.head(3)

Unnamed: 0,toprankingorder,album,artist,year,album_artist
0,1,Sgt. Pepper's Lonely Hearts Club Band,The Beatles,1967,Sgt. Pepper's Lonely Hearts Club Band - The Be...
1,2,Pet Sounds,The Beach Boys,1966,Pet Sounds - The Beach Boys
2,3,Revolver,The Beatles,1966,Revolver - The Beatles


In [7]:
horriblealbums['album'].value_counts().head(2)

album
Its Hard           1
Leather Jackets    1
Name: count, dtype: int64

In [8]:
horriblealbums['album_artist'] = horriblealbums['album'] + ' - ' + horriblealbums['artist']

In [9]:
horriblealbums.head(2)

Unnamed: 0,horriblerankingorder,album,artist,rationale,year,album_artist
0,50,Its Hard,The Who,"In the early Eighties, Pete Townshend was jugg...",1982,Its Hard - The Who
1,49,The Bridge,Billy Joel,Billy Joel had nearly a solid decade of succes...,1986,The Bridge - Billy Joel


In [10]:
all_albums = pd.concat([topalbums, horriblealbums], ignore_index=True)
all_albums.shape

(550, 7)

In [11]:
all_albums.head(3)

Unnamed: 0,toprankingorder,album,artist,year,album_artist,horriblerankingorder,rationale
0,1.0,Sgt. Pepper's Lonely Hearts Club Band,The Beatles,1967,Sgt. Pepper's Lonely Hearts Club Band - The Be...,,
1,2.0,Pet Sounds,The Beach Boys,1966,Pet Sounds - The Beach Boys,,
2,3.0,Revolver,The Beatles,1966,Revolver - The Beatles,,


#### list of albums

In [12]:
albums = all_albums['album'].unique().tolist()

In [13]:
#some had the same name (e.g. Greatest Hits, Let It Be)

len(albums)

547

#### list of albums_artists 

In [14]:
album_artist = all_albums['album_artist'].unique().tolist()

In [15]:
len(album_artist)

550

### Spotify Connection 

In [16]:
#%pip install pyarrow
#%pip install joblib
#%pip install tqdm
#%pip install spotipy
#%pip install python-dotenv
#%pip install requests 

In [17]:
import requests as rq
import pandas as pd
import pyarrow
from joblib import Parallel, delayed
from tqdm import tqdm
import time
import pymongo
from passwords import *

In [19]:
# Obtener el TOKEN

AUTH_URL = "https://accounts.spotify.com/api/token"

creds = {
    "grant_type": "client_credentials",
    "client_id": Client_ID,
    "client_secret": Client_secret
}

response = rq.post(AUTH_URL, 
                         data=creds, 
                         headers={"Content-Type": "application/x-www-form-urlencoded"})


TOKEN = response.json()['access_token']

# Conexión con la API

url = "https://api.spotify.com/"

headers = {"Authorization": f'Bearer {TOKEN}'}

response = rq.get(url,headers=headers)
response

<Response [200]>

### Getting Album Information 

In [20]:
albums[0:5]

["Sgt. Pepper's Lonely Hearts Club Band",
 'Pet Sounds',
 'Revolver',
 'Highway 61 Revisited',
 'Rubber Soul']

In [21]:
#understanding the json 

In [22]:
url = 'https://api.spotify.com/v1/search?q=album%3Apet+sounds&type=album&limit=50&offset=0'

In [23]:
res = rq.get(url, headers=headers)
res.status_code 

200

In [24]:
data = res.json()

In [25]:
type(data)

dict

In [26]:
data.keys()

dict_keys(['albums'])

In [27]:
data['albums'].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [28]:
data['albums']['items'][0].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'external_urls', 'href', 'id', 'images', 'name', 'release_date', 'release_date_precision', 'total_tracks', 'type', 'uri'])

In [29]:
#quiero el id del album en Spotify 
data['albums']['items'][0]['id']

'6GphKx2QAPRoVGWE9D7ou8'

In [30]:
#quiero el nombre del album en Spotify
data['albums']['items'][0]['name']

'Pet Sounds (Original Mono & Stereo Mix)'

In [31]:
#quiero el nombre del artista del album 
data['albums']['items'][0]['artists'][0]['name']

'The Beach Boys'

In [32]:
#quiero el id del artista del album 
data['albums']['items'][0]['artists'][0]['id']

'3oDbviiivRWhXwIE8hxkVV'

In [33]:
#quiero el release_date del album 
data['albums']['items'][0]['release_date']

'1966-06-16'

In [34]:
#quiero el total de tracks del album 
data['albums']['items'][0]['total_tracks']

27

In [35]:
# Replace spaces with the '+' character in the album names
insertquery = [x.replace(' ', '+') for x in albums]

# Construct a list of URLs for querying album information
album_urls = [f'https://api.spotify.com/v1/search?q=album%3A{album}&type=album&limit=50&offset=0' for album in insertquery]

# Create a list of dictionaries with album information and corresponding URLs
album_url_list = [{'album': album, 'url': url} for album, url in zip(albums, album_urls)]

In [36]:
album_url_list[1]

{'album': 'Pet Sounds',
 'url': 'https://api.spotify.com/v1/search?q=album%3APet+Sounds&type=album&limit=50&offset=0'}

In [37]:
# List to store the retrieved album data
album_data_list = []

# Function to search and retrieve album information
def search_spotify_info(album_url):
    time.sleep(0.30)  # Pause between queries to avoid getting banned

    try:
        response = rq.get(album_url['url'], headers=headers)
        response.raise_for_status()  # Check for request errors
        data = response.json()
        rjson = data.get('albums', {}).get('items', [])

        for index, item in enumerate(rjson):
            if item['name'].lower() == album_url['album'].lower():
                album_data_list.append({
                    'albumnamers': album_url['album'],
                    'albumidspotify': item['id'],  # Spotify ID for the album
                    'albumnamespotify': item['name'],
                    'artistnamespotify': item['artists'][0]['name'],
                    'artistidspotify': item['artists'][0]['id'],
                    'releasedate': item['release_date'],
                    'totaltracks': item['total_tracks']
                })
                
                return rjson[index]

        # If the album is not found, raise an exception to return None
        raise Exception("Album not found")
    
    except Exception as e:
        print(f"An error occurred: {e}")
        album_data_list.append({
                    'albumnamers': None, 
                    'albumidspotify': None, 
                    'albumnamespotify': None, 
                    'artistnamespotify': None,
                    'artistidspotify': None,
                    'releasedate': None,
                    'totaltracks': None 
        })
        return {
            'album_type': None,
            'artists': None,
            'available_markets': None,
            'external_urls': None,
            'href': None,
            'id': None,
            'images': None,
            'name': None,
            'release_date': None,
            'release_date_precision': None,
            'total_tracks': None,
            'type': None,
            'uri': None
        }
    
# Execute the function for each album URL
for album_url in album_url_list:
    search_spotify_info(album_url)

# Create a DataFrame from the list of dictionaries
dataalbums = pd.DataFrame(album_data_list)

An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: Album not found
An error occurred: A

In [38]:
dataalbums.head()

Unnamed: 0,albumnamers,albumidspotify,albumnamespotify,artistnamespotify,artistidspotify,releasedate,totaltracks
0,Sgt. Pepper's Lonely Hearts Club Band,1x1jpjDbetGqX0IKCUIBNj,Sgt. Pepper's Lonely Hearts Club Band,Bloco do Sargento Pimenta,3wGWCP3E3tYqj5memYV9Vq,2017-12-08,13.0
1,Pet Sounds,2CNEkSE8TADXRT2AzcEt1b,Pet Sounds,The Beach Boys,3oDbviiivRWhXwIE8hxkVV,1966-05-16,13.0
2,Revolver,0T1sskJDoybYGvPU5aw5Cf,REVOLVER,Lil Darkie,62F9BiUmjqeXbBztCwiX1U,2020-06-15,1.0
3,Highway 61 Revisited,6YabPKtZAjxwyWbuO9p4ZD,Highway 61 Revisited,Bob Dylan,74ASZWbe4lXaubB36ztrGX,1965-08-30,9.0
4,Rubber Soul,5TIQEIzrI6RQfUVQ5Y571D,Rubber Soul,The Beatles Complete On Ukulele,5o723EMxNulM5ydXRh7Qkk,2020-07-28,16.0


In [39]:
dataalbums.dropna(inplace=True)

In [40]:
dataalbums.shape

(442, 7)

### Getting Artist Information 

In [41]:
artist_ids = dataalbums['artistidspotify'].unique().tolist()

In [42]:
artist_ids[0:3]

['3wGWCP3E3tYqj5memYV9Vq', '3oDbviiivRWhXwIE8hxkVV', '62F9BiUmjqeXbBztCwiX1U']

In [43]:
# STEP 1 of 2 for artist IDs: Construct a list of URLs for querying artist information
artist_urls = [f'https://api.spotify.com/v1/artists/{artist_id}' for artist_id in artist_ids]

# Create a list of dictionaries with artist information and corresponding URLs
artist_url_list = [{'id': artist_id, 'url': url} for artist_id, url in zip(artist_ids, artist_urls)]

In [44]:
# List to store the retrieved artist data
artist_data_list = []

# Function to search and retrieve artist information
def search_artist_info(artist_url):
    time.sleep(0.30)  # Pause between queries to avoid getting banned

    try:
        response = rq.get(artist_url['url'], headers=headers)
        response.raise_for_status()  # Check for request errors
        data = response.json()

        artist_data_list.append({
            'artistidspotify': artist_url['id'],  # Spotify ID for the artist
            'artistnamespotify': data.get('name', None),
            'followers': data.get('followers', {}).get('total', None),
            'popularity': data.get('popularity', None),
            'genres': data.get('genres', None)
            # Add more data points as needed
        })

    except Exception as e:
        print(f"An error occurred: {e}")
        artist_data_list.append({
            'artistidspotify': artist_url['id'],
            'artistnamespotify': None,
            'followers': None,
            'popularity': None,
            'genres': None
            # Add more data points as needed
        })

# Execute the function for each artist URL
for artist_url in artist_url_list:
    search_artist_info(artist_url)

# Create a DataFrame from the list of dictionaries
dataartists = pd.DataFrame(artist_data_list)

In [46]:
dataartists.tail(5)

Unnamed: 0,artistidspotify,artistnamespotify,followers,popularity,genres
317,05o2ENqv0CV8aD6BWKEaBD,Drayton Farley,60133,52,[modern country pop]
318,0gBvuNzrFCOVaiyKexoYMH,Todrick Hall,402075,52,[strut]
319,6LkVV1P1gvsluy7OOZNG7g,NEAT001,25,0,[]
320,7DEseTqRODmSu3C7jxCHl5,The Boxer Rebellion,109037,42,[indie rock]
321,4pdoRs7yHNXakMobf8M9Oz,SASAMI,61174,33,[]


### Getting Tracks Information

In [47]:
album_ids = dataalbums['albumidspotify'].unique().tolist()

In [48]:
album_ids[0:3]

['1x1jpjDbetGqX0IKCUIBNj', '2CNEkSE8TADXRT2AzcEt1b', '0T1sskJDoybYGvPU5aw5Cf']

In [49]:
#test 
urltest = 'https://api.spotify.com/v1/albums/2CNEkSE8TADXRT2AzcEt1b/tracks'

In [50]:
response = rq.get(urltest)
response 

<Response [401]>

In [187]:
# STEP 1 of 2 for track IDs: Construct a list of URLs for querying track information
track_urls = [f'https://api.spotify.com/v1/albums/{album_id}/tracks' for album_id in album_ids]
track_url_list = [{'album_id': album_id, 'url': url} for album_id, url in zip(album_ids, track_urls)]xx