In [2]:
!pip install spotipy



In [3]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import json

In [4]:
# Read the Spotify API credentials from the local JSON credentials file.
with open('./cred.json') as crd:
    cred_dict = json.load(crd)

# Either value is None when its key is absent from the file.
client_id, client_secret = (cred_dict.get(key) for key in ('client_id', 'client_secret'))

In [5]:
# Credentials manager built from the client ID and secret read from cred.json.
client_credentials = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)

### Creating the Spotify client object used to extract data from the Spotify API

In [6]:
# Spotify API client configured with the credentials manager.
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials,
)

In [7]:
# URL of the Spotify playlist that the rest of the notebook extracts data from.
playlist_link = "https://open.spotify.com/playlist/5ABHKGoOzxkaa28ttQV9sE"

In [8]:
# Inspect the URL's "/"-separated parts; the last part is the playlist identifier.
playlist_link.split("/")

['https:', '', 'open.spotify.com', 'playlist', '5ABHKGoOzxkaa28ttQV9sE']

In [9]:
# Take the last path segment of the playlist URL as the playlist ID.
# Links copied from Spotify's "Share" menu usually carry a query string
# (e.g. "?si=..."), and a trailing slash is also possible; drop both first so
# that only the bare ID is passed to the API.
playlist_URI = playlist_link.split('?', 1)[0].rstrip('/').split('/')[-1]

In [10]:
# Fetch the playlist's tracks. The API returns them one page at a time, so keep
# requesting the following pages and append their items to the first page;
# otherwise playlists longer than one page are silently truncated.
data = sp.playlist_tracks(playlist_URI)
next_page = sp.next(data)  # None once there is no further page
while next_page:
    data['items'].extend(next_page['items'])
    next_page = sp.next(next_page)

### Data Exploration

In [11]:
# Number of playlist items held in the response.
len(data['items'])

100

In [12]:
# Album ID of the first playlist item's track.
data['items'][0]['track']['album']['id']

'4yP0hdKOZPNshxUOjY0cZj'

In [13]:
# Album name of the first playlist item's track.
data['items'][0]['track']['album']['name']

'After Hours'

In [14]:
# Album release date of the first playlist item's track (returned as a string).
data['items'][0]['track']['album']['release_date']

'2020-03-20'

In [15]:
# Number of tracks on the album of the first playlist item's track.
data['items'][0]['track']['album']['total_tracks']

14

In [16]:
# Public open.spotify.com URL of the album of the first playlist item's track.
data['items'][0]['track']['album']['external_urls']['spotify']

'https://open.spotify.com/album/4yP0hdKOZPNshxUOjY0cZj'

### Data extraction and data cleaning

In [17]:
# Build one record per playlist item describing the album its track belongs to.
album_list = []

for row in data['items']:
    track = row.get('track')
    # Spotify can return a null track for playlist entries that are no longer
    # available; skip those instead of failing on None['album'].
    if not track:
        continue

    album = track['album']

    # Store each album as a dictionary so the list converts directly into a DataFrame.
    album_list.append({
        'album_id': album['id'],
        'album_name': album['name'],
        'album_release_date': album['release_date'],
        'album_total_tracks': album['total_tracks'],
        'album_url': album['external_urls']['spotify'],
    })

In [18]:
# Inspect the list of artists attached to the first playlist item's track.
data['items'][0]['track']['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Xyo4u8uXC1ZmMpatF05PJ'},
  'href': 'https://api.spotify.com/v1/artists/1Xyo4u8uXC1ZmMpatF05PJ',
  'id': '1Xyo4u8uXC1ZmMpatF05PJ',
  'name': 'The Weeknd',
  'type': 'artist',
  'uri': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ'}]

In [19]:
# Build one record per artist credited on each playlist track.
artist_list = []
for row in data['items']:
    # Read the 'track' entry directly instead of scanning every key of the item.
    track = row.get('track')
    # Skip entries without track details (missing or null 'track').
    if not track:
        continue
    for artist in track['artists']:
        artist_list.append({
            'artist_id': artist['id'],
            'artist_name': artist['name'],
            # 'href' is the Web API endpoint (api.spotify.com/v1/artists/...);
            # the public artist page is under external_urls, matching how the
            # album and song URLs are extracted elsewhere in this notebook.
            'external_url': artist['external_urls']['spotify'],
        })

In [20]:
# Inspect the full track object of the first playlist item to see which fields it offers.
data['items'][0]['track']

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Xyo4u8uXC1ZmMpatF05PJ'},
    'href': 'https://api.spotify.com/v1/artists/1Xyo4u8uXC1ZmMpatF05PJ',
    'id': '1Xyo4u8uXC1ZmMpatF05PJ',
    'name': 'The Weeknd',
    'type': 'artist',
    'uri': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ'}],
  'available_markets': ['AR',
   'AU',
   'AT',
   'BE',
   'BO',
   'BR',
   'BG',
   'CA',
   'CL',
   'CO',
   'CR',
   'CY',
   'CZ',
   'DK',
   'DO',
   'DE',
   'EC',
   'EE',
   'SV',
   'FI',
   'FR',
   'GR',
   'GT',
   'HN',
   'HK',
   'HU',
   'IS',
   'IE',
   'IT',
   'LV',
   'LT',
   'LU',
   'MY',
   'MT',
   'MX',
   'NL',
   'NZ',
   'NI',
   'NO',
   'PA',
   'PY',
   'PE',
   'PH',
   'PL',
   'PT',
   'SG',
   'SK',
   'ES',
   'SE',
   'CH',
   'TW',
   'TR',
   'UY',
   'US',
   'GB',
   'AD',
   'LI',
   'MC',
   'ID',
   'JP',
   'TH',
   'VN',
   'RO',
   'IL',
   'ZA',
   'SA',
   'AE',
   'BH',
   'QA',
   'OM',
 

In [21]:
# Build one record per playlist track, with keys linking it to its album and artist.
song_list = []
for song in data['items']:
    track = song.get('track')
    # Spotify can return a null track for playlist entries that are no longer
    # available; skip those instead of failing on None['id'].
    if not track:
        continue

    song_list.append({
        'song_id': track['id'],
        'song_name': track['name'],
        'duration_ms': track['duration_ms'],
        'url': track['external_urls']['spotify'],
        'popularity': track['popularity'],
        'song_added': song['added_at'],
        'album_id': track['album']['id'],
        # Use the track's own first artist rather than the album's first artist:
        # artist_list is built from track artists, so an album-level artist
        # might not exist in the artist table.
        'artist_id': track['artists'][0]['id'],
    })

In [22]:
# Turn the list of album records into a DataFrame (one row per record).
album_df = pd.DataFrame(album_list)

In [23]:
# Preview the first row of the album table.
album_df.head(1)

Unnamed: 0,album_id,album_name,album_release_date,album_total_tracks,album_url
0,4yP0hdKOZPNshxUOjY0cZj,After Hours,2020-03-20,14,https://open.spotify.com/album/4yP0hdKOZPNshxU...


In [24]:
# Keep a single row per album: the first occurrence of each album_id.
album_df = album_df.drop_duplicates(subset='album_id', keep='first')

In [25]:
# Turn the list of artist records into a DataFrame (one row per record).
artist_df = pd.DataFrame(artist_list)

In [26]:
# (rows, columns) of the artist table before duplicates are removed.
artist_df.shape

(133, 3)

In [27]:
# Keep a single row per artist: the first occurrence of each artist_id.
artist_df = artist_df.drop_duplicates(subset='artist_id', keep='first')

In [28]:
# Turn the list of song records into a DataFrame (one row per record).
song_df = pd.DataFrame(song_list)

#### Date and time conversions

In [29]:
# Parse the release-date strings into pandas datetimes and store them back in the same column.
release_dates = pd.to_datetime(album_df['album_release_date'])
album_df['album_release_date'] = release_dates

In [30]:
# Check column dtypes and non-null counts after the date conversion.
album_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95 entries, 0 to 99
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   album_id            95 non-null     object        
 1   album_name          95 non-null     object        
 2   album_release_date  95 non-null     datetime64[ns]
 3   album_total_tracks  95 non-null     int64         
 4   album_url           95 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 4.5+ KB


In [81]:
# Parse the added-at timestamp strings into pandas datetimes and store them back in the same column.
added_timestamps = pd.to_datetime(song_df['song_added'])
song_df['song_added'] = added_timestamps