## 1. Import dependencies and define functions

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import time 
from config import client_id, client_secret
import re
import sys
import math

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Use regex to simply list of spotify_ids to only the ids. 
def getURI(uri_list):
#     p = re.compile(r'(?:spotify:playlist:)(\w{22})')
    p = re.compile(r'(?:spotify:(?:album|playlist|artist|track):)(\w{22})')
    URIs = [p.match(item).group(1) for item in uri_list]
    return URIs

# Scan a list of playlist URIs and return 
def getPlaylistTracks(playlist_URI_list):
    # Tracks variable will hold the JSON dictionaries,
    # one dictionary for every playlist 
    
    clean_URIs = getURI(playlist_URI_list)
    json_dicts = [sp.playlist_items(uri) for uri in clean_URIs]
    tracks = []
    playlist_count = 0
    song_count = 0
    for i in range(len(json_dicts)):
        json = json_dicts[i]['items']
        playlist_count += 1
        for j in range(len(json)):
            try:
                track = json[j]['track']['id']
                tracks.append(track)
                song_count +=1
            except (TypeError, ValueError):
                pass
    print('Number of playlists scanned: ',playlist_count, 'Total songs: ', song_count)
    return tracks

In [4]:
# Returna list of information for each track
def getTrackFeatures(id):
    features = sp.audio_features(id)

    acousticness = features[0]['acousticness']
    danceability = features[0]['danceability']
    energy = features[0]['energy']
    instrumentalness = features[0]['instrumentalness']
    liveness = features[0]['liveness']
    loudness = features[0]['loudness']
    key = features[0]['key']
    mode = features[0]['mode']
    valence = features[0]['valence']
    speechiness = features[0]['speechiness']
    tempo = features[0]['tempo']
    time_signature = features[0]['time_signature']
    song_uri = features[0]['id']
    duration_ms = features[0]['duration_ms']

    track = [song_uri, key, mode, valence, danceability, acousticness, energy, instrumentalness, liveness, loudness, speechiness, tempo, time_signature, duration_ms]
    return track

## 2. Import our playlist mood data and grab track info for each mood

In [7]:
year_playlists = pd.read_csv('../analysis_data/year_playlists.csv')
playlist_uri = getURI(list(year_playlists['id']))
year_playlists['playlist_uri'] = playlist_uri
year_playlists.head()

Unnamed: 0,id,year,playlist_uri
0,spotify:playlist:6bHjBCFN8Lqj0K54RWFci8,1979,6bHjBCFN8Lqj0K54RWFci8
1,spotify:playlist:7DCh6mOvaEGMVph25k7hyN,1980,7DCh6mOvaEGMVph25k7hyN
2,spotify:playlist:5wbHH4DqF4AVuGmjXI8kcW,1981,5wbHH4DqF4AVuGmjXI8kcW
3,spotify:playlist:1bk6tO6d5oes6n0vhACi5x,1982,1bk6tO6d5oes6n0vhACi5x
4,spotify:playlist:7GN1ulgzXBWpn5VADARkNd,1983,7GN1ulgzXBWpn5VADARkNd


In [8]:
sp.playlist_items(year_playlists['playlist_uri'][0]).keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [9]:
# Loop through ever playlist in the df 
json_dict = {}
for i in range(len(year_playlists['playlist_uri'])):
    year = year_playlists['year'][i]
    tracks = (sp.playlist_items(year_playlists['playlist_uri'][i]))['items']
    json_dict[year] = tracks
print("Done")

Done


In [10]:
years = list(year_playlists['year'])

In [11]:
track_info = []
for year in years:
    track_json = json_dict[year]
    for i in range(len(track_json)):
        name = track_json[i]['track']['name']
        song_uri = track_json[i]['track']['id']
        artist_name = track_json[i]['track']['artists'][0]['name']
        artist_id = track_json[i]['track']['artists'][0]['id']
        track_info.append([name, song_uri, artist_name, artist_id, year])

In [12]:
year_df = pd.DataFrame(track_info, columns = ["name", "song_uri", "artist_name", "artist_id","year"]) 

In [13]:
year_df

Unnamed: 0,name,song_uri,artist_name,artist_id,year
0,My Sharona,1HOMkjp0nHMaTnfAkslCQj,The Knack,0Nn9YwJzcaeuU1jJL06e3r,1979
1,"Bad Girls - 12"" Version",3n48NtvOovv1UMTA41zxMn,Donna Summer,2eogQKWWoohI3BSnoG7E2U,1979
2,Le Freak - Edit,6Lphpr9Z6H282Sguw0dUWa,CHIC,0Xf8oDAJYd2D0k3NLI19OV,1979
3,Do Ya Think I'm Sexy,3wsPg2KrRYZFi0inIFa41x,Rod Stewart,2y8Jo9CKhJvtfeKOsYzRdT,1979
4,Reunited,2Y18kxNc1F6Ut6t5eyKsRD,Peaches & Herb,6qI4LTzMRpTxRzMZPvv2C6,1979
...,...,...,...,...,...
4174,More Than My Hometown,0eBXyY4SatzpE7opnzgXvz,Morgan Wallen,4oUHIQIBe0LHzYfvXNW4QM,2020
4175,Lovin' On You,0nYvjcSlCgjcwogQAwIwNp,Luke Combs,718COspgdWOnwOFpJHRZHS,2020
4176,Said Sum,3sKz6Sd72K0ofPWcJPPk6H,Moneybagg Yo,3tJoFztHeIJkJWMrx0td2f,2020
4177,Slide,2rTnVB1bvwxHtaIl4uVu7f,H.E.R.,3Y7RZ31TRPVadSFVy1o8os,2020


In [14]:
track_ids = list(year_df['song_uri'])
track_year = year_df['year']

In [15]:
len(track_ids)

4179

## 3. Create the dataframes and export

In [16]:
track1100 = track_ids[0:1100]
track2100 = track_ids[1100:2100]
track3100 = track_ids[2100:3100]
track4200 = track_ids[3100:4200]

In [17]:
def createDataFrame(track_ids, export_name):
    tracks = []

    for i in range(len(track_ids)):
        try:
            time.sleep(.5)
            track = getTrackFeatures(track_ids[i])
            tracks.append(track)
        except:
            print(f"The following error occured: {sys.exc_info()[0]}")
            pass

    # create dataset
    df = pd.DataFrame(tracks, columns = ["song_uri", "key", "mode", "valence", "danceability", "acousticness", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "tempo", "time_signature", "duration_ms"])
    df.to_csv(export_name, sep = ',')
    return df

In [18]:
test = createDataFrame(track1100[:5],'test.csv')

In [20]:
test

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,1HOMkjp0nHMaTnfAkslCQj,0,1,0.897,0.586,0.0589,0.7,0.00125,0.0318,-9.558,0.0363,147.245,4,295400
1,3n48NtvOovv1UMTA41zxMn,2,0,0.97,0.872,0.00839,0.873,0.00568,0.451,-6.824,0.0374,120.378,4,295126
2,6Lphpr9Z6H282Sguw0dUWa,7,1,0.89,0.832,0.0422,0.553,0.00356,0.249,-13.705,0.0487,119.825,4,215693
3,3wsPg2KrRYZFi0inIFa41x,2,0,0.877,0.716,0.0107,0.465,0.000667,0.099,-15.205,0.0307,112.087,4,325813
4,2Y18kxNc1F6Ut6t5eyKsRD,5,0,0.346,0.607,0.614,0.462,8e-06,0.0724,-10.004,0.0287,75.752,4,340867


In [21]:
df_chunk1 = createDataFrame(track1100, 'chunk1.csv')

In [22]:
df_chunk1.head()

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,1HOMkjp0nHMaTnfAkslCQj,0,1,0.897,0.586,0.0589,0.7,0.00125,0.0318,-9.558,0.0363,147.245,4,295400
1,3n48NtvOovv1UMTA41zxMn,2,0,0.97,0.872,0.00839,0.873,0.00568,0.451,-6.824,0.0374,120.378,4,295126
2,6Lphpr9Z6H282Sguw0dUWa,7,1,0.89,0.832,0.0422,0.553,0.00356,0.249,-13.705,0.0487,119.825,4,215693
3,3wsPg2KrRYZFi0inIFa41x,2,0,0.877,0.716,0.0107,0.465,0.000667,0.099,-15.205,0.0307,112.087,4,325813
4,2Y18kxNc1F6Ut6t5eyKsRD,5,0,0.346,0.607,0.614,0.462,8e-06,0.0724,-10.004,0.0287,75.752,4,340867


In [23]:
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [24]:
df_chunk2 = createDataFrame(track2100,'chunk2.csv')

The following error occured: <class 'TypeError'>
The following error occured: <class 'TypeError'>
The following error occured: <class 'TypeError'>


In [25]:
df_chunk2.head()

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,6m59VvDUi0UQsB2eZ9wVbH,6,0,0.775,0.84,0.00195,0.56,0.00417,0.633,-10.53,0.21,111.815,4,261853
1,7j5TIXPi0cCbSSqItmbyZy,6,1,0.304,0.748,0.00348,0.84,0.00647,0.44,-7.701,0.0421,115.997,4,317080
2,0gLQ6jhJsyYfl7PrD1RZ7X,5,1,0.355,0.256,0.433,0.464,0.0,0.121,-10.959,0.0382,205.362,3,209293
3,3wNXqTMimmuTEj1iEPftC7,5,0,0.41,0.792,0.786,0.411,0.00212,0.0499,-12.384,0.033,101.976,4,322173
4,6BjqF9DTiAubeE34grUKVq,1,1,0.383,0.727,0.077,0.528,0.0,0.344,-8.771,0.0585,96.934,4,304373


In [26]:
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [27]:
df_chunk3 = createDataFrame(track3100,'chunk3.csv')

In [28]:
df_chunk3.head()

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,6fA7akEuTUL3dW1V0GELaZ,6,0,0.527,0.518,0.0404,0.83,4e-06,0.375,-6.814,0.0386,95.468,4,256133
1,5PXYIVQZ8xmNoONoHzbrv2,11,0,0.763,0.739,0.00947,0.947,3e-05,0.326,-1.915,0.0412,88.009,4,261933
2,7mCRbaZCjHY2soGQPw93cY,2,1,0.427,0.46,5.4e-05,0.829,9.2e-05,0.227,-6.266,0.0355,155.843,4,316733
3,1kuZSCuFZh718pUEMhgfSs,6,0,0.762,0.783,0.0817,0.58,0.0,0.0831,-7.398,0.298,93.026,4,284360
4,4pmc2AxSEq6g7hPVlJCPyP,1,1,0.683,0.771,0.00543,0.685,0.00157,0.0537,-4.639,0.0567,88.997,4,230200


In [29]:
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [30]:
df_chunk4 = createDataFrame(track4200,'chunk4.csv')

The following error occured: <class 'TypeError'>


In [31]:
df_chunk4.head()

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,60jzFy6Nn4M0iD1d94oteF,11,1,0.812,0.563,0.113,0.75,0.0,0.0788,-4.496,0.127,173.906,4,222920
1,4TCL0qqKyqsMZml0G3M9IM,3,1,0.716,0.824,0.00521,0.836,0.000817,0.112,-5.903,0.0404,122.014,4,220640
2,55qBw1900pZKfXJ6Q9A2Lc,10,1,0.605,0.719,0.0132,0.804,3e-06,0.139,-4.581,0.0355,119.999,4,227760
3,7BqBn9nzAq8spo5e7cZ0dJ,5,1,0.424,0.635,0.0134,0.841,0.0,0.0622,-5.379,0.0422,109.021,4,220734
4,2MIqLCofYbazo7rXz5RErV,7,0,0.623,0.767,0.174,0.824,0.0,0.357,-4.558,0.0479,129.96,4,214760


In [34]:
frames = [df_chunk1, df_chunk2, df_chunk3, df_chunk4]
concat_df = pd.concat(frames)
concat_df

Unnamed: 0,song_uri,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,1HOMkjp0nHMaTnfAkslCQj,0,1,0.897,0.586,0.05890,0.700,0.001250,0.0318,-9.558,0.0363,147.245,4,295400
1,3n48NtvOovv1UMTA41zxMn,2,0,0.970,0.872,0.00839,0.873,0.005680,0.4510,-6.824,0.0374,120.378,4,295126
2,6Lphpr9Z6H282Sguw0dUWa,7,1,0.890,0.832,0.04220,0.553,0.003560,0.2490,-13.705,0.0487,119.825,4,215693
3,3wsPg2KrRYZFi0inIFa41x,2,0,0.877,0.716,0.01070,0.465,0.000667,0.0990,-15.205,0.0307,112.087,4,325813
4,2Y18kxNc1F6Ut6t5eyKsRD,5,0,0.346,0.607,0.61400,0.462,0.000008,0.0724,-10.004,0.0287,75.752,4,340867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1073,0eBXyY4SatzpE7opnzgXvz,6,1,0.574,0.621,0.60100,0.882,0.000000,0.1320,-5.010,0.0459,126.014,4,216573
1074,0nYvjcSlCgjcwogQAwIwNp,4,1,0.530,0.572,0.00165,0.949,0.000195,0.1630,-4.865,0.0600,118.974,4,194867
1075,3sKz6Sd72K0ofPWcJPPk6H,8,0,0.274,0.929,0.01850,0.667,0.000000,0.1000,-6.789,0.3530,126.998,4,155168
1076,2rTnVB1bvwxHtaIl4uVu7f,10,0,0.197,0.832,0.08070,0.469,0.000008,0.2070,-9.141,0.3390,97.023,4,238321


In [35]:
merged = year_df.merge(concat_df, how='inner', on='song_uri')
merged.head()

Unnamed: 0,name,song_uri,artist_name,artist_id,year,key,mode,valence,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,time_signature,duration_ms
0,My Sharona,1HOMkjp0nHMaTnfAkslCQj,The Knack,0Nn9YwJzcaeuU1jJL06e3r,1979,0,1,0.897,0.586,0.0589,0.7,0.00125,0.0318,-9.558,0.0363,147.245,4,295400
1,"Bad Girls - 12"" Version",3n48NtvOovv1UMTA41zxMn,Donna Summer,2eogQKWWoohI3BSnoG7E2U,1979,2,0,0.97,0.872,0.00839,0.873,0.00568,0.451,-6.824,0.0374,120.378,4,295126
2,Le Freak - Edit,6Lphpr9Z6H282Sguw0dUWa,CHIC,0Xf8oDAJYd2D0k3NLI19OV,1979,7,1,0.89,0.832,0.0422,0.553,0.00356,0.249,-13.705,0.0487,119.825,4,215693
3,Do Ya Think I'm Sexy,3wsPg2KrRYZFi0inIFa41x,Rod Stewart,2y8Jo9CKhJvtfeKOsYzRdT,1979,2,0,0.877,0.716,0.0107,0.465,0.000667,0.099,-15.205,0.0307,112.087,4,325813
4,Reunited,2Y18kxNc1F6Ut6t5eyKsRD,Peaches & Herb,6qI4LTzMRpTxRzMZPvv2C6,1979,5,0,0.346,0.607,0.614,0.462,8e-06,0.0724,-10.004,0.0287,75.752,4,340867


In [56]:
#Remove duplicates
print("Number of duplicates: ", merged.duplicated().sum())
merged = merged.drop_duplicates()
merged.to_csv("merged.csv", sep = ',')

Number of duplicates:  526


In [57]:
# Import dependencies

from sqlalchemy import create_engine
import psycopg2
from config import db_password

In [58]:
# Connect df playlst to SQL

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/spotify_data"
engine = create_engine(db_string)
    
merged.to_sql(name='spotify_year', con=engine, if_exists='replace')