In [None]:
import pandas as pd
import numpy as np
import os
import requests
import datetime
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

In [None]:
# Last.fm credentials come from the environment so they never live in the notebook.
# Both values are None when the corresponding variable is not set.
username = os.getenv('LAST_FM_USER_AGENT')
apiKey = os.getenv('LAST_FM_KEY')

In [None]:
# Adapted from
# "https://medium.com/@m.w.bochniewicz/music-analysis-with-python-part-1-create-your-own-dataset-with-lastfm-and-spotify-8223a46fad4b"
def getScrobbles(username, apiKey, lastWeek, now, limit=200, page=1, timeout=10):
    """Download every scrobble a Last.fm user logged inside a time window.

    Parameters
    ----------
    username : str
        Last.fm user whose history is read.
    apiKey : str
        Last.fm API key.
    lastWeek, now : int
        Start and end of the window, sent to the API as ``from`` and ``to``.
    limit : int, default 200
        Number of tracks requested per page.
    page : int, default 1
        Kept for backward compatibility only; every page of the window is
        always retrieved, exactly as before.
    timeout : float, default 10
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    tuple of five lists
        ``(artistNames, artistMbids, trackNames, trackMbids, timestamps)``,
        index-aligned, one entry per scrobble that carries a ``date`` field.

    Raises
    ------
    RuntimeError
        If a response has no ``recenttracks`` object (e.g. an API error body).
    """
    url = 'https://ws.audioscrobbler.com/2.0/'
    # Passing the values as params lets requests URL-encode them.
    baseQuery = {
        'method': 'user.getrecenttracks',
        'user': username,
        'api_key': apiKey,
        'from': lastWeek,
        'to': now,
        'limit': limit,
        'format': 'json',
    }

    def fetchPage(pageNumber):
        """Return the ``recenttracks`` object of one result page."""
        reply = requests.get(url, params=dict(baseQuery, page=pageNumber), timeout=timeout)
        payload = reply.json()
        if 'recenttracks' not in payload:
            raise RuntimeError('Unexpected Last.fm response: {}'.format(payload))
        return payload['recenttracks']

    artistNames = []
    artistMbids = []
    trackNames = []
    trackMbids = []
    timestamps = []

    def collect(recentTracks):
        """Append every dated scrobble of one page to the result lists."""
        tracks = recentTracks.get('track', [])
        # NOTE(review): a page holding a single track appears to come back as a
        # bare object instead of a list - wrap it so iteration stays uniform.
        if isinstance(tracks, dict):
            tracks = [tracks]
        for scrobble in tracks:
            # Entries without 'date' are skipped (presumably "now playing" rows).
            if 'date' not in scrobble:
                continue
            artistNames.append(scrobble['artist']['#text'])
            artistMbids.append(scrobble['artist']['mbid'])
            trackNames.append(scrobble['name'])
            trackMbids.append(scrobble['mbid'])
            timestamps.append(scrobble['date']['uts'])

    # The first page doubles as the probe for the page count, so it is
    # downloaded only once.
    firstPage = fetchPage(1)
    pages = int(firstPage['@attr']['totalPages'])
    print('{} total pages to retrieve'.format(pages))
    collect(firstPage)

    for pageNumber in range(2, pages + 1):
        if pageNumber % 10 == 0:
            print(pageNumber)
        collect(fetchPage(pageNumber))

    return artistNames, artistMbids, trackNames, trackMbids, timestamps

In [None]:
# Consider listening tracked over the last 7 days
d = datetime.timedelta(days=7)
# datetime.utcnow() is deprecated and returns a naive value that then has to be
# tagged as UTC by hand; asking for an aware UTC "now" yields the same epoch
# seconds without the extra replace(tzinfo=...) step.
now = datetime.datetime.now(datetime.timezone.utc)
lastWeek = now - d

artistNames, artistMbids, trackNames, trackMbids, timestamps = getScrobbles(
    username,
    apiKey,
    int(lastWeek.timestamp()),
    int(now.timestamp()),
)

# One row per scrobble; the five lists returned above are index-aligned.
df = pd.DataFrame({
    'artist': artistNames,
    'artistMbid': artistMbids,
    'track': trackNames,
    'trackMbid': trackMbids,
    'timestamp': timestamps,
})
# 'timestamp' holds epoch seconds as strings; convert them for readable datetimes.
df['datetime'] = pd.to_datetime(df['timestamp'].astype(int), unit='s')

df.head(10)

In [None]:
# Sanity check: (number of rows, number of columns) of df
df.shape

In [None]:
# Peek at the last rows of df
df.tail()

In [None]:
def _fetchTopTagNames(params, timeout):
    """Run one Last.fm ``*.gettoptags`` query and return up to three tag names.

    Returns an empty list when the request fails or the response carries no
    usable ``toptags`` data, so the caller can fall through to its next option.
    """
    try:
        # params= lets requests URL-encode artist/track names ('&', '#', ...).
        reply = requests.get('https://ws.audioscrobbler.com/2.0/', params=params, timeout=timeout)
        tags = reply.json()['toptags']['tag']
        # NOTE(review): a lone tag appears to come back as a bare object rather
        # than a one-element list - wrap it so slicing works either way.
        if isinstance(tags, dict):
            tags = [tags]
        return [tag['name'] for tag in tags[:3]]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return []


def getTopGenreTags(apiKey, artist, track, mbid="", timeout=10):
    """Return the top 3 most frequent Last.fm listener tags for a song.

    The track's own tags are tried first (with ``mbid`` added to the query when
    one is given). If that yields nothing - no tags or a failed lookup - the
    artist's top tags are used instead.

    Parameters
    ----------
    apiKey : str
        Last.fm API key.
    artist, track : str
        Artist and track names as scrobbled.
    mbid : str, default ""
        Optional track MBID; sent only when non-empty.
    timeout : float, default 10
        Seconds to wait for each HTTP response.

    Returns
    -------
    list of str or None
        Up to three tag names, or ``None`` when neither lookup produced a tag.
    """
    common = {'api_key': apiKey, 'autocorrect': 1, 'format': 'json'}

    trackQuery = dict(common, method='track.gettoptags', artist=artist, track=track)
    if mbid:
        trackQuery['mbid'] = mbid
    top3 = _fetchTopTagNames(trackQuery, timeout)

    if not top3:
        # Fall back to the artist's tags. The result is returned on every
        # path: previously, tags found after a failed track lookup were dropped.
        artistQuery = dict(common, method='artist.gettoptags', artist=artist)
        top3 = _fetchTopTagNames(artistQuery, timeout)

    if not top3:
        print("{} not found ".format(track))
        return None
    return top3
    
    
#getTopGenreTags(apiKey, "Passion Pit", "Sleepyhead", "02950702-7ae1-3dd1-8bf2-2e17c2721a34")
    
    
    

In [None]:
# Format the dataframe of songs with their top genre tags.
# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
genreColumns = ["genre1", "genre2", "genre3"]
genreCache = {}  # (artist, track, mbid) -> tag list or None, so repeated scrobbles cost one lookup
genreRows = []
for artist, track, trackMbid in zip(df["artist"], df["track"], df["trackMbid"]):
    cacheKey = (artist, track, trackMbid)
    if cacheKey not in genreCache:
        genreCache[cacheKey] = getTopGenreTags(apiKey, artist, track, trackMbid)
    topGenres = genreCache[cacheKey]
    if topGenres is None:
        # No tags for the track or its artist: leave the scrobble out.
        continue
    insert = {"artist": artist, "track": track}
    for position, genre in enumerate(topGenres, start=1):
        insert["genre" + str(position)] = genre
    genreRows.append(insert)

# Explicit columns guarantee genre1..genre3 exist (NaN-filled) even when no
# song has three tags, or when nothing was tagged at all.
genreDf = pd.DataFrame(genreRows, columns=["artist", "track"] + genreColumns)

genreDf.head()
    

In [None]:
# Show the last 10 rows of genreDf
genreDf.tail(10)

## Get Spotify Data on Songs

In [None]:
# Spotify credentials are read from the environment.
clientId = os.environ.get('spotifyClientId')
clientSecret = os.environ.get('spotifyClientSecret')

# App-only client (client-credentials flow); it remains the active `sp` when no
# user token is obtained below.
clientCredentialsManager = SpotifyClientCredentials(client_id=clientId, client_secret=clientSecret)
sp = spotipy.Spotify(client_credentials_manager=clientCredentialsManager) #spotify object to access API

spotifyUsername = os.environ.get("spotifyUsername")
#scope = 'user-top-read user-read-recently-played'
SPOTIPY_REDIRECT_URI = os.environ.get('redirect_uri')
token = util.prompt_for_user_token(spotifyUsername, clientId, clientSecret, redirect_uri=SPOTIPY_REDIRECT_URI)
if token:
    # Replace the app-only client with one authorised by the user token.
    sp = spotipy.Spotify(auth=token)
    print("Success")
else:
    # Report the Spotify account that failed, not the Last.fm `username`.
    print("Can't get token for", spotifyUsername)

In [None]:
def getSpotifyUris(df):
    """Search Spotify for each (artist, track) row and collect the track IDs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``artist`` and ``track`` columns.

    Returns
    -------
    list of tuple
        ``(trackId, artist, track)`` for every row whose search returned a
        hit, in row order. Rows without a hit, or whose lookup raised, are
        left out; a failed lookup is reported on stdout instead of vanishing
        silently.
    """
    savedUris = []
    idCache = {}  # (artist, track) -> track id or None; repeated scrobbles are searched once
    for artist, track in zip(df['artist'].values, df['track'].values):
        cacheKey = (artist, track)
        if cacheKey not in idCache:
            trackId = None
            try:
                # NOTE(review): the space after "track:" is kept as-is; confirm
                # against Spotify's field-filter syntax before changing it.
                q = 'artist:{} track: {}'.format(artist, track)
                results = sp.search(q=q, limit=1, type='track')
                items = results['tracks']['items']
                if items:
                    # URIs look like "spotify:track:<id>"; keep the id part.
                    trackId = items[0]['uri'].split(":")[2]
            except Exception as err:
                # Best-effort lookup: note the failure and move on. Unlike a
                # bare except, this lets KeyboardInterrupt stop the loop.
                print("Skipping {} - {}: {}".format(artist, track, err))
            idCache[cacheKey] = trackId
        if idCache[cacheKey] is not None:
            savedUris.append((idCache[cacheKey], artist, track))
    return savedUris
# Despite its name, `uri` is a list of (track id, artist, track) tuples as returned by getSpotifyUris
uri = getSpotifyUris(df)

In [None]:
def getAudioFeatures(trackId):
    """Return whatever ``sp.audio_features`` yields for ``trackId``."""
    features = sp.audio_features(trackId)
    return features
def getSearchItem(item, ofType):
    """Return the first Spotify search hit for ``item``, or None if there is none.

    ``ofType`` should be 'artist', 'album', 'track' or 'playlist'; it is used
    both as the query's field prefix and as the search type.
    """
    query = ':'.join((ofType, item))
    matches = sp.search(q=query, type=ofType)[ofType + "s"]['items']
    if len(matches) == 0:
        return None
    return matches[0]

In [None]:
# Collect Spotify audio features and basic track metadata.
# Rows are gathered in lists and each frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
audioRows = []
trackInfoRows = []
# dict.fromkeys drops repeated (id, artist, track) tuples while keeping
# first-seen order, so a song played several times is fetched only once.
for trackId, artistName, trackName in dict.fromkeys(uri):
    audioFeatures = getAudioFeatures(trackId)
    # NOTE(review): the lookup seems to yield [None] for tracks without audio
    # analysis - skip those instead of crashing on None[...].
    if not audioFeatures or audioFeatures[0] is None:
        print("No audio features for {} - {}".format(artistName, trackName))
        continue
    # Copy the feature dict with the merge keys added rather than mutating it.
    audioRows.append(dict(audioFeatures[0], artist=artistName, track=trackName))

    # Use the already-authenticated client instead of a hand-built request;
    # this also works when no user token is available.
    response = sp.track(audioFeatures[0]["id"])
    trackInfoRows.append({
        "artist": artistName,
        "track": trackName,
        "popularity": response["popularity"],
        "release_date": response["album"]["release_date"],
        "explicit": response["explicit"],
    })

spotifyAudioDf = pd.DataFrame(audioRows)
spotifyTrackInfoDf = pd.DataFrame(trackInfoRows)

In [None]:
# Merge dataframes from Spotify and LastFM.
# Each Spotify frame is reduced to one row per (artist, track) first. Without
# that, a song played k times appears k times on both sides of each join and
# the two inner merges inflate it to k**3 rows. After de-duplication the result
# keeps one row per genreDf row that has Spotify data.
mergeKeys = ["artist", "track"]
fullDf = (
    genreDf
    .merge(spotifyTrackInfoDf.drop_duplicates(subset=mergeKeys), on=mergeKeys, how="inner")
    .merge(spotifyAudioDf.drop_duplicates(subset=mergeKeys), on=mergeKeys, how="inner")
)
fullDf.head()

In [None]:
# Narrow the merged frame to the columns worth keeping, in export order.
usefulColumns = [
    # identity and Last.fm genre tags
    'artist', 'track', 'genre1', 'genre2', 'genre3',
    # Spotify track metadata
    'explicit', 'popularity', 'release_date',
    # Spotify audio features
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    # Spotify identifiers and bookkeeping fields
    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
    'time_signature',
]
fullDf = fullDf.loc[:, usefulColumns]
fullDf.head()

In [None]:
# Save the merged listening data as a CSV file in the current working directory
# (with to_csv's defaults the row index is written as the first column)
fullDf.to_csv("myListeningData.csv")