In [40]:
import pandas as pd
import numpy as np

import shutil
import os
import time
from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm_notebook

import spotipy
import spotipy.util as util

from requests.exceptions import ConnectionError 
from requests.exceptions import HTTPError
from requests.exceptions import SSLError
from urllib3.exceptions import MaxRetryError



In [41]:
# Directories: both locations sit under the same project folder.

_project_root = "/Users/chrisolen/Documents/uchicago_courses/deep_learning_and_image_recognition/audio_generation/"
data_dir = _project_root + "data/"  # inputs read by this notebook (song list, midi_files/)
scripts_dir = _project_root + "NADE_music_generation/"  # holds data_cleaning_scripts/, where outputs are written

# MIDI File Local Organization
Download LMD-matched from: https://colinraffel.com/projects/lmd/

In [53]:
# Disabled one-off step: everything below is wrapped in a string literal, so
# running this cell moves no files. The quoted code walks four directory levels
# under lmd_matched/ and moves each fourth-level entry into midi_files/,
# skipping ".DS_Store" entries at every level.
'''

# Moving into a single directory

# Location to move all midi files:

destination = data_dir+"midi_files/"

# Iterating through original file system:

source = data_dir+"lmd_matched/"
level_1 = os.listdir(source)
for i in range(len(level_1)):
    if level_1[i] != ".DS_Store":
        level_2 = os.listdir(source+level_1[i]+"/")
        for j in range(len(level_2)):
            if level_2[j] != ".DS_Store":
                level_3 = os.listdir(source+level_1[i]+"/"+level_2[j]+"/")
                for k in range(len(level_3)):
                    if level_3[k] != ".DS_Store":
                        level_4 = os.listdir(source+level_1[i]+"/"+level_2[j]+"/"+level_3[k]+"/")
                        for f in range(len(level_4)):
                            if level_4[f] != ".DS_Store":
                                shutil.move(source+level_1[i]+"/"+level_2[j]+"/"+level_3[k]+"/"+level_4[f],destination)
                
                
'''        


# Song and Artist Reference Name Text File
Download "all track Echo Nest ID" here: http://millionsongdataset.com/pages/getting-dataset/

In [42]:
# Read in textfile of 'Million Song Dataset' names:

# Load every line of the song list. The context manager closes the file
# handle as soon as the lines are in memory (it was previously left open).
with open(data_dir+"songlist_with_mbid.txt", 'r') as file:
    lines = np.array(file.readlines())

In [43]:
# Strip the newline marker from every raw line:

cleaned_lines = [raw_line.replace("\n","") for raw_line in lines]

# Break each line into its fields on the "<SEP>" delimiter:

split_lines = [entry.split("<SEP>") for entry in cleaned_lines]

In [44]:
# Build the dataframe and label its first four positional columns in one chain:

column_names = {0:"track_id",1:"mb_id",2:"artist",3:"song_title"}
tracks = pd.DataFrame.from_records(split_lines).rename(columns=column_names)


In [45]:
# Handling 'Feat.' / 'feat.' in the artists field:
# NOTE(review): when a marker is present, only the text AFTER it is kept
# (i.e. the featured artist), and the text before it is discarded. Confirm
# this is the intended direction.

def _text_after_marker(name, marker):
    """Return what follows ``marker`` plus one more character, or ``name`` unchanged if the marker is absent."""
    if marker not in name:
        return name
    # Skip the 5-character marker and the single character after it (6 in total).
    return name[name.find(marker) + 6:]

cleaned_artist = [_text_after_marker(artist, 'Feat.') for artist in tracks['artist']]

cleaned_artist_2 = [_text_after_marker(artist, 'feat.') for artist in cleaned_artist]






In [46]:
# Overwrite the artist column with the cleaned names built in the previous cell.
tracks['artist'] = cleaned_artist_2

In [47]:
tracks.head()

Unnamed: 0,track_id,mb_id,artist,song_title
0,TROXMWJ12903CF4396,f275d9a5-3271-436b-8c32-806ec275e23c,Arnold Jarvis,Life Goes On
1,TRDDLOD128F92FF922,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,A New Name
2,TRGCKLJ128F92FF90E,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,All My Heroes Are Weirdos
3,TRRKQQU128F92FF947,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,Bend Over Beethoven
4,TRZRBYS128F92FF94F,f26c72d3-e52c-467b-b651-679c73d8e1a7,!!!,Break In Case Of Anything


In [48]:
tracks.shape

(1000000, 4)

# Retaining Only Records for Which MIDI Files Exist

In [49]:
# Names of the entries in midi_files/ are matched against track_id below.
unique_tracks = os.listdir(data_dir+'midi_files')
# Keep only the songs that have a matching entry. .copy() makes the result an
# independent frame, so later in-place edits act on it unambiguously rather
# than on a slice of the full song list.
tracks = tracks[tracks['track_id'].isin(unique_tracks)].copy()
tracks.shape

(31032, 4)

In [50]:
tracks.head()

Unnamed: 0,track_id,mb_id,artist,song_title
277,TRBQUUS12903CB2580,7b28fb58-882b-4907-856e-ef327b624b11,+ / - {Plus/Minus},Flight Data Recorder
300,TRRXTJI12903D01048,6bd4eaa7-aa1c-4138-a392-41693229c7fc,+39,My Relax
463,TRMKKFV12903CB7EA4,,...arrived at ten,Flake
522,TRNVJXQ128F14AFD63,2fc659fa-293f-4288-bff5-29447f31f628,1. Futurologischer Congress,Atem
574,TRLIEDO12903CA41B4,f37c537b-3557-4031-bfd6-ab63ced32854,10 CC,The Things We Do For Love


# Spotify API Authentication and Calls

In [756]:
# Function to authenticate using credentials:

def auth_spotify(my_username, chosen_scope, my_client_id, my_client_secret, my_redirect_uri, requests_timeout=180):
    """Obtain a user token and return a Spotify client built with it.

    Parameters
    ----------
    my_username
        Passed to ``util.prompt_for_user_token`` as ``username``.
    chosen_scope
        Passed as ``scope``.
    my_client_id, my_client_secret, my_redirect_uri
        Passed as ``client_id``, ``client_secret`` and ``redirect_uri``.
    requests_timeout : optional
        Forwarded to ``spotipy.Spotify``. Defaults to 180, the value that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    The ``spotipy.Spotify`` instance created with the obtained token.
    """
    token = util.prompt_for_user_token(username=my_username,
                                       scope=chosen_scope,
                                       client_id=my_client_id,
                                       client_secret=my_client_secret,
                                       redirect_uri=my_redirect_uri)
    return spotipy.Spotify(auth=token, requests_timeout=requests_timeout)




In [757]:
# Define credentials:

# Application credentials are read from the environment when available, so
# real secrets never have to be typed into the notebook. The literals are
# only fallbacks used when the variable is unset. The SPOTIPY_* names are the
# environment variables that spotipy itself recognises.
my_username="chrisolen"
chosen_scope='user-library-read'
my_client_id=os.environ.get('SPOTIPY_CLIENT_ID', '###########')
my_client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', '###########')
my_redirect_uri=os.environ.get('SPOTIPY_REDIRECT_URI', 'https://grahamschool.uchicago.edu/academic-programs/masters-degrees/analytics')


In [758]:
# Authenticate once up front and keep the resulting client in `sp`:

sp = auth_spotify(my_username=my_username,
                  chosen_scope=chosen_scope,
                  my_client_id=my_client_id,
                  my_client_secret=my_client_secret,
                  my_redirect_uri=my_redirect_uri)

In [771]:
# Function for one round of hitting the API (for a given track):

def hit_spotify(track_name, artist_name, my_username, chosen_scope, my_client_id, my_client_secret, my_redirect_uri, max_retries=5):
    """Search Spotify for one track and collect its IDs, audio features and genres.

    A client is obtained through ``auth_spotify`` on every call. The query is
    ``track_name + " " + artist_name`` and only the first search hit is used.

    Parameters
    ----------
    track_name, artist_name
        Joined with a single space to form the track search query.
    my_username, chosen_scope, my_client_id, my_client_secret, my_redirect_uri
        Forwarded unchanged to ``auth_spotify``.
    max_retries : int or None, optional
        Number of failed API attempts (Spotify/network errors) that are
        retried before giving up on this track. ``None`` retries without
        limit. Authentication itself is always retried until it succeeds.

    Returns
    -------
    numpy.ndarray or None
        A 1-D object array ``[track_id, artist_id, time_signature, tempo,
        energy, loudness, speechiness, acousticness, instrumentalness,
        genres]`` where ``genres`` is the artist's genre list, or ``None``
        when the lookup yields no usable result or the retries are exhausted.
    """
    # Errors handled by sleeping one minute, re-authenticating and trying again.
    retryable = (spotipy.client.SpotifyException, ConnectionError, MaxRetryError,
                 TimeoutError, SSLError, HTTPError)

    def connect(failure_message):
        # Block until authentication succeeds. Re-authentication goes through
        # the same guarded loop, so an error raised while reconnecting no
        # longer escapes the function.
        while True:
            try:
                return auth_spotify(my_username,
                                    chosen_scope,
                                    my_client_id,
                                    my_client_secret,
                                    my_redirect_uri)
            except retryable:
                tqdm.write(failure_message)
                time.sleep(60)

    sp = connect("Going to sleep for 1 minute - Errored out on initial connect attempt")

    failed_attempts = 0
    while True:
        try:
            track_result = sp.search(q=track_name+" "+artist_name,type='track')
            top_hit = track_result['tracks']['items'][0]
            track_id = top_hit['id']
            artist_id = top_hit['artists'][0]['id']
            features = sp.audio_features(track_id)[0]
            fields = [track_id,
                      artist_id,
                      features['time_signature'],
                      features['tempo'],
                      features['energy'],
                      features['loudness'],
                      features['speechiness'],
                      features['acousticness'],
                      features['instrumentalness']]
            fields.append(sp.artists([artist_id])['artists'][0]['genres'])
        except (IndexError, TypeError):
            # No search hit, or a value that cannot be indexed/concatenated:
            # report "no result" explicitly. Returning here (instead of
            # falling back into the loop) also prevents spinning forever when
            # the error happens before the search result was obtained.
            return None
        except retryable:
            failed_attempts += 1
            if max_retries is not None and failed_attempts > max_retries:
                tqdm.write("Giving up on this track after %d failed attempts" % failed_attempts)
                return None
            tqdm.write("Going to sleep for 1 minute - Errored out on call")
            time.sleep(60)
            # Reauthenticate (e.g. when the token expired), then redo the
            # whole lookup. Previously a failure after a successful search
            # ended the loop and the track was silently reported as missing.
            sp = connect("Going to sleep for 1 minute - Errored out on reconnect attempt")
            continue

        # Fill an object array element by element: the last field is a list,
        # and building such a ragged array with np.array(...) raises
        # ValueError on recent NumPy versions.
        result = np.empty(len(fields), dtype=object)
        for position, value in enumerate(fields):
            result[position] = value
        return result
        
        

In [772]:
# API calls:

# First row to query on this run.
# NOTE(review): presumably rows before this index were written by an earlier,
# interrupted run. Confirm before re-running, since results are appended.
start_row = 26585
metadata_path = scripts_dir+"data_cleaning_scripts/track_metadata.txt"

for i in tqdm_notebook(range(start_row,len(tracks)), mininterval = 5.0, leave = False):
    # Reuse the credential variables defined earlier instead of re-typing the literals.
    result_array = hit_spotify(tracks.iloc[i]['song_title'], tracks.iloc[i]['artist'],
                               my_username=my_username,
                               chosen_scope=chosen_scope,
                               my_client_id=my_client_id,
                               my_client_secret=my_client_secret,
                               my_redirect_uri=my_redirect_uri)
    if result_array is None:
        # No result for this track: write a single 'nan' line so the file
        # still gets one line per track. (np.nan, because the np.NaN alias
        # was removed in NumPy 2.0.)
        row = np.array([np.nan])
    else:
        row = result_array.reshape(1, result_array.shape[0])
    # The explicit None check above replaces the former `except AttributeError`,
    # which could also hide unrelated errors raised while writing.
    # The file is opened and closed per track, so each row is on disk before
    # the next request starts.
    with open(metadata_path,"ab") as textfile:
        np.savetxt(textfile, row, fmt="%s", delimiter=' | ')
        
        
    

HBox(children=(IntProgress(value=0, max=4447), HTML(value='')))

retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs


# Cleaning Up Spotify Metadata

In [51]:
# Load the saved metadata lines. The context manager closes the file handle
# once they are read (it was previously left open).
with open(scripts_dir+"data_cleaning_scripts/track_metadata.txt", 'r') as textfile:
    track_features = np.array(textfile.readlines())

In [52]:
track_features

array(['5aXSB4ywBWcPVBuOqxJAsZ | 4j4NbPKY5EnsEUhdNICINb | 3 | 108.021 | 0.584 | -9.032 | 0.027 | 0.00184 | 0.155 | []\n',
       '2j7kfbjTPqpaYHide0hFxi | 5Cicr7PrzTNv4bXiZntlex | 4 | 119.997 | 0.553 | -7.488 | 0.216 | 0.00514 | 0.869 | []\n',
       '1T3zWGYGHIAX36IHYHL9Ar | 49myFlNbBmL4X25pekwLqR | 4 | 106.003 | 0.437 | -12.814 | 0.0261 | 0.913 | 0.881 | []\n',
       ...,
       "0wY6UGmdUSOs38DDKMfNfA | 7E3BRXV9ZbCt5lQTCXMTia | 4 | 135.456 | 0.0292 | -23.61 | 0.0346 | 0.967 | 0.968 | ['bow pop', 'chamber pop', 'compositional ambient', 'focus', 'icelandic classical', 'neo-classical', 'neoclassical']\n",
       "4HMHfABjrAez8PucpSM6Qe | 7E3BRXV9ZbCt5lQTCXMTia | 4 | 89.852 | 0.272 | -14.941 | 0.0281 | 0.92 | 0.836 | ['bow pop', 'chamber pop', 'compositional ambient', 'focus', 'icelandic classical', 'neo-classical', 'neoclassical']\n",
       "1i9YBowsDKdycECqeloT9I | 1yZbA9817FU14OeRIJ1Dig | 4 | 125.018 | 0.87 | -11.325 | 0.0425 | 0.059 | 0.538 | ['future ambient']\n"],
      dtype='<

In [53]:
# Drop the newline marker at the end of each stored line:

cleaned_features = [feature_line.replace("\n","") for feature_line in track_features]

# Separate the fields on the " | " delimiter:

split_features = [feature_line.split(" | ") for feature_line in cleaned_features]

In [54]:
# Pad the single-field 'nan' rows to the same ten fields as the others:

# Nine 'nan' strings followed by a one-element list, used for every padded row.
extended = ['nan'] * 9 + [['nan']]

# Positions of the rows whose first field is the 'nan' marker.
indexes = [position for position, fields in enumerate(split_features) if fields[0] == 'nan']

for position in indexes:
    split_features[position] = extended

In [55]:
# Load the parsed Spotify fields into a dataframe, one named column per field:

feature_columns = ['sp_track_id', 'sp_artist_id', 'time_sig', 'tempo', 'energy',
                   'loudness', 'speechiness', 'acousticness',
                   'instrumentalness', 'genre']
spotify_features = pd.DataFrame(split_features, columns=feature_columns)

In [56]:
spotify_features.head()

Unnamed: 0,sp_track_id,sp_artist_id,time_sig,tempo,energy,loudness,speechiness,acousticness,instrumentalness,genre
0,5aXSB4ywBWcPVBuOqxJAsZ,4j4NbPKY5EnsEUhdNICINb,3.0,108.021,0.584,-9.032,0.027,0.00184,0.155,[]
1,2j7kfbjTPqpaYHide0hFxi,5Cicr7PrzTNv4bXiZntlex,4.0,119.997,0.553,-7.488,0.216,0.00514,0.869,[]
2,1T3zWGYGHIAX36IHYHL9Ar,49myFlNbBmL4X25pekwLqR,4.0,106.003,0.437,-12.814,0.0261,0.913,0.881,[]
3,6bsV04oYvnEEh2tmMzbuE3,6MD8yUEX0V8XJR2Xrwz0HO,4.0,136.25,0.883,-7.47,0.0513,0.0366,0.00704,['neue deutsche welle']
4,,,,,,,,,,[nan]


# Merging the Two Frames

In [57]:
# Resetting the index of the track dataframe to match up with the features dataframe
# (drop=True discards the old index instead of keeping it as an 'index' column):

tracks = tracks.reset_index(drop=True)
tracks.head()

Unnamed: 0,track_id,mb_id,artist,song_title
0,TRBQUUS12903CB2580,7b28fb58-882b-4907-856e-ef327b624b11,+ / - {Plus/Minus},Flight Data Recorder
1,TRRXTJI12903D01048,6bd4eaa7-aa1c-4138-a392-41693229c7fc,+39,My Relax
2,TRMKKFV12903CB7EA4,,...arrived at ten,Flake
3,TRNVJXQ128F14AFD63,2fc659fa-293f-4288-bff5-29447f31f628,1. Futurologischer Congress,Atem
4,TRLIEDO12903CA41B4,f37c537b-3557-4031-bfd6-ab63ced32854,10 CC,The Things We Do For Love


In [58]:
# Merging two dataframes:
# Both indexes are passed as the join keys, so rows are paired by index value;
# the key ends up in the result as a 'key_0' column.
# NOTE(review): this presumably relies on track_metadata.txt holding exactly one
# line per row of `tracks`, in the same order. Confirm before trusting the join.

track_metadata = tracks.merge(spotify_features, left_on=tracks.index, right_on=spotify_features.index)

In [None]:
# Remove the helper join-key column. errors='ignore' keeps this cell safe to
# re-run after the column is already gone (the in-place drop raised KeyError then).
track_metadata = track_metadata.drop(columns=['key_0'], errors='ignore')

In [61]:
track_metadata.tail()

Unnamed: 0,track_id,mb_id,artist,song_title,sp_track_id,sp_artist_id,time_sig,tempo,energy,loudness,speechiness,acousticness,instrumentalness,genre
31027,TRJMGED128F1485763,1e0de31c-4957-4649-9aa3-7b0f1d9d2c84,Étienne Daho,Quelqu'un Qui M'ressemble (Démo),,,,,,,,,,[nan]
31028,TRXFHVQ128F1453822,1e0de31c-4957-4649-9aa3-7b0f1d9d2c84,Étienne Daho,Saudade (Live 2004),3fJymrFJ6BlODWK0QzuVlH,5aDKJuLDczfmHfRSOmHCJk,4.0,116.905,0.966,-2.431,0.0421,0.0112,0.000326,"['chanson', 'french indie pop', 'french indiet..."
31029,TRGSZLI128F4230F3A,6655955b-1c1e-4bcb-84e4-81bcd9efab30,Ólafur Arnalds,0040,0wY6UGmdUSOs38DDKMfNfA,7E3BRXV9ZbCt5lQTCXMTia,4.0,135.456,0.0292,-23.61,0.0346,0.967,0.968,"['bow pop', 'chamber pop', 'compositional ambi..."
31030,TRIAHBB128F930136B,6655955b-1c1e-4bcb-84e4-81bcd9efab30,Ólafur Arnalds,Þau hafa sloppið undan þunga myrkursins,4HMHfABjrAez8PucpSM6Qe,7E3BRXV9ZbCt5lQTCXMTia,4.0,89.852,0.272,-14.941,0.0281,0.92,0.836,"['bow pop', 'chamber pop', 'compositional ambi..."
31031,TRCFVJF128F42AAD7B,aab128f8-a013-46b3-97c7-73f0d18dd7c1,üNN,Three Pilots,1i9YBowsDKdycECqeloT9I,1yZbA9817FU14OeRIJ1Dig,4.0,125.018,0.87,-11.325,0.0425,0.059,0.538,['future ambient']


In [62]:
# Save the merged table as CSV next to the raw metadata text file
# (no `index` argument is passed, so pandas' default of writing the row index applies).
track_metadata.to_csv(scripts_dir+"data_cleaning_scripts/track_metadata.csv")