In [1]:
import pandas as pd
import numpy as np

import json
import requests
from IPython.display import clear_output

import psycopg2 as pg
import os

# Open the shared connection to the local "spotify_db" Postgres database.
# The password is taken from the SPOTIFY_DB_PASSWORD environment variable so the
# credential is not stored in the notebook. If the variable is unset, None is
# passed; psycopg2 is expected to omit None-valued parameters so that libpq's
# own password sources (PGPASSWORD, ~/.pgpass) apply -- confirm for your setup.
conn = pg.connect(database="spotify_db",
                  user="postgres",
                  password=os.environ.get("SPOTIFY_DB_PASSWORD"))

def run_query(q):
    """Run a SQL query on the module-level ``conn`` and return its rows.

    Parameters
    ----------
    q : str
        SQL text to execute.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when the query fails. Errors are
        printed instead of raised, the same way ``run_command`` reports them.
    """
    with conn:
        try:
            # pd.read_sql executes the query itself, so no separate
            # cursor.execute(q) is needed (that would run the query twice)
            return pd.read_sql(q, conn)

        except (Exception, pg.DatabaseError) as error:
            print(error)
            return None

def run_command(c):
    """Execute a SQL statement on the module-level ``conn`` and commit it.

    Parameters
    ----------
    c : str
        SQL text to execute; no rows are fetched.

    Returns
    -------
    None
        Errors are printed instead of raised, and the failed transaction
        is rolled back.
    """
    with conn:
        try:
            # the cursor context manager closes the cursor even if execute() raises
            with conn.cursor() as cur:
                cur.execute(c)
            conn.commit()

        except (Exception, pg.DatabaseError) as error:
            # discard the failed transaction so the connection is clean for the next call
            conn.rollback()
            print(error)

In [2]:
# locations of the datasets created earlier
metadata_csv = '../data/all_tracks_cleaned_V2.csv'
features_csv = '../data/audio_features_final.csv'

# the first CSV column is used as the index of each frame
track_metadata = pd.read_csv(metadata_csv, index_col=0)
spotify_features = pd.read_csv(features_csv, index_col=0)

In [3]:
from ast import literal_eval
# reading in file of librosa features: each non-empty line is parsed as one
# Python literal (presumably a dict per track), optionally followed by a comma
with open('../data/results.txt') as f:
    features = [
        literal_eval(record)
        # strip surrounding whitespace first so a trailing ", " does not leave
        # the comma behind (which would make literal_eval return a 1-tuple)
        for record in (line.strip().rstrip(",").rstrip() for line in f)
        # skip blank lines, which literal_eval rejects with a SyntaxError
        if record
    ]

In [4]:
# column order follows the keys of the first parsed record
feature_columns = list(features[0])
librosa_features = pd.DataFrame(features, columns=feature_columns)
librosa_features.head()

Unnamed: 0,track_id,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,...,D,D#,E,F,F#,G,G#,A,A#,B
0,000u1dTg7y1XCDXi80hbBX,2198.480202,2489.345784,4768.56876,0.082164,-136.455631,92.286567,4.411706,29.978968,5.862119,...,0.761796,0.583145,0.413551,0.281556,0.245734,0.229757,0.271889,0.33018,0.243558,0.315892
1,003vvx7Niy0yvhvHt4a68B,2324.056734,2369.794157,4786.518665,0.105727,-65.774035,90.26496,-16.848465,32.145438,1.468458,...,0.294782,0.289501,0.217538,0.532355,0.228859,0.202796,0.405416,0.195824,0.233053,0.165492
2,005DVZaX9PuqQ5DHyr2BH3,2462.341196,2271.303356,4765.60356,0.137565,-20.20863,83.652947,-39.032359,-1.407777,-18.29767,...,0.249533,0.554072,0.28743,0.294211,0.190855,0.398437,0.471,0.228423,0.296749,0.238494
3,007zAbR9EMRpOpBzbRcemz,1562.858037,1967.284586,3081.561739,0.062572,-211.867577,121.315694,4.162437,33.818357,5.821059,...,0.363498,0.34878,0.382567,0.449853,0.352761,0.284755,0.228843,0.235576,0.321313,0.30831
4,008ELurRZpm5bv4T6J6vvR,1726.656428,1814.695628,3587.942825,0.086967,-81.891582,126.597123,-35.13589,40.226285,-14.586352,...,0.494739,0.33355,0.294227,0.370115,0.330995,0.416454,0.298092,0.394247,0.404228,0.301494


In [5]:
# row counts of the three datasets, reported on one line
row_counts = f"Metadata: {len(track_metadata)}; Spotify Features: {len(spotify_features)}; Librosa Features: {len(librosa_features)}"
print(row_counts)

Metadata: 230456; Spotify Features: 200006; Librosa Features: 17876


Still need to bring in librosa features that are already stored.

In [6]:
# select every column of every row in the librosa_features table
q = '''SELECT * FROM librosa_features'''

# run_query returns a DataFrame, or None (after printing the error) if the query failed
stored = run_query(q)
# preview the first rows that came back
stored.head()

Unnamed: 0,track_id,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,...,D,D#,E,F,F#,G,G#,A,A#,B
0,6YDf6QV7QfCEo8O2dbWalK,1756.4558,2046.3342,3311.4383,0.0825,-6.8987,128.4723,-26.9279,12.1849,20.8006,...,0.4522,0.3723,0.5536,0.4106,0.463,0.4977,0.4397,0.5498,0.3668,0.3736
1,0qaWxVVd3KrP4WY27KWpIe,1911.7354,2148.976,4108.5335,0.0858,-43.1119,113.6926,-16.0353,21.3349,-7.6568,...,0.4439,0.2756,0.3617,0.2811,0.4238,0.5576,0.3802,0.5572,0.3689,0.4886
2,1yZMv2GMAibgLGsQiowZrt,1502.5131,1782.0399,2969.307,0.0695,-73.8604,138.2465,-25.7173,24.4074,-0.4938,...,0.2634,0.1774,0.4107,0.5217,0.2529,0.2749,0.2159,0.4378,0.1899,0.1266
3,2eg2gvPXuwZ9FyrPaLgrXi,2430.0319,2420.7924,4922.0702,0.1331,-8.5929,91.2195,-15.3536,17.1535,-0.0333,...,0.4748,0.299,0.4744,0.3313,0.2653,0.3464,0.2482,0.3924,0.2739,0.418
4,0wzruvvN7f5wu39aFcjTMw,2729.2951,2424.4074,5364.5676,0.1561,10.4661,73.9084,-32.9083,6.7778,1.7658,...,0.2522,0.2841,0.509,0.48,0.6721,0.5047,0.5559,0.3559,0.2284,0.2586


Now I'll combine the dataframes.

In [7]:
# stack the rows already stored in the database on top of the newly parsed ones.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
combined_librosa = pd.concat([stored, librosa_features], ignore_index=True)
len(combined_librosa)

46721

About 153,000 rows of librosa features are still missing; I need to look into this.

In [8]:
# ids of every track that already has librosa features
librosa_ids = combined_librosa['track_id'].to_list()

# ids that have spotify features but no librosa features yet
processed_mask = spotify_features['id'].isin(librosa_ids)
no_librosa = spotify_features.loc[~processed_mask, 'id'].to_list()

# metadata rows of the songs that did not go through the librosa pipeline
unprocessed_mask = track_metadata['track_id'].isin(no_librosa)
track_metadata.loc[unprocessed_mask]

Unnamed: 0,track_id,track_name,artist,artist_id,track_album_album_type,track_album_id,track_album_name,track_duration_ms,track_popularity,track_preview_url,subgenres,genre_1,genre_2,genre_3
30442,3tjFYV6RSFtuktYl3ZtYcq,Mood (feat. Iann Dior),24kGoldn,6fWVd57NKTalqvmjRd2t8Z,single,4YMnOf4a7obOcN1Gy2QEuM,Mood (feat. Iann Dior),140525.0,100.0,https://p.scdn.co/mp3-preview/45cb08fdb67744ab...,['cali rap'],cali rap,,
30443,27u7t9d7ZQoyjsCROHuZJ3,Tick Tock (feat. 24kGoldn),Clean Bandit,6MDME20pz9RveH9rEXvrOM,single,3tuAs968COA2vxKjiLvmxr,Tick Tock (feat. 24kGoldn),178373.0,86.0,https://p.scdn.co/mp3-preview/3d3f99ef7d4bea72...,"['dance pop', 'edm', 'pop', 'post-teen pop', '...",dance pop,edm,pop
30444,6piAUJJQFD8oHDUr0b7l7q,VALENTINO,24kGoldn,6fWVd57NKTalqvmjRd2t8Z,album,2eLpj5EDUhyAoTks8sxcKR,DROPPED OUTTA COLLEGE,179133.0,80.0,https://p.scdn.co/mp3-preview/6347fe20d7716220...,['cali rap'],cali rap,,
30445,68UW3plyDDNg1dkNIZRezJ,Tinted Eyes (feat. blackbear & 24kGoldn),DVBBS,5X4LWwbUFNzPkEas04uU82,single,63qrRZqzJKieNXIbN2Uvkx,Tinted Eyes (feat. blackbear & 24kGoldn),175081.0,73.0,https://p.scdn.co/mp3-preview/d198eb9b7ca39a5a...,"['big room', 'canadian electronic', 'dance pop...",big room,canadian electronic,dance pop
30446,660BgHpKo1jhR9MMSFn7CF,VALENTINO - Imanbek Remix,24kGoldn,6fWVd57NKTalqvmjRd2t8Z,single,2v44hAIsegAFzILiv6ghx8,VALENTINO (Imanbek Remix),177975.0,70.0,https://p.scdn.co/mp3-preview/9de5e13ff243c0e0...,['cali rap'],cali rap,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230451,6jVVnTIC1zHFFdn2rBOE2g,This Must Be the Place (Naive Melody) - Live,David Byrne,20vuBdFblWUo2FCOvUzusB,album,1F4G3aGUoFejTCLPQ4tTgh,"Live From Austin, TX",350586.0,42.0,https://p.scdn.co/mp3-preview/98d44f14d88998fb...,"['art pop', 'art punk', 'art rock', 'dance roc...",art pop,art punk,art rock
230452,3xJ8siSD7r5mDqLNtQ5CPf,A Soft Seduction,David Byrne,20vuBdFblWUo2FCOvUzusB,album,4tH7X5vlPeZclx8KNPNfE2,Feelings,181066.0,41.0,https://p.scdn.co/mp3-preview/b8c46ccfdce6b555...,"['art pop', 'art punk', 'art rock', 'dance roc...",art pop,art punk,art rock
230453,6Mc0SvjQ2By4ZJWdF77M2C,Dance on Vaseline,David Byrne,20vuBdFblWUo2FCOvUzusB,album,4tH7X5vlPeZclx8KNPNfE2,Feelings,308026.0,40.0,https://p.scdn.co/mp3-preview/879cb1d23da841d8...,"['art pop', 'art punk', 'art rock', 'dance roc...",art pop,art punk,art rock
230454,392cd1euczCOcZAs0SvvZS,The Last Emperor (Main Title Theme),David Byrne,20vuBdFblWUo2FCOvUzusB,compilation,70iCI16xKlhPlJJT7OIKGH,The Last Emperor Original Soundtrack,241106.0,38.0,,"['art pop', 'art punk', 'art rock', 'dance roc...",art pop,art punk,art rock


It looks like not all song previews were downloaded when I ran the `get_mp3()` function. The songs that did not go through the pipeline do have preview links, so the problem is not that 153,000 songs lack a preview; it's just that the function didn't finish running.

In [67]:
# metadata rows of the tracks that still need audio; build a fresh, re-indexed
# frame rather than resetting the index of a filtered slice in place
subset = track_metadata[track_metadata['track_id'].isin(no_librosa)].reset_index(drop=True)

# a dict mapping track_id -> (artist, track_name, track_preview_url) to make it
# easier to get the mp3 files; zipping the columns avoids three .loc lookups per row
songs = dict(zip(subset['track_id'],
                 zip(subset['artist'],
                     subset['track_name'],
                     subset['track_preview_url'])))

len(songs)

182127

In [68]:
def get_missing_url(track_id, artist, song):
    '''Best-effort fallback that fetches a preview from the iTunes Search API
    when Spotify doesn't provide one.

    Searches iTunes for "<artist> <song>", takes the ``previewUrl`` of the
    first result and writes the downloaded bytes to
    ``../data/audio_files_2/track_<track_id>.wav``.

    Parameters
    ----------
    track_id : str
        Track id, used only to name the output file.
    artist, song : str
        Search terms. Non-string values (e.g. NaN from pandas) are skipped.

    Returns
    -------
    None
        Nothing is returned. If the lookup, download or write fails, no
        error is raised and the track is simply left without a file.
    '''
    if not isinstance(artist, str) or not isinstance(song, str):
        return

    try:
        # let requests URL-encode the search term so names containing "&", "#"
        # or "?" are not misread as extra query parameters
        r = requests.get("https://itunes.apple.com/search",
                         params={"term": f"{artist} {song}", "limit": 1},
                         timeout=30)
        r.raise_for_status()
        content = json.loads(r.text)
        preview = content['results'][0]["previewUrl"]

        # the timeout keeps one stalled download from hanging a long batch
        doc = requests.get(preview, timeout=30)
        doc.raise_for_status()
        # NOTE(review): the bytes are written exactly as received under a ".wav"
        # name -- confirm the downstream loader copes with the actual audio format
        with open(f"../data/audio_files_2/track_{track_id}.wav", "wb") as f:
            f.write(doc.content)

    except (requests.RequestException, KeyError, IndexError,
            TypeError, ValueError, OSError):
        # deliberately best-effort: network errors, empty search results,
        # malformed JSON and write failures all just skip this track. Unlike a
        # bare "except:", this still lets KeyboardInterrupt stop the batch.
        pass

    
def get_mp3(track_id, properties):
    '''Download the preview of one track into "../data/audio_files_2".

    Parameters
    ----------
    track_id : str
        Track id, used to name the output file ``track_<track_id>.wav``.
    properties : tuple
        ``(artist, track_name, preview_url)``. When ``preview_url`` is not a
        non-empty string, ``get_missing_url`` is tried with the artist and
        track name instead.

    Returns
    -------
    None
        Best-effort: a failed download is skipped without raising.
    '''
    preview_url = properties[2]

    # a preview missing from the CSV arrives as NaN, which is truthy, so check
    # for a real string instead of relying on truthiness
    if isinstance(preview_url, str) and preview_url:
        try:
            doc = requests.get(preview_url, timeout=30)
            # do not save an HTTP error page as if it were audio
            doc.raise_for_status()
            with open(f'../data/audio_files_2/track_{track_id}.wav', 'wb') as f:
                f.write(doc.content)
        except (requests.RequestException, OSError):
            # skip this track; KeyboardInterrupt is intentionally not caught
            pass

    else:
        try:
            get_missing_url(track_id, properties[0], properties[1])
        except Exception:
            # the fallback is best-effort as well
            pass

In [79]:
# the first five track ids, in the dict's insertion order
sample_ids = [*songs][:5]

# the same five entries as a small dict for inspection
sample = {track_id: songs[track_id] for track_id in sample_ids}
sample

{'3tjFYV6RSFtuktYl3ZtYcq': ('24kGoldn',
  'Mood (feat. Iann Dior)',
  'https://p.scdn.co/mp3-preview/45cb08fdb67744ab7f1f172bb750e9c10415c37a?cid=b05f690e17ba4b758d92a214cfbc7f24'),
 '27u7t9d7ZQoyjsCROHuZJ3': ('Clean Bandit',
  'Tick Tock (feat. 24kGoldn)',
  'https://p.scdn.co/mp3-preview/3d3f99ef7d4bea725a9bebc740cadf56e4f548bb?cid=b05f690e17ba4b758d92a214cfbc7f24'),
 '6piAUJJQFD8oHDUr0b7l7q': ('24kGoldn',
  'VALENTINO',
  'https://p.scdn.co/mp3-preview/6347fe20d7716220bbe67c2c10ff091696719ca2?cid=b05f690e17ba4b758d92a214cfbc7f24'),
 '68UW3plyDDNg1dkNIZRezJ': ('DVBBS',
  'Tinted Eyes (feat. blackbear & 24kGoldn)',
  'https://p.scdn.co/mp3-preview/d198eb9b7ca39a5a5589d8238ad416de7dcee032?cid=b05f690e17ba4b758d92a214cfbc7f24'),
 '660BgHpKo1jhR9MMSFn7CF': ('24kGoldn',
  'VALENTINO - Imanbek Remix',
  'https://p.scdn.co/mp3-preview/9de5e13ff243c0e0fdbe52503a1250fcf6832de6?cid=b05f690e17ba4b758d92a214cfbc7f24')}

In [None]:
l = len(songs)

# download every preview, replacing the previous progress line each time
for i, track_id in enumerate(songs):
    properties = songs[track_id]
    get_mp3(track_id, properties)
    clear_output(wait=True)
    print(f"{i+1}/{l}")

30794/182127


### Cleaning the data sets
Next I'll decide which columns to keep, which in turn determines how the tables in the database are designed.

#### Librosa Dataset
The librosa dataset doesn't need to be cleaned because I generated it myself, using a pipeline that generates the features I wanted. I only need to check for duplicates.

In [10]:
# checking for duplicates in librosa dataset
combined_librosa.duplicated().value_counts()

False    46721
dtype: int64

#### Metadata
The metadata dataset contains a lot of extra information that I won't necessarily use in the app. I'll save the original dataset as a csv just in case, but I'll only keep the necessary information in the database.

In [12]:
# identifying and genre columns to keep from the metadata
metadata_cols = [
    'track_id', 'track_name', 'artist', 'artist_id',
    'genre_1', 'genre_2', 'genre_3',
]

track_metadata.loc[:, metadata_cols]

Unnamed: 0,track_id,track_name,artist,artist_id,genre_1,genre_2,genre_3
0,6YDf6QV7QfCEo8O2dbWalK,Real Love Baby,Father John Misty,2kGBy2WHvF0VdZyqiVCkDT,art pop,chamber pop,freak folk
1,0qaWxVVd3KrP4WY27KWpIe,In a River,Rostam,04XggbrM51GcFPTxBYtRXT,art pop,chamber pop,indie pop
2,1yZMv2GMAibgLGsQiowZrt,Nancy From Now On,Father John Misty,2kGBy2WHvF0VdZyqiVCkDT,art pop,chamber pop,freak folk
3,2eg2gvPXuwZ9FyrPaLgrXi,Chateau Lobby #4 (in C for Two Virgins),Father John Misty,2kGBy2WHvF0VdZyqiVCkDT,art pop,chamber pop,freak folk
4,0wzruvvN7f5wu39aFcjTMw,Hollywood Forever Cemetery Sings,Father John Misty,2kGBy2WHvF0VdZyqiVCkDT,art pop,chamber pop,freak folk
...,...,...,...,...,...,...,...
230451,6jVVnTIC1zHFFdn2rBOE2g,This Must Be the Place (Naive Melody) - Live,David Byrne,20vuBdFblWUo2FCOvUzusB,art pop,art punk,art rock
230452,3xJ8siSD7r5mDqLNtQ5CPf,A Soft Seduction,David Byrne,20vuBdFblWUo2FCOvUzusB,art pop,art punk,art rock
230453,6Mc0SvjQ2By4ZJWdF77M2C,Dance on Vaseline,David Byrne,20vuBdFblWUo2FCOvUzusB,art pop,art punk,art rock
230454,392cd1euczCOcZAs0SvvZS,The Last Emperor (Main Title Theme),David Byrne,20vuBdFblWUo2FCOvUzusB,art pop,art punk,art rock


#### Spotify Features

In [13]:
# track id plus the Spotify audio-feature columns to keep
cols = [
    'id',
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

spotify_features.loc[:, cols]

Unnamed: 0,id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0Tbgn7Ocm54WoqJUsIlfp4,0.644,0.912,-6.072,0.0384,0.00301,0.657000,0.0782,0.222,126.011
1,1XibR7VrDllfBz6yuTndmb,0.696,0.623,-7.600,0.1260,0.01750,0.000091,0.1430,0.573,173.951
2,3tjFYV6RSFtuktYl3ZtYcq,0.700,0.722,-3.558,0.0369,0.22100,0.000000,0.2720,0.756,90.989
3,27u7t9d7ZQoyjsCROHuZJ3,0.779,0.705,-3.895,0.0344,0.36900,0.000008,0.1240,0.946,101.022
4,6piAUJJQFD8oHDUr0b7l7q,0.747,0.717,-4.841,0.1790,0.19900,0.000000,0.1320,0.523,150.965
...,...,...,...,...,...,...,...,...,...,...
200001,6jVVnTIC1zHFFdn2rBOE2g,0.709,0.490,-13.565,0.1520,0.43100,0.000013,0.9460,0.745,117.158
200002,3xJ8siSD7r5mDqLNtQ5CPf,0.523,0.215,-15.376,0.0274,0.87000,0.003780,0.1060,0.346,99.057
200003,6Mc0SvjQ2By4ZJWdF77M2C,0.727,0.840,-9.915,0.0650,0.01040,0.038900,0.1000,0.460,111.013
200004,392cd1euczCOcZAs0SvvZS,0.400,0.208,-16.351,0.0311,0.74400,0.775000,0.2670,0.551,180.243
