# Data Exploration

In [2]:
import pickle
from pathlib import Path
from typing import Optional

import duckdb
import numpy as np
import pandas as pd

In [5]:
# Pandas display settings: show every column and every row, cap cell text at 100 characters.
# NOTE(review): with max_rows unset, evaluating a large frame renders all of its rows;
# keep previews bounded with .head() / .limit().
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", None)

In [3]:
# Data directory, expressed relative to the notebook's working directory.
data_path = Path("..") / "data"
# Absolute, forward-slash form of the same directory, used when building file paths inside strings.
data_path_string = data_path.resolve().as_posix()

## ETL and EDA

In [4]:
# Open a DuckDB connection to the spotify.db database file located in the data directory.
con = duckdb.connect(database=f"{data_path}/spotify.db")

In [9]:
def clean_db(con: duckdb.DuckDBPyConnection):
    """Drop every table and view this notebook creates so the ETL cells can be re-run.

    Args:
        con: Open DuckDB connection to clean.

    Each statement uses ``if exists``, so calling this on an empty database is a no-op.
    """
    tables = (
        # Name actually used by load_data_into_db when it creates the lyrics table.
        "lyric_features",
        # Spelling this function used to drop; kept so databases holding a table
        # under that name are still cleaned.
        "lyrics_features",
        "low_level_audio_features",
        "albums",
        "artists",
        "tracks",
        "features",
    )
    for table in tables:
        con.execute(f"drop table if exists {table}")
    con.execute("drop view if exists lookup")

clean_db(con)

In [11]:
def load_data_into_db(con: duckdb.DuckDBPyConnection, data_path_string: str):
    """Create one database table per source CSV found under ``data_path_string``.

    Args:
        con: Open DuckDB connection that receives the tables.
        data_path_string: Directory containing the CSV files, as a string.

    NOTE(review): presumably fails if a target table already exists, so run
    ``clean_db`` first when re-executing — confirm against the DuckDB version in use.
    """
    # CSV file name -> table name, in load order.
    csv_to_table = {
        "lyrics_features.csv": "lyric_features",
        "low_level_audio_features.csv": "low_level_audio_features",
        "spotify_albums.csv": "albums",
        "spotify_artists.csv": "artists",
        "spotify_tracks.csv": "tracks",
    }
    for csv_name, table_name in csv_to_table.items():
        relation = con.read_csv(f"{data_path_string}/{csv_name}", all_varchar=False)
        relation.create(table_name)

load_data_into_db(con, data_path_string)

In [13]:
# Preview the first 3 rows of lyric_features as a pandas DataFrame.
# NOTE(review): the first displayed row is all -1 — presumably a sentinel for missing lyrics; verify.
con.table("lyric_features").limit(3).df()

Unnamed: 0,column0,mean_syllables_word,mean_words_sentence,n_sentences,n_words,sentence_similarity,track_id,vocabulary_wealth
0,0,-1.0,-1.0,-1,-1,-1.0,5KIfHjHI5NIsPHNt58qua0,-1.0
1,1,1.1,5.65,31,326,0.043011,13keyz9ikBe6ZpRasw7l4X,0.45
2,2,1.37,4.77,74,532,0.050352,1WugzepXsLjnsM0K4UaWYc,0.59


In [14]:
# Preview the first 3 rows of low_level_audio_features as a pandas DataFrame.
con.table("low_level_audio_features").limit(3).df()


Unnamed: 0,column000,Chroma_1,Chroma_10,Chroma_11,Chroma_12,Chroma_2,Chroma_3,Chroma_4,Chroma_5,Chroma_6,Chroma_7,Chroma_8,Chroma_9,MEL_1,MEL_10,MEL_100,MEL_101,MEL_102,MEL_103,MEL_104,MEL_105,MEL_106,MEL_107,MEL_108,MEL_109,MEL_11,MEL_110,MEL_111,MEL_112,MEL_113,MEL_114,MEL_115,MEL_116,MEL_117,MEL_118,MEL_119,MEL_12,MEL_120,MEL_121,MEL_122,MEL_123,MEL_124,MEL_125,MEL_126,MEL_127,MEL_128,MEL_13,MEL_14,MEL_15,MEL_16,MEL_17,MEL_18,MEL_19,MEL_2,MEL_20,MEL_21,MEL_22,MEL_23,MEL_24,MEL_25,MEL_26,MEL_27,MEL_28,MEL_29,MEL_3,MEL_30,MEL_31,MEL_32,MEL_33,MEL_34,MEL_35,MEL_36,MEL_37,MEL_38,MEL_39,MEL_4,MEL_40,MEL_41,MEL_42,MEL_43,MEL_44,MEL_45,MEL_46,MEL_47,MEL_48,MEL_49,MEL_5,MEL_50,MEL_51,MEL_52,MEL_53,MEL_54,MEL_55,MEL_56,MEL_57,MEL_58,MEL_59,MEL_6,MEL_60,MEL_61,MEL_62,MEL_63,MEL_64,MEL_65,MEL_66,MEL_67,MEL_68,MEL_69,MEL_7,MEL_70,MEL_71,MEL_72,MEL_73,MEL_74,MEL_75,MEL_76,MEL_77,MEL_78,MEL_79,MEL_8,MEL_80,MEL_81,MEL_82,MEL_83,MEL_84,MEL_85,MEL_86,MEL_87,MEL_88,MEL_89,MEL_9,MEL_90,MEL_91,MEL_92,MEL_93,MEL_94,MEL_95,MEL_96,MEL_97,MEL_98,MEL_99,MFCC_1,MFCC_10,MFCC_11,MFCC_12,MFCC_13,MFCC_14,MFCC_15,MFCC_16,MFCC_17,MFCC_18,MFCC_19,MFCC_2,MFCC_20,MFCC_21,MFCC_22,MFCC_23,MFCC_24,MFCC_25,MFCC_26,MFCC_27,MFCC_28,MFCC_29,MFCC_3,MFCC_30,MFCC_31,MFCC_32,MFCC_33,MFCC_34,MFCC_35,MFCC_36,MFCC_37,MFCC_38,MFCC_39,MFCC_4,MFCC_40,MFCC_41,MFCC_42,MFCC_43,MFCC_44,MFCC_45,MFCC_46,MFCC_47,MFCC_48,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,Spectral_contrast_1,Spectral_contrast_2,Spectral_contrast_3,Spectral_contrast_4,Spectral_contrast_5,Spectral_contrast_6,Spectral_contrast_7,Tonnetz_1,Tonnetz_2,Tonnetz_3,Tonnetz_4,Tonnetz_5,Tonnetz_6,ZCR,entropy_energy,spectral_bandwith,spectral_centroid,spectral_rollOff_max,spectral_rollOff_min,track_id
0,0,0.438296,0.472769,0.427441,0.436688,0.467697,0.493862,0.512244,0.568658,0.560524,0.513068,0.544648,0.544643,0.00014,0.667637,1.6e-05,9e-06,6e-06,5e-06,5e-06,5e-06,5e-06,5e-06,5e-06,5e-06,1.036885,5e-06,5e-06,5e-06,5e-06,5e-06,6e-06,6e-06,4e-06,2.706399e-07,3.611252e-09,0.96069,3.454723e-09,3.324464e-09,3.249713e-09,3.211787e-09,3.196705e-09,3.123937e-09,3.096601e-09,3.132088e-09,3.115733e-09,1.309046,1.727061,1.172454,2.451283,2.383262,3.495376,1.708026,0.001025,1.812995,2.062649,1.269325,0.964405,0.662044,0.739214,0.527574,0.688505,0.454942,0.311517,0.017772,0.296182,0.268481,0.286426,0.31414,0.34632,0.24444,0.223053,0.363009,0.532202,0.552576,0.292901,0.418687,0.308175,0.248862,0.20499,0.144929,0.10692,0.073315,0.041151,0.037855,0.031073,1.598215,0.026433,0.024971,0.017033,0.011429,0.010716,0.009004,0.009666,0.00933,0.010194,0.008812,2.168316,0.009764,0.010562,0.008871,0.008313,0.006119,0.004583,0.003738,0.005562,0.007731,0.006603,1.188808,0.00609,0.006469,0.004985,0.004448,0.003946,0.003418,0.002923,0.00266,0.002992,0.002804,0.171523,0.002465,0.002034,0.001669,0.001572,0.001691,0.001971,0.00217,0.001762,0.001506,0.001308,0.357523,0.001449,0.001439,0.001132,0.000947,0.000729,0.000423,0.000289,0.000152,7.3e-05,3.2e-05,-373.998138,-7.316626,-16.166148,-1.957894,-1.502101,-4.469502,-1.413008,-6.843215,-3.468242,-2.280824,-4.339178,139.977509,-5.973232,-6.111434,-5.0659,-6.364006,-9.749821,-7.584262,-4.942808,-5.60471,-6.23482,-7.20738,-31.957462,-4.700275,-4.880559,-5.913836,-4.999497,-3.399483,-4.613783,-5.075487,-3.299941,-3.443411,-3.432436,11.430949,-2.920989,-2.365429,-2.451597,-1.518548,-1.851851,-0.415478,1.505456,1.401088,1.320392,-1.84045,-14.810051,6.317596,-7.692614,-9.200216,21.827447,13.797794,15.869637,16.190687,16.09394,16.975988,33.174617,0.00337,0.00045,-0.007696,0.018434,-0.001759,-0.006392,0.067966,-89.113389,2564.247669,3558.400706,4508.506071,367.831109,19YEk4OVQZn3GfoxbpNrU6
1,1,0.596605,0.368288,0.285263,0.302211,0.905805,0.510909,0.221708,0.311248,0.491277,0.416469,0.411171,0.553497,0.053749,20.364112,0.022766,0.01999,0.013676,0.015588,0.016917,0.022716,0.020967,0.021773,0.015938,0.014059,24.822703,0.008655,0.00535,0.003345,0.002055,0.001265,0.001166,0.001026,0.000534,1.876189e-05,5.515725e-08,19.904134,4.847649e-08,4.373043e-08,3.991839e-08,3.699672e-08,3.468608e-08,3.28462e-08,3.1595e-08,3.051798e-08,2.98875e-08,29.14225,27.009086,13.279951,11.811473,34.071757,73.849335,16.803801,1.346285,10.113495,5.480764,17.990936,19.527366,5.496805,13.353018,41.659928,81.923985,22.62972,3.932411,7.441805,2.977619,2.733937,2.493947,1.930312,6.046661,16.889717,24.328768,9.966199,4.33098,10.60816,34.019282,6.376572,2.894632,3.903454,4.606321,1.17432,0.634846,0.725404,4.232507,16.313335,8.959169,41.188623,1.691884,0.936274,1.62897,4.439873,3.783127,5.148844,1.909928,0.97706,0.874126,0.265108,31.481187,0.912242,1.609034,0.296289,0.124289,0.534848,0.969555,0.684228,0.343321,0.189051,0.322922,22.301071,0.738576,0.2028,0.092214,0.093725,0.110804,0.090871,0.200644,0.435192,0.191827,0.253115,33.023468,0.130659,0.041194,0.031797,0.035058,0.019192,0.024231,0.035309,0.020572,0.014062,0.010085,119.91754,0.007199,0.021992,0.031079,0.026668,0.035419,0.03539,0.0403,0.053737,0.041265,0.035966,-136.036925,-13.662825,1.777589,-5.828749,-7.871496,-6.416711,-2.610934,-14.187075,-2.200613,-9.666467,-11.58136,187.428063,2.625204,-5.644814,-5.422538,-3.891026,-6.472341,-4.434039,-12.484578,-2.710994,-1.504884,-4.882684,-32.267121,1.324287,-3.128912,-0.300005,-3.100963,1.619165,-7.962902,-11.856033,-0.698356,0.131466,-0.787129,20.453472,-2.834232,-5.093038,-7.295887,-0.530392,-0.953761,0.319913,-3.478913,-4.759174,2.13342,-16.072231,15.709864,-20.811081,-2.327808,-7.654338,20.098375,11.539465,14.87055,19.947194,22.068094,29.896203,46.590125,-0.03852,-0.022681,-0.160844,0.046941,0.005665,-0.026928,0.047308,-127.945239,2370.181495,1499.68959,3647.394611,230.165275,6zJms3MX
11Qu1IKF44LoRW
2,2,0.505224,0.50042,0.506773,0.488258,0.498356,0.573582,0.690761,0.742858,0.686282,0.657118,0.550437,0.491275,381.736538,50.09699,0.166018,0.159811,0.180781,0.169495,0.153824,0.150577,0.122907,0.104753,0.088342,0.074665,41.289397,0.060141,0.040727,0.030447,0.018008,0.012491,0.00916,0.006501,0.002696,0.0001114347,6.4371e-06,25.838512,6.474858e-06,5.948196e-06,5.798902e-06,6.219467e-06,6.09078e-06,5.806435e-06,6.175595e-06,6.012861e-06,5.571497e-06,26.049473,49.293065,76.258536,115.575303,35.11696,29.367355,23.180991,539.954759,49.485727,48.526434,24.819987,26.903197,44.271672,25.120495,14.562212,15.190511,8.085279,7.609208,368.558497,8.200001,7.65717,7.623656,3.147984,3.046287,2.332731,2.379385,1.651035,1.931251,2.763406,309.964223,1.823783,1.508735,1.729277,3.145477,4.45402,2.488309,1.283118,1.844057,2.482388,2.174284,136.197737,1.507624,0.795523,1.333339,1.868642,1.655892,1.494542,2.690885,3.04357,1.886327,1.491218,56.506845,1.049795,0.986083,0.798397,0.821179,0.605001,0.556066,0.635436,0.709118,1.101624,1.447257,42.056093,1.382997,1.173134,0.612823,0.34514,0.302651,0.373615,0.860088,0.557344,0.478376,0.448018,25.894599,0.943383,0.621177,0.491787,0.421173,0.54869,0.745446,1.007432,0.775511,0.503187,0.3787,37.324754,0.452261,0.554115,0.446938,0.383027,0.411804,0.319628,0.237688,0.213929,0.22654,0.190923,-102.343033,7.161194,5.186434,2.420537,9.843801,-1.172872,6.484431,-1.974161,1.874922,-1.688387,7.072006,144.69913,1.209707,0.513359,1.930574,3.142626,4.957028,-1.244892,0.15332,-1.850325,-0.633414,-3.387202,-20.528202,-0.834693,0.329848,-0.125831,-0.710406,3.085452,-0.202125,0.03145,0.451666,-2.306372,-1.679232,57.308663,-3.642473,-0.351983,-1.392111,2.126408,-0.573808,0.246533,1.076051,-1.0957,-0.267367,-16.283814,19.554979,-12.680799,-2.186734,-6.616386,16.922045,11.849578,15.361625,17.554909,17.666317,20.939717,47.123197,0.002485,-0.00415,0.008915,-0.006929,0.004968,0.008947,0.058463,-238.285176,2973.294736,1543.550034,5623.34933,187.290534,1WugzepXsLjnsM0K4UaWY
c


In [16]:
# Preview a single row of the albums table as a pandas DataFrame.
con.table("albums").limit(1).df()

Unnamed: 0,column00,album_type,artist_id,available_markets,external_urls,href,id,images,name,release_date,release_date_precision,total_tracks,track_id,track_name_prev,uri,type
0,0,single,3DiDSECUqqY1AuBP8qtaIa,"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY',...",{'spotify': 'https://open.spotify.com/album/1gAM7M4rBwEbSPeAQR2nx1'},https://api.spotify.com/v1/albums/1gAM7M4rBwEbSPeAQR2nx1,1gAM7M4rBwEbSPeAQR2nx1,"[{'height': 640, 'url': 'https://i.scdn.co/image/5872e4d8fac4ef7552576d481b1d676189b4056a', 'wid...",If I Ain't Got You EP,2019-02-08,day,6,2iejTMy9XZ8Gaae0aQ2yl0,track_32,spotify:album:1gAM7M4rBwEbSPeAQR2nx1,album


In [17]:
# Preview the first 3 rows of the artists table as a pandas DataFrame.
con.table("artists").limit(3).df()

Unnamed: 0,column0,artist_popularity,followers,genres,id,name,track_id,track_name_prev,type
0,0,44,23230,"['sertanejo', 'sertanejo pop', 'sertanejo tradicional', 'sertanejo universitario']",4mGnpjhqgx4RUdsIJiURdo,Juliano Cezar,0wmDmAILuW9e2aRttkl4aC,track_9,artist
1,1,22,313,[],1dLnVku4VQUOLswwDFvRc9,The Grenadines,4wqwj0gA8qPZKLl5WVqXml,track_30,artist
2,2,26,1596,['danish pop rock'],6YVY310fjfUzKi8hiqR7iK,Gangway,1bFqWDbvHmZe2f4Nf9qaD8,track_38,artist


In [18]:
# Preview the first 3 rows of the tracks table as a pandas DataFrame.
con.table("tracks").limit(3).df()

Unnamed: 0,column00,acousticness,album_id,analysis_url,artists_id,available_markets,country,danceability,disc_number,duration_ms,energy,href,id,instrumentalness,key,liveness,loudness,lyrics,mode,name,playlist,popularity,preview_url,speechiness,tempo,time_signature,track_href,track_name_prev,track_number,uri,valence,type
0,0,0.294,0D3QufeCudpQANOR7luqdr,https://api.spotify.com/v1/audio-analysis/5qljLQuKnNJf4F4vfxQB0V,['3mxJuHRn2ZWD5OofvJtDZY'],"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY',...",BE,0.698,1.0,235584.0,0.606,https://api.spotify.com/v1/tracks/5qljLQuKnNJf4F4vfxQB0V,5qljLQuKnNJf4F4vfxQB0V,3e-06,10.0,0.151,-7.447,"\r\n\r\nPerhaps I am bound to be restless\r\nAlways yearning, never satisfied\r\nPerhaps I'm ach...",0.0,Blood,Hipsteribrunssi,28.0,https://p.scdn.co/mp3-preview/1b05a902da3a251d07a38aa710ffae559fc33d08?cid=b3cdb16d0df2409abf6a8...,0.0262,115.018,4.0,https://api.spotify.com/v1/tracks/5qljLQuKnNJf4F4vfxQB0V,track_14,1.0,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.622,track
1,1,0.863,1bcqsH5UyTBzmh9YizdsBE,https://api.spotify.com/v1/audio-analysis/3VAX2MJdmdqARLSU5hPMpm,['4xWMewm6CYMstu0sPgd9jJ'],"['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BH', 'BO', 'BR', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY',...",BE,0.719,1.0,656960.0,0.308,https://api.spotify.com/v1/tracks/3VAX2MJdmdqARLSU5hPMpm,3VAX2MJdmdqARLSU5hPMpm,0.0,6.0,0.253,-10.34,\r\nYour Gods and my Gods-do you or I know which are the stronger? Native Proverb.\r\nEAST of Su...,1.0,The Ugly Duckling,Animal Stories,31.0,https://p.scdn.co/mp3-preview/d8140736a6131cb5595f061975173a272c343a0a?cid=b3cdb16d0df2409abf6a8...,0.922,115.075,3.0,https://api.spotify.com/v1/tracks/3VAX2MJdmdqARLSU5hPMpm,track_3,3.0,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.589,track
2,2,0.75,4tKijjmxGClg4JOLAyo2qE,https://api.spotify.com/v1/audio-analysis/1L3YAhsEMrGVvCgDXj2TYn,['3hYaK5FF3YAglCj5HZgBnP'],['GB'],BE,0.466,1.0,492840.0,0.931,https://api.spotify.com/v1/tracks/1L3YAhsEMrGVvCgDXj2TYn,1L3YAhsEMrGVvCgDXj2TYn,0.0,4.0,0.938,-13.605,"\r\n\r\nClosed off from love, I didn't need the pain\r\nOnce or twice was enough and it was all ...",0.0,Jimmy Launches His Own Range Of Greetings Cards,Best Of British Comedy,31.0,https://p.scdn.co/mp3-preview/c8af28fb15185b18977152eb50eefef8d90af5a2?cid=b3cdb16d0df2409abf6a8...,0.944,79.565,4.0,https://api.spotify.com/v1/tracks/1L3YAhsEMrGVvCgDXj2TYn,track_4,4.0,spotify:track:1L3YAhsEMrGVvCgDXj2TYn,0.085,track


In [25]:
# Construct Lookup View
def create_lookup_table(con: duckdb.DuckDBPyConnection):
    """(Re)create the ``lookup`` view with human-readable details for each track.

    The view exposes the track id and name, the artist and album names, and the
    track's preview, href and analysis URLs. Albums are left-joined on the
    track's ``album_id`` and artists on the album's ``artist_id``, so tracks
    without a match keep NULL names.

    Args:
        con: Open DuckDB connection holding the tracks, albums and artists tables.
    """
    lookup_view_sql = """
        drop view if exists lookup;
        create view lookup as
        select 
            t.id, t.name as track_name, ar.name as artist_name, a.name as album_name, 
            t.preview_url, t.track_href, t.analysis_url
        from tracks t
        left join albums a on t.album_id = a.id
        left join artists ar on a.artist_id = ar.id
        """
    con.execute(lookup_view_sql)

create_lookup_table(con)

In [20]:
# Preview the first 3 rows of the lookup view as a pandas DataFrame.
con.view("lookup").limit(3).df()

Unnamed: 0,id,track_name,artist_name,album_name,preview_url,track_href,analysis_url
0,5qljLQuKnNJf4F4vfxQB0V,Blood,Jesse Markin,Blood,https://p.scdn.co/mp3-preview/1b05a902da3a251d07a38aa710ffae559fc33d08?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/5qljLQuKnNJf4F4vfxQB0V,https://api.spotify.com/v1/audio-analysis/5qljLQuKnNJf4F4vfxQB0V
1,3VAX2MJdmdqARLSU5hPMpm,The Ugly Duckling,Favorite Kids Stories,"Storytime Classics, Vol. 2",https://p.scdn.co/mp3-preview/d8140736a6131cb5595f061975173a272c343a0a?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/3VAX2MJdmdqARLSU5hPMpm,https://api.spotify.com/v1/audio-analysis/3VAX2MJdmdqARLSU5hPMpm
2,4PrAZpH9Ic7S47E78BN6E4,Already Gone,Alvvays,Antisocialites,https://p.scdn.co/mp3-preview/10a9607c178fe1eaeb3d12aed9cee82da4963f24?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/4PrAZpH9Ic7S47E78BN6E4,https://api.spotify.com/v1/audio-analysis/4PrAZpH9Ic7S47E78BN6E4


In [26]:
# Construct Features Table
def create_features_table(con: duckdb.DuckDBPyConnection):
    """(Re)create the ``features`` table from the ``tracks`` table.

    The table keeps the track id plus nine audio feature columns:
    acousticness, danceability, energy, instrumentalness, liveness, loudness,
    speechiness, tempo and valence.

    Args:
        con: Open DuckDB connection holding the tracks table.
    """
    features_table_sql = """
        drop table if exists features;
        create table features as
        select
            t.id, t.acousticness, t.danceability, t.energy, t.instrumentalness, 
            t.liveness, t.loudness, t.speechiness, t.tempo, t.valence
        from tracks t
        """
    con.execute(features_table_sql)

create_features_table(con)

In [23]:
# Per-column summary of the features table (type, min/max, quantiles, null share) as a DataFrame.
con.query("summarize features").df()

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,id,VARCHAR,000RDCYioLteXcutOjeweY,7zzpno7uAqkAzWZDQuGEFA,101691,,,,,,101939,0.0%
1,acousticness,DOUBLE,0.0,0.996,4628,0.3521235614982393,0.3348552389774226,0.0408862825254198,0.2379812289449274,0.6444174472884937,101939,0.0%
2,danceability,DOUBLE,0.0,0.989,1199,0.5860150295765171,0.1777242568588397,0.4797636257135789,0.6102385939777873,0.7144499849384677,101939,0.0%
3,energy,DOUBLE,0.0,1.0,2353,0.5864791632319346,0.2601704025337212,0.4111254863222407,0.6284573068498541,0.7977735646075533,101939,0.0%
4,instrumentalness,DOUBLE,0.0,1.0,5341,0.1487759026008676,0.3040235636638487,0.0,3.851858879787776e-05,0.0353911544379579,101939,0.0%
5,liveness,DOUBLE,0.0,0.999,1715,0.197640306457781,0.1753908592406075,0.0955599216436363,0.1237481178772909,0.2415430915139056,101939,0.0%
6,loudness,DOUBLE,-60.0,2.719,22382,-9.462719675492316,6.198508047679088,-11.157471170426556,-7.60183592122598,-5.512450857917222,101939,0.0%
7,speechiness,DOUBLE,0.0,0.969,1638,0.1288414963850934,0.2033237132844151,0.0363723471988273,0.0507314702840772,0.1040761462001869,101939,0.0%
8,tempo,DOUBLE,0.0,244.035,50300,118.35852701125256,30.22407426199396,95.87399140685004,118.2468924909389,136.20637236145168,101939,0.0%
9,valence,DOUBLE,0.0,0.993,1762,0.4828126257801232,0.2616904844759442,0.2711558539347785,0.4769632390749201,0.6933812005186372,101939,0.0%


## ML

In [5]:
def load_data_into_pd(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Read the whole ``features`` table into a pandas DataFrame.

    Args:
        con: Open DuckDB connection holding the features table.

    Returns:
        One row per row of ``features``, with all of its columns.
    """
    features_relation = con.query("select * from features")
    return features_relation.df()

df = load_data_into_pd(con)

In [28]:
# List the columns of the loaded features frame.
df.columns

Index(['id', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence'],
      dtype='object')

In [29]:
# Descriptive statistics for the numeric feature columns.
df.describe()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,101939.0,101939.0,101939.0,101939.0,101939.0,101939.0,101939.0,101939.0,101939.0
mean,0.352124,0.586015,0.586479,0.148776,0.19764,-9.46272,0.128841,118.358527,0.482813
std,0.334855,0.177724,0.26017,0.304024,0.175391,6.198508,0.203324,30.224074,0.26169
min,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0
25%,0.0407,0.48,0.411,0.0,0.0956,-11.149,0.0364,95.973,0.271
50%,0.238,0.61,0.629,3.7e-05,0.124,-7.599,0.0506,118.067,0.477
75%,0.645,0.714,0.798,0.0344,0.241,-5.509,0.104,136.045,0.693
max,0.996,0.989,1.0,1.0,0.999,2.719,0.969,244.035,0.993


In [6]:
# Show whether a pickled pipeline is already cached in the data directory.
(data_path / "pipeline.pkl").exists()

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

def get_pipeline(df) -> Pipeline:
    """Return the preprocessing pipeline, reusing the pickled copy when one exists.

    If ``pipeline.pkl`` is present under ``data_path`` it is unpickled and
    returned, and ``df`` is not used. Otherwise a pipeline with a single
    ``MinMaxScaler`` step is fitted on every column of ``df`` except ``id``,
    pickled to that file, and returned.

    Args:
        df: Frame used to fit the scaler when no cached pipeline exists.

    Returns:
        The fitted (or unpickled) pipeline.

    NOTE(review): the cache is never invalidated, and a first call with a small
    frame would persist a scaler fitted on that frame only.
    """
    cache_file = data_path / "pipeline.pkl"

    if not cache_file.exists():
        # No cache yet: fit on the feature columns and persist the result.
        pipeline = Pipeline([("scaler", MinMaxScaler())])
        feature_columns = df.loc[:, df.columns != "id"]
        pipeline.fit(feature_columns)
        with cache_file.open("wb") as handle:
            pickle.dump(pipeline, handle)
        return pipeline

    with cache_file.open("rb") as handle:
        return pickle.load(handle)

pipe = get_pipeline(df)

In [13]:
def preprocess_data(df: pd.DataFrame) -> np.ndarray:
    """Transform the feature columns of ``df`` with the pipeline from ``get_pipeline``.

    Args:
        df: Frame with an ``id`` column plus the feature columns.

    Returns:
        The pipeline's transform of every column except ``id``.
    """
    scaler_pipeline = get_pipeline(df)
    is_feature = df.columns != "id"
    return scaler_pipeline.transform(df.loc[:, is_feature])

X = preprocess_data(df)

In [27]:
from sklearn.neighbors import NearestNeighbors

def get_model_knn(X: Optional[np.ndarray] = None) -> NearestNeighbors:
    """Return the nearest-neighbour model, reusing the pickled copy when one exists.

    If ``knn.pkl`` is present under ``data_path`` it is unpickled and returned,
    and ``X`` is ignored. Otherwise a ball-tree ``NearestNeighbors`` model
    (default ``n_neighbors=5``) is fitted on ``X``, pickled to that file, and
    returned.

    Args:
        X: Training matrix; required only when no cached model exists.

    Returns:
        The fitted (or unpickled) model.

    Raises:
        ValueError: If no cached model exists and ``X`` is None.

    NOTE(review): the cache is never invalidated, so a model fitted on older
    data keeps being returned until ``knn.pkl`` is deleted.
    """
    model_file = data_path / "knn.pkl"

    # Check if model exists
    if model_file.exists():
        with open(model_file, "rb") as f:
            return pickle.load(f)

    # Fail with a clear message instead of letting fit(None) raise deep inside sklearn.
    if X is None:
        raise ValueError(
            f"no cached model at {model_file} and no training matrix X was provided"
        )

    # Otherwise, create model
    knn = NearestNeighbors(n_neighbors=5, algorithm="ball_tree")
    knn.fit(X)
    with open(model_file, "wb") as f:
        pickle.dump(knn, f)

    return knn

knn = get_model_knn(X)

In [12]:
# lookup ids
def lookup_ids(con: duckdb.DuckDBPyConnection, lookup_query: str) -> pd.DataFrame:
    """Search the ``lookup`` view for tracks whose name and artist match a pattern.

    The lower-cased ``lookup_query`` is passed as a bound parameter to
    ``regexp_matches`` and tested against the lower-cased
    ``"<track_name> <artist_name>"`` string, so it is interpreted as a regular
    expression, not as a literal substring.

    Args:
        con: Open DuckDB connection exposing the ``lookup`` view.
        lookup_query: Pattern to search for; matching is case-insensitive.

    Returns:
        Up to 10 matching rows of ``lookup`` as a DataFrame.
    """
    sql = """
      select *
      from lookup
      where 
        regexp_matches(lower(concat(track_name, ' ', artist_name)), $param)
      limit 10
    """
    return con.execute(sql, {"param": lookup_query.lower()}).fetch_df()

df_songs = lookup_ids(con, "like a prayer")
df_songs

Unnamed: 0,id,track_name,artist_name,album_name,preview_url,track_href,analysis_url
0,0OuGlX0EnsXQ4vvOunF9A3,Like a Prayer,Goshfather,Like a Prayer,https://p.scdn.co/mp3-preview/dbe9844b5b38df522bff03e411aedb6c52c87559?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/0OuGlX0EnsXQ4vvOunF9A3,https://api.spotify.com/v1/audio-analysis/0OuGlX0EnsXQ4vvOunF9A3
1,2v7ywbUzCgcVohHaKUcacV,Like a Prayer,Madonna,Like a Prayer,https://p.scdn.co/mp3-preview/274e7167b6222fc3f8e167bf036c7286bf265f2d?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/2v7ywbUzCgcVohHaKUcacV,https://api.spotify.com/v1/audio-analysis/2v7ywbUzCgcVohHaKUcacV
2,1z3ugFmUKoCzGsI6jdY4Ci,Like a Prayer,Madonna,Celebration (double disc version),https://p.scdn.co/mp3-preview/b56a70770267b00ccae13c2e8c8a34ed54627d02?cid=b3cdb16d0df2409abf6a8...,https://api.spotify.com/v1/tracks/1z3ugFmUKoCzGsI6jdY4Ci,https://api.spotify.com/v1/audio-analysis/1z3ugFmUKoCzGsI6jdY4Ci


In [13]:
# Select Song
# Rows of the feature frame whose id equals the chosen track.
df_test = df.loc[df["id"] == "2v7ywbUzCgcVohHaKUcacV"]
df_test

Unnamed: 0,id,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
85789,2v7ywbUzCgcVohHaKUcacV,0.432,0.66,0.629,7.3e-05,0.172,-12.359,0.0387,111.926,0.324


In [21]:
def make_recommendations(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, id: str, n_neighbors: int = 10) -> pd.DataFrame:
    """Return the ``n_neighbors`` tracks closest to track ``id`` in feature space.

    Args:
        con: Open DuckDB connection exposing the ``lookup`` view.
        df: Feature frame with an ``id`` column. Assumed to be the same frame, in
            the same row order, that the cached KNN model was fitted on, because
            neighbours are returned as row positions — TODO confirm when a stale
            ``knn.pkl`` may be in use.
        id: Track id to find neighbours for.
        n_neighbors: Number of neighbours to return.

    Returns:
        The neighbour rows of ``df`` (nearest first) left-joined on ``id`` with
        their details from ``lookup``.

    Raises:
        ValueError: If ``id`` does not occur in ``df``.
    """
    # Boolean mask instead of a string-built df.query, so an id containing
    # quotes cannot break or alter the expression.
    df_test = df.loc[df["id"] == id]
    if df_test.empty:
        raise ValueError(f"track id {id!r} not found in df")

    # Predict
    X_test = preprocess_data(df_test)
    knn = get_model_knn()
    _, indices = knn.kneighbors(X_test, n_neighbors=n_neighbors)
    # kneighbors yields row positions in the training matrix, so select
    # positionally; df.loc would only agree on a default RangeIndex.
    # If ``id`` matches several rows, only the first row's neighbours are used.
    ids = df.iloc[indices[0]]

    # Lookup song details
    # The query refers to the local DataFrame by its variable name, so ``ids``
    # must keep this name (presumably resolved by DuckDB's replacement scan).
    sql = """
        select * 
        from lookup
        where id in (select id from ids)
    """
    details = con.query(sql).df()

    # Merge
    return ids.merge(details, how="left", on="id")


In [22]:
# Example: the 10 nearest neighbours (default n_neighbors) of the selected track.
# In the output below the query track itself comes back as the closest match (row 0).
make_recommendations(con, df, id="2v7ywbUzCgcVohHaKUcacV")


Unnamed: 0,id,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,track_name,artist_name,album_name,preview_url,track_href,analysis_url
0,2v7ywbUzCgcVohHaKUcacV,0.432,0.66,0.629,7.3e-05,0.172,-12.359,0.0387,111.926,0.324,Like a Prayer,Madonna,Like a Prayer,https://p.scdn.co/mp3-preview/274e7167b6222fc3...,https://api.spotify.com/v1/tracks/2v7ywbUzCgcV...,https://api.spotify.com/v1/audio-analysis/2v7y...
1,7mrsLJSTzTMRQjEzWDAlOw,0.421,0.619,0.653,8e-05,0.147,-8.791,0.0294,118.958,0.327,Painted In The Sound,Justus Proffit,L.A.'s Got Me Down,https://p.scdn.co/mp3-preview/d5277b8f48b35e00...,https://api.spotify.com/v1/tracks/7mrsLJSTzTMR...,https://api.spotify.com/v1/audio-analysis/7mrs...
2,0flFPv83zCKHwzuhlEDILy,0.416,0.633,0.682,0.000105,0.148,-6.547,0.0359,114.999,0.324,Take Your Time,Vance Joy,Nation of Two,https://p.scdn.co/mp3-preview/6841b4be0f54da98...,https://api.spotify.com/v1/tracks/0flFPv83zCKH...,https://api.spotify.com/v1/audio-analysis/0flF...
3,5axqYa8jADY4TKO9jMr7Zt,0.503,0.607,0.664,0.00213,0.115,-10.376,0.0314,110.01,0.33,Lloré,Monsieur Periné,Caja De Música (Edición Especial),https://p.scdn.co/mp3-preview/fc993da8974873c4...,https://api.spotify.com/v1/tracks/5axqYa8jADY4...,https://api.spotify.com/v1/audio-analysis/5axq...
4,0fqjy5gVNVmQQmxOLLfb3c,0.4,0.638,0.687,0.000114,0.166,-6.556,0.0351,114.98,0.323,Take Your Time,Vance Joy,Nation Of Two,https://p.scdn.co/mp3-preview/c4defe3d34b142df...,https://api.spotify.com/v1/tracks/0fqjy5gVNVmQ...,https://api.spotify.com/v1/audio-analysis/0fqj...
5,68vdi4VhdQ3JTTRWPi5z6U,0.447,0.629,0.645,7.4e-05,0.0992,-10.735,0.103,115.077,0.283,Electric (feat. Khalid) [Ryan Riback Remix],Alina Baraz,The Color Of You (Remixes),https://p.scdn.co/mp3-preview/506689b5b743e367...,https://api.spotify.com/v1/tracks/68vdi4VhdQ3J...,https://api.spotify.com/v1/audio-analysis/68vd...
6,4ZmAMOU0bcmrwwOvEK8aDT,0.407,0.626,0.574,0.0,0.17,-7.73,0.03,119.531,0.272,Quién Diría,Ricardo Arjona,Canciones De Amor,https://p.scdn.co/mp3-preview/319ca1a607c2ce81...,https://api.spotify.com/v1/tracks/4ZmAMOU0bcmr...,https://api.spotify.com/v1/audio-analysis/4ZmA...
7,3mY5W2fJF9wKoy5d4IOtsZ,0.445,0.651,0.594,0.0,0.123,-5.879,0.0404,113.971,0.3,Anywhere With You (feat. Andie Nora),Wallaby,Anywhere With You (feat. Andie Nora),https://p.scdn.co/mp3-preview/8da4a78de402994d...,https://api.spotify.com/v1/tracks/3mY5W2fJF9wK...,https://api.spotify.com/v1/audio-analysis/3mY5...
8,0wl2rHiiQRHAW41UHYOI1u,0.414,0.66,0.599,0.0124,0.127,-9.132,0.0271,135.066,0.336,Family Tapes,Dead Horses,Family Tapes,https://p.scdn.co/mp3-preview/9c1dbca4aac171c1...,https://api.spotify.com/v1/tracks/0wl2rHiiQRHA...,https://api.spotify.com/v1/audio-analysis/0wl2...
9,1lIz3Wn8b1lBsl0pvOpmU7,0.491,0.599,0.661,2e-06,0.153,-8.14,0.0529,124.052,0.306,Watching You,Lea Rue,Watching You,https://p.scdn.co/mp3-preview/cda37d5344b7e0b3...,https://api.spotify.com/v1/tracks/1lIz3Wn8b1lB...,https://api.spotify.com/v1/audio-analysis/1lIz...


## Clean Up

In [6]:
# Close the DuckDB connection now that the analysis is finished.
con.close()