Annoy offical repository can be found here: https://github.com/spotify/annoy

In [1]:
import pandas as pd

from annoy import AnnoyIndex

## Loading data

In [2]:
tracks_df = pd.read_csv("./data/tracks_mod.csv")

In [3]:
tracks_df.shape

(586672, 20)

In [4]:
tracks_df.dtypes

id                   object
name                 object
popularity          float64
duration_ms           int64
explicit              int64
artists              object
id_artists           object
release_date         object
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature      float64
dtype: object

In [5]:
tracks_df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6.0,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0.0,-13.338,1.0,0.451,0.674,0.744,0.151,0.127,104.851,3.0
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0.0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0.0,-22.136,1.0,0.957,0.797,0.0,0.148,0.655,102.009,1.0
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0.0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1.0,-21.18,1.0,0.0512,0.994,0.0218,0.212,0.457,130.418,5.0
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,,7.0,-27.961,1.0,0.0504,0.995,0.918,0.104,0.397,169.98,3.0
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0.0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3.0,-16.9,0.0,0.039,0.989,0.13,0.311,0.196,103.22,4.0


## Cleaning the data

In [6]:
tracks_df.dropna(inplace=True)

In [7]:
tracks_df.shape

(348066, 20)

In [8]:
print(f"Lost data: {round(1 - tracks_df.shape[0] / 586672, 2)}%")

Lost data: 0.41%


### Building the similarity model

In [44]:
features = ["explicit", "danceability", "energy", "loudness", "speechiness", "liveness", "valence"]
len_features = len(features)

In [45]:
annoy = AnnoyIndex(len_features, "euclidean")

In [46]:
for i in tracks_df.index:
    v = tracks_df.loc[i][features].values
    annoy.add_item(i, v)

In [47]:
annoy.build(1000, n_jobs=-1)

True

In [48]:
annoy.save("spotify.ann")

True

## Getting similar songs

In [49]:
annoy_loaded = AnnoyIndex(len_features, "euclidean")

In [50]:
annoy_loaded.load("spotify.ann")

True

In [51]:
maluma_tracks_df = tracks_df.loc[tracks_df["artists"].str.contains("Maluma")]
maluma_tracks_df["release_date"] = pd.to_datetime(maluma_tracks_df["release_date"])
maluma_tracks_df.sort_values(by=["release_date"], ascending=False).head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maluma_tracks_df["release_date"] = pd.to_datetime(maluma_tracks_df["release_date"])


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
353004,6cDE7xhQZBSHNWB5RX0U9l,Créeme,0.0,214293,0,"['KAROL G', 'Maluma']","['790FomKkXshlbRYZFtlgla', '1r4hJ1h58CWwUQe3Mx...",2021-04-14,0.714,0.874,6.0,-3.576,1.0,0.0549,0.128,0.000146,0.105,0.648,96.998,4.0


In [52]:
neighbors = annoy_loaded.get_nns_by_item(353004, 10)

In [53]:
tracks_df.loc[neighbors]

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
352972,2Gv3HJSezY4Tln815S7UZH,Créeme,0.0,214293,0,"['KAROL G', 'Maluma']","['790FomKkXshlbRYZFtlgla', '1r4hJ1h58CWwUQe3Mx...",2021-04-14,0.714,0.874,6.0,-3.576,1.0,0.0549,0.128,0.000146,0.105,0.648,96.998,4.0
353004,6cDE7xhQZBSHNWB5RX0U9l,Créeme,0.0,214293,0,"['KAROL G', 'Maluma']","['790FomKkXshlbRYZFtlgla', '1r4hJ1h58CWwUQe3Mx...",2021-04-14,0.714,0.874,6.0,-3.576,1.0,0.0549,0.128,0.000146,0.105,0.648,96.998,4.0
468734,2ZehPPTmn6QxlI1KedHo51,Créeme,2.0,214293,0,"['KAROL G', 'Maluma']","['790FomKkXshlbRYZFtlgla', '1r4hJ1h58CWwUQe3Mx...",2021-04-09,0.714,0.874,6.0,-3.576,1.0,0.0549,0.128,0.000146,0.105,0.648,96.998,4.0
158545,4EKZsrsCKyqr64FBHLc0DU,Créeme,64.0,214293,0,"['KAROL G', 'Maluma']","['790FomKkXshlbRYZFtlgla', '1r4hJ1h58CWwUQe3Mx...",2019-05-03,0.715,0.874,6.0,-3.576,1.0,0.0548,0.126,0.000162,0.105,0.652,96.995,4.0
90874,0yyZN5ASdrYu0XYWFzfxUu,3 A.M.,70.0,183573,0,"['Jesse & Joy', 'Gente De Zona']","['1mX1TWKpNxDSAH16LgDfiR', '2cy1zPcrFcXAJTP0AP...",2017-08-18,0.702,0.84,0.0,-3.547,1.0,0.0879,0.0112,0.0,0.119,0.624,101.99,4.0
273912,2wuUYbSO8DuvDkxadXVxk4,酸,12.0,212787,0,['Leon Lai'],['0ubIxkefJsoYY8JXc2HJoa'],1999-05-26,0.683,0.881,0.0,-3.601,1.0,0.0348,0.0031,0.0376,0.074,0.611,94.012,4.0
578570,1wtSEG4Lxa3h9Sza10BoER,Rendez-Vous i Rio,22.0,195973,0,['Lustans Lakejer'],['3oB3e3MPyQBX5NttDmNUNN'],1981-02,0.67,0.898,4.0,-3.557,0.0,0.0317,0.0402,5.2e-05,0.0779,0.63,144.019,4.0
470422,35jzZ9P5F81sO2VGDxz3OH,Who Needs Love,44.0,211533,0,['Razorlight'],['450iujbtN6XgiA9pv6fVZz'],2006-01-01,0.764,0.887,2.0,-3.605,1.0,0.0308,0.253,0.0,0.0926,0.676,113.063,4.0
155157,4haBiYrYQ7AbvQxlt1QahV,Fuera,52.0,217480,0,"['RBD', 'Anahí', 'Dulce María', 'Maite Perroni...","['7cjh6y0V9SsyCrWSXTzwOs', '0TeVa4xdLB8vdzjsvK...",2005-01-01,0.718,0.866,11.0,-3.59,0.0,0.0417,0.2,0.0,0.088,0.717,100.042,4.0
241467,5yttPxcfowfL4TUYKDqdJe,Sally,35.0,227267,0,['Carmel'],['3uDjQUKw2WHSYtZcz2xjDo'],1984-01-01,0.689,0.883,0.0,-3.643,1.0,0.0367,0.0191,0.0442,0.118,0.64,135.884,4.0
