In [1]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from pandas._libs.lib import fast_zip
import os

# Load and Clean Data

In [2]:
# Load first 1 million data points, excluding the user id
data = pd.read_csv('data/spotify_dataset.csv', on_bad_lines='skip', usecols=[' "artistname"', ' "trackname"', ' "playlistname"'])

# Rename columns
data.rename(columns={' "artistname"': 'artists', ' "trackname"': 'title', ' "playlistname"': 'playlist'}, inplace=True)

# Remove all parantheticals in song names to treat more songs as the same
data['title'].str.replace(r"\(.*\)","", regex=True)

0                                Red Shoes
1            Peace, Love And Understanding
2                         7 Years Too Late
3                    Accidents Will Happen
4                                   Alison
                         ...              
12901974                         Wild Side
12901975                             Woman
12901976       You Don't Know How It Feels
12901977                      You Wreck Me
12901978                   Youth Gone Wild
Name: title, Length: 12901979, dtype: object

In [3]:
data.head()

Unnamed: 0,artists,title,playlist
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,Elvis Costello,Alison,HARD ROCK 2010


## Create 'sentences'

In [4]:
# Label each unique song (based on artist and song title) with its own id
ids, songs = pd.factorize(fast_zip([data.artists.values, data.title.values]))
data['songs'] = ids


# Form 'sentences' based on each playlist
min_num_songs = 5
groups = data.groupby('playlist').filter(lambda x: x['songs'].nunique() >= min_num_songs)
groups = groups.groupby('playlist')['songs'].unique()

sentences = groups.apply(list).tolist()

# Train Word2Vec Model

In [30]:
# To load: model = Word2Vec.load("song2vec.model")
model = Word2Vec(
    sentences=sentences,
    vector_size=20,        # Example: 20 dimensions
    window=10,             # Example: 10 songs before and 10 after
    min_count=2,           # Example: Ignore songs appearing less than 2 times
    sg=1,                  # Example: Use Skip-gram
    workers=8,             # Example: Use 8 CPU cores
    epochs=10              # Example: Iterate 10 times over the data
)
model.save("song2vec2.model")

In [6]:
model.wv[1]

array([ 0.38246003,  0.6203271 , -0.38880336, -0.09840385,  1.2375836 ,
       -0.9882781 , -1.3208984 ,  1.7355688 ,  0.2760223 ,  0.6414492 ,
       -1.2785425 , -0.49925712,  0.34828863,  0.49347827, -0.23920417,
        1.4617358 , -0.6441757 , -1.4714783 , -2.3783753 , -0.48084772],
      dtype=float32)

# Check Model

In [7]:
song_list = data.drop_duplicates(subset=['title', 'artists'])

In [28]:
# Search for song
title = ''
artist = ''

song_list[song_list['title'].str.contains(title, na=False, case=False) & song_list['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,artists,title,playlist,songs
196,DJ Shadow,Fixed Income,Chill out,195
1775,Twin Shadow,Run My Heart,Chelsea,1445
4417,Twin Shadow,Castles In The Snow,Everything at once,3615
4637,Hadouken!,Crank It Up,Everything at once,3813
5038,Savoir Adore,Dreamers,Everything at once,4170
...,...,...,...,...
12884298,Voladoras,Geetar Slut,Starred,2821972
12888065,Belladonna,Dub Funk,Nudiscoisms,2822600
12892714,Shadow Dancer,Murder Room,Early 2015,2823076
12900668,Eddie Amador,Take Care Of My Heart,4.14.15,2823893


In [26]:
for song_id, score in model.wv.most_similar(107706, topn=10):
    song = song_list[song_list['songs'] == song_id].head(1)
    print(f'{song_id:10} {song["title"].values[0]:30} by {song["artists"].values[0]:20} -- {score:.4f}')

    614069 Call It What You Want          by Tesla                -- 0.9904
    887800 Calling To You                 by Robert Plant         -- 0.9857
   1326032 Child's Play                   by Tnt                  -- 0.9845
   1092241 Caffeine                       by Alice Cooper         -- 0.9843
   1067017 Camel's Night Out              by Eric Johnson         -- 0.9838
   1721661 Calm Before the Storm          by Saxon                -- 0.9833
    894087 Cagey Cretins                  by Blue Oyster Cult     -- 0.9826
   1112419 Cheap Sunglasses - Remastered Live Version by ZZ Top               -- 0.9825
    100310 Bus Stop - 1999 Remastered Version by Tin Machine          -- 0.9824
   1686649 Carmen                         by Toto                 -- 0.9819
