In [1]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from pandas._libs.lib import fast_zip
import os

# Load and Clean Data

In [2]:
# Load first 1 million data points, excluding the user id
data = pd.read_csv('data/spotify_dataset.csv', on_bad_lines='skip', usecols=[' "artistname"', ' "trackname"', ' "playlistname"'])

# Rename columns
data.rename(columns={' "artistname"': 'artists', ' "trackname"': 'title', ' "playlistname"': 'playlist'}, inplace=True)

# Remove all parantheticals in song names to treat more songs as the same
data['title'].str.replace(r"\(.*\)","", regex=True)

0                                Red Shoes
1            Peace, Love And Understanding
2                         7 Years Too Late
3                    Accidents Will Happen
4                                   Alison
                         ...              
12901974                         Wild Side
12901975                             Woman
12901976       You Don't Know How It Feels
12901977                      You Wreck Me
12901978                   Youth Gone Wild
Name: title, Length: 12901979, dtype: object

In [3]:
data.head()

Unnamed: 0,artists,title,playlist
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,Elvis Costello,Alison,HARD ROCK 2010


## Create 'sentences'

In [5]:
# Label each unique song (based on artist and song title) with its own id
ids, songs = pd.factorize(fast_zip([data.artists.values, data.title.values]))
data['songs'] = ids


# Form 'sentences' based on each playlist
min_num_songs = 5
groups = data.groupby('playlist').filter(lambda x: x['songs'].nunique() >= min_num_songs)
groups = groups.groupby('playlist')['songs'].unique()

sentences = groups.apply(list).tolist()

# Train Word2Vec Model

In [19]:
# To load: model = Word2Vec.load("song2vec.model")
model = Word2Vec.load("song2vec.model")
"""
model = Word2Vec(
    sentences=sentences,
    vector_size=20,        # Example: 20 dimensions
    window=10,             # Example: 10 songs before and 10 after
    min_count=2,           # Example: Ignore songs appearing less than 2 times
    sg=1,                  # Example: Use Skip-gram
    workers=8,             # Example: Use 8 CPU cores
    epochs=10              # Example: Iterate 10 times over the data
)
model.save("song2vec2.model")
"""

'\nmodel = Word2Vec(\n    sentences=sentences,\n    vector_size=20,        # Example: 20 dimensions\n    window=10,             # Example: 10 songs before and 10 after\n    min_count=2,           # Example: Ignore songs appearing less than 2 times\n    sg=1,                  # Example: Use Skip-gram\n    workers=8,             # Example: Use 8 CPU cores\n    epochs=10              # Example: Iterate 10 times over the data\n)\nmodel.save("song2vec2.model")\n'

In [7]:
model.wv[1]

array([ 0.4263834 ,  0.5521066 , -0.2698889 , -0.09454672,  1.0843053 ,
       -0.8065107 , -1.2720923 ,  1.7668669 ,  0.30876648,  0.56067955,
       -1.2315571 , -0.5868361 ,  0.30129227,  0.49245536, -0.16328426,
        1.4507519 , -0.84748226, -1.6008003 , -2.4196336 , -0.5084578 ],
      dtype=float32)

# Check Model

In [20]:
song_list = data.drop_duplicates(subset=['title', 'artists'])

In [35]:
# Search for song
title = 'call me maybe'
artist = ''

song_list[song_list['title'].str.contains(title, na=False, case=False) & song_list['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,artists,title,playlist,songs
15358,Carly Rae Jepsen,Call Me Maybe,Starred,10997
58543,Carly Rae Jepsen Tribute Team,Call Me Maybe,Fluxblog,45073
65401,Carly Rae Jepsen,Call Me Maybe - 10 Kings vs Ollie Green Remix,Amor,49687
96029,Sam Tsui,Call Me Maybe,Sam Tsui,70237
148780,Carly Rae Jepsen,Call Me Maybe - Manhattan Clique Remix,De todo un poco,99243
...,...,...,...,...
11111110,Yoga Pop Ups,Call Me Maybe,Journaling PlayList,2586230
11186423,C-Rok,Call Me Maybe - Carly Rae Jepsen - C-Rok Rok...,#SMDayDenver,2599382
11404626,Chambaland,Let It Go vs. Call Me Maybe (Demi Lovato vs. C...,Chambaland Mashups,2628439
11556827,Workout Remix Factory,Call Me Maybe (138 BPM),Workout Remix Factory – Running Workout #1 Hits,2651962


In [36]:
for song_id, score in model.wv.most_similar(15358, topn=10):
    song = song_list[song_list['songs'] == song_id].head(1)
    print(f'{song_id:10} {song["title"].values[0]:30} by {song["artists"].values[0]:20} -- {score:.4f}')

     94372 Belongings                     by Clock Opera          -- 0.9681
     11903 Belong                         by The Pains Of Being Pure At Heart -- 0.9553
    122586 Betty's A Bombshell            by Grouplove            -- 0.9550
     91416 Bicycles                       by The Maccabees        -- 0.9548
    351697 Birthday                       by the bird and the bee -- 0.9540
     75650 Becoming A Jackal              by Villagers            -- 0.9518
      3232 Best Friend                    by The Drums            -- 0.9505
     18108 Black White & Blue             by Ladyhawke            -- 0.9492
    145464 Bang Pop                       by Free Energy          -- 0.9481
    372144 Be Good                        by Tokyo Police Club    -- 0.9480
