In [1]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from pandas._libs.lib import fast_zip
import os

# Load and Clean Data

In [2]:
# Load first 1 million data points, excluding the user id
data = pd.read_csv('frontend/data/dataset.csv', on_bad_lines='skip', usecols=[' "artistname"', ' "trackname"', ' "playlistname"'])

# Rename columns
data.rename(columns={' "artistname"': 'artists', ' "trackname"': 'title', ' "playlistname"': 'playlist'}, inplace=True)

# Remove all parantheticals in song names to treat more songs as the same
data['title'].str.replace(r"\(.*\)","", regex=True)

0                                Red Shoes
1            Peace, Love And Understanding
2                         7 Years Too Late
3                    Accidents Will Happen
4                                   Alison
                         ...              
12901974                         Wild Side
12901975                             Woman
12901976       You Don't Know How It Feels
12901977                      You Wreck Me
12901978                   Youth Gone Wild
Name: title, Length: 12901979, dtype: object

In [3]:
data.head()

Unnamed: 0,artists,title,playlist
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,Elvis Costello,Alison,HARD ROCK 2010


## Create 'sentences'

In [4]:
# Label each unique song (based on artist and song title) with its own id
ids, songs = pd.factorize(fast_zip([data.artists.values, data.title.values]))
data['songs'] = ids


# Form 'sentences' based on each playlist
min_num_songs = 5
groups = data.groupby('playlist').filter(lambda x: x['songs'].nunique() >= min_num_songs)
groups = groups.groupby('playlist')['songs'].unique()

sentences = groups.apply(list).tolist()

# Train Word2Vec Model

In [5]:
# To load: model = Word2Vec.load("song2vec.model")
model = Word2Vec.load("song2vec.model")
"""
model = Word2Vec(
    sentences=sentences,
    vector_size=20,        # Example: 20 dimensions
    window=10,             # Example: 10 songs before and 10 after
    min_count=2,           # Example: Ignore songs appearing less than 2 times
    sg=1,                  # Example: Use Skip-gram
    workers=8,             # Example: Use 8 CPU cores
    epochs=10              # Example: Iterate 10 times over the data
)
model.save("song2vec2.model")
"""

'\nmodel = Word2Vec(\n    sentences=sentences,\n    vector_size=20,        # Example: 20 dimensions\n    window=10,             # Example: 10 songs before and 10 after\n    min_count=2,           # Example: Ignore songs appearing less than 2 times\n    sg=1,                  # Example: Use Skip-gram\n    workers=8,             # Example: Use 8 CPU cores\n    epochs=10              # Example: Iterate 10 times over the data\n)\nmodel.save("song2vec2.model")\n'

In [6]:
model.wv[1]

array([ 0.52399874,  0.31521776, -0.09809922,  0.10885182,  0.9538687 ,
       -0.81175107, -1.1404134 ,  1.7771199 ,  0.2299308 ,  0.5151783 ,
       -1.3428736 , -0.40820363,  0.5099922 ,  0.3855613 , -0.31446534,
        1.5877265 , -0.6026317 , -1.5204601 , -2.409446  , -0.54963505],
      dtype=float32)

# Check Model

In [7]:
song_list = data.drop_duplicates(subset=['title', 'artists'])

In [8]:
# Search for song
title = 'Sweater Weather'
artist = ''

song_list[song_list['title'].str.contains(title, na=False, case=False) & song_list['artists'].str.contains(artist, na=False, case=False)]

Unnamed: 0,artists,title,playlist,songs
9509,The Neighbourhood,Sweater Weather,Everything at once,8201
427147,Slaves,Sweater Weather,Punk Goes Pop,237649
881070,The Neighbourhood,Sweater Weather - Spotify Sessions Curated by ...,relaxing,420506
925470,Scott Bradlee's Postmodern Jukebox,Sweater Weather,Scott Bradlee & Postmodern Jukebox – Clubbin' ...,439411
958803,Alyson Stoner & MAX,Sweater Weather,Sancta Sapientia,451516
1027908,Vitamin String Quartet,Sweater Weather (String Quartet Tribute to the...,Vitamin String,477717
1295062,Sledding With Tigers,Sweater Weather,Calm,565496
1541242,Earlimart,Sweater Weather,Summer 14,642067
1655653,Piano Tribute Players,Sweater Weather,Piano,681434
2288350,The Vamps,Sweater Weather,The Vamps,847654


In [9]:
for song_id, score in model.wv.most_similar(9509, topn=10):
    song = song_list[song_list['songs'] == song_id].head(1)
    print(f'{song_id:10} {song["title"].values[0]:30} by {song["artists"].values[0]:20} -- {score:.4f}')

    211939 Played-A-Live (The Bongo Song) - Radio Cut by Safri Duo            -- 0.9713
     55504 Please Don't Go                by Double You           -- 0.9451
    331423 Pjanoo                         by Eric Prydz           -- 0.9379
    226905 Party Shaker - Video Edit      by R.I.O.               -- 0.9283
      2051 Play Hard (feat. Ne-Yo & Akon) [New Edit] by David Guetta         -- 0.9247
   1284456 Orbion - Max Graham vs Protoculture Remix by Armin van Buuren     -- 0.9225
    419847 Powerbeat                      by Broiler              -- 0.9213
     53849 Put Your Hands Up For Detroit  by Fedde Le Grand       -- 0.9211
    707084 Rain Down Love                 by Freemasons           -- 0.9182
    136860 Pressure                       by Ylvis                -- 0.9176
