In [3]:
import pandas as pd
import datetime
import time
import matplotlib.pyplot as plt

import sqlite3
from gensim.models import Word2Vec
import multiprocessing
import logging

# Preprocessing

In [4]:
ls /home/luongnv/Documents/DataScience/Project-Tripi/data

BPR.ipynb                    data/           ml-100k/
BPR.py                       df_cleaned.csv  ml-100k.zip
cleaned-table/               df_grouped.csv  PhoeniX_cosine_hotel.csv
cosine_hotel_500samples.csv  expedia-hotel/  plot-sample.ipynb
cosine_hotel.csv             mapped.ipynb    public/
cosine_hotel_rank1.csv       mapping.csv     ranking.ipynb
cosine_hotel_rank1_full.csv  mapping.ipynb   table/


# Training the Model

In [None]:
# Path of the SQLite database file queried by the cells below.
DATA_PATH = '/home/luongnv/Documents/DataScience/Project-Tripi/data/public/music-session/spud.sqlite'

# The connection is deliberately left open: a later cell passes `spud` to pandas.
spud = sqlite3.connect(DATA_PATH)
cur = spud.cursor()
# List the tables stored in the database. The string literal is single-quoted
# because SQLite parses a double-quoted token as an identifier first and only
# falls back to a string for legacy compatibility (builds with that fallback
# disabled reject it).
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
cur.fetchall()

In [None]:
# Join every listen event with its track, artist and album rows.
# The query is spelled as adjacent string literals, which Python concatenates
# into a single-line SQL statement.
listens_query = (
    'SELECT '
    '    l.user AS user_id, '
    '    l.date AS listen_date, '
    '    t.trackid AS track_id, '
    '    t.title AS track_title, '
    '    a.artistid AS artist_id, '
    '    a.name AS artist_name, '
    '    m.albumid AS album_id, '
    '    m.name AS album_name, '
    '    m.artist AS album_artist '
    '  FROM lastfmtracklistens AS l '
    '  INNER JOIN tracks t ON l.track = t.trackid '
    '  INNER JOIN artists a ON t.artist = a.artistid '
    '  INNER JOIN albums m ON t.album = m.albumid;'
)
dataset = pd.read_sql(listens_query, con=spud)

# (rows, columns) of the joined result.
print(dataset.shape)

In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
dataset.sample(10, random_state=42)

In [None]:
# Parse the timestamp strings in one vectorized call instead of a per-row
# strptime. Same format string as before (the '+00:00' suffix is matched
# literally, so the result is timezone-naive), and the cell is now safe to
# re-run on an already-parsed column.
dataset['listen_date'] = pd.to_datetime(dataset['listen_date'], format='%Y-%m-%d %H:%M:%S+00:00')
# Order listens chronologically so that per-user sessions built later follow play order.
dataset.sort_values('listen_date', ascending=True, inplace=True)

In [None]:
# Number of listens per track, then a histogram of those counts restricted to
# the 0-50 range (one bin per unit).
track_listen_counts = dataset["track_id"].value_counts().values
_ = plt.hist(track_listen_counts, bins=50, range=(0, 50))

In [None]:
start = time.time()
# One session per user: that user's track ids, as strings, in the row order of
# `dataset`. Converting the whole column per group avoids the row-by-row
# iterrows() loop while producing the same lists.
sessions = [
    user_listens['track_id'].astype(str).tolist()
    for _, user_listens in dataset.groupby('user_id', sort=False)
]

print("Took {}'s".format(time.time() - start))

# Show the first user's session as a sanity check.
print(sessions[0])

In [None]:
def make_ngrams(session, n=5):
    """Return every run of ``n`` consecutive items of ``session`` as a tuple.

    Windows advance one item at a time and are returned in order. The result
    is an empty list when ``session`` has fewer than ``n`` items or when ``n``
    is not positive.
    """
    if n <= 0:
        return []
    window_count = len(session) - n + 1
    return [tuple(session[start:start + n]) for start in range(window_count)]

In [None]:
# Flatten the 11-item sliding windows of every session into one training list.
# Sessions with fewer than 11 tracks yield no windows and so contribute nothing.
gen_sessions = [
    window
    for session in sessions
    for window in make_ngrams(session, 11)
]

print(len(gen_sessions))

In [None]:
# Emit INFO-level log records with a timestamped format.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly. setLevel() is the public API and, unlike
# assigning to `.level`, invalidates the loggers' cached level checks.
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Hyper-parameters for the skip-gram model, gathered in one place.
w2v_params = dict(
    size=32,              # dimensionality of the track vectors
    window=5,             # context window size
    sg=1,                 # skip-gram rather than CBOW
    hs=0,                 # no hierarchical softmax -> negative sampling
    negative=20,          # negative samples drawn per positive example
    ns_exponent=-0.5,     # exponent shaping the negative-sampling distribution
    sample=1e-4,          # down-sampling threshold for frequent tokens
    workers=(2 * multiprocessing.cpu_count() + 1),
    iter=5,               # training epochs over the corpus
)

# Train on the n-gram windows, then persist the model to disk.
model = Word2Vec(gen_sessions, **w2v_params)
model.save("music_session.model")

# Exploring the Model

## Most similar to:

In [None]:
# The trained model is bound to `model`, and its vocabulary consists of
# track-id strings, so query with a token taken from the vocabulary itself.
# `index2word` is the gensim 3.x vocabulary list, matching the `size=`/`iter=`
# arguments used for training.
model.wv.most_similar(positive=[model.wv.index2word[0]])

## Similarities:

In [None]:
# Cosine similarity between two tracks that are known to be in the vocabulary.
# The trained model is bound to `model`; `index2word` is the gensim 3.x
# vocabulary list of track-id strings.
model.wv.similarity(model.wv.index2word[0], model.wv.index2word[1])