http://www.cs.ucl.ac.uk/fileadmin/UCL-CS/research/Research_Notes/RN_11_21.pdf

In [28]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import crud as crud
import pandas as pd
import matplotlib
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

import scipy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
train = pd.read_csv("../experiments/triplets/train.csv")
test = pd.read_csv("../experiments/triplets/test.csv")
df = pd.concat([train, test])
del train, test
df.shape

(1549423, 5)

In [3]:
conn = crud.create_connection("../db/track_metadata.db")
tables = crud.get_tables(conn)

for table_name in tables.name.tolist():
    print(table_name)
    records = crud.get_records(conn, table_name)
    print(records.head())
conn.close()   

records_columns = ['song_id', 'artist_id']
songs = records[records_columns].drop_duplicates()
del records
songs.head()

songs
             track_id              title             song_id  \
0  TRMMMYQ128F932D901       Silent Night  SOQMMHC12AB0180CB8   
1  TRMMMKD128F425225D        Tanssi vaan  SOVFVAK12A8C1350D9   
2  TRMMMRX128F93187D9  No One Could Ever  SOGTUKN12AB017F4F1   
3  TRMMMCH128F425532C      Si Vos Querés  SOBNYVR12A8C13558C   
4  TRMMMWA128F426B589   Tangle Of Aspens  SOHSBXH12A8C13B0DF   

                                release           artist_id  \
0                 Monster Ballads X-Mas  ARYZTJS1187B98C555   
1                           Karkuteillä  ARMVN3U1187FB3A1EB   
2                                Butter  ARGEKB01187FB50750   
3                               De Culo  ARNWYLR1187B9B2F9C   
4  Rene Ablaze Presents Winter Sessions  AREQDTE1269FB37231   

                            artist_mbid       artist_name   duration  \
0  357ff05d-848a-44cf-b608-cb34b5701ae5  Faster Pussy cat  252.05506   
1  8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9  Karkkiautomaatti  156.55138   
2  3d403d44-36

Unnamed: 0,song_id,artist_id
0,SOQMMHC12AB0180CB8,ARYZTJS1187B98C555
1,SOVFVAK12A8C1350D9,ARMVN3U1187FB3A1EB
2,SOGTUKN12AB017F4F1,ARGEKB01187FB50750
3,SOBNYVR12A8C13558C,ARNWYLR1187B9B2F9C
4,SOHSBXH12A8C13B0DF,AREQDTE1269FB37231


In [4]:
df = df.merge(songs, on="song_id")
df['user_id_idx'] = df['user_id_idx'].astype(str)
df.shape

(1549423, 6)

In [5]:
artist_user = df.groupby('artist_id')['user_id_idx'].apply(' '.join).reset_index()
artist_user_list = artist_user.user_id_idx.tolist()
vectorizer = CountVectorizer(analyzer='word')
data_vectorized = vectorizer.fit_transform(artist_user_list)

In [7]:
lda_model = LatentDirichletAllocation(n_components=20, max_iter=10, learning_method='online', 
                                      random_state=100, batch_size=128, evaluate_every = -1,
                                      n_jobs = -1)
lda_output = lda_model.fit_transform(data_vectorized)

In [8]:
print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [38]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

Log Likelihood:  -20122392.43560283
Perplexity:  436852.15223394043


In [37]:
len(lda_output)

3689

In [24]:
lda_output[0]

array([1.05617759e-01, 6.67159822e-01, 1.60771704e-04, 1.60771704e-04,
       1.60771704e-04, 1.60771707e-04, 1.60771705e-04, 1.60771706e-04,
       1.60771704e-04, 1.31457471e-02, 1.60771704e-04, 1.60771705e-04,
       1.60771704e-04, 1.60771707e-04, 4.88107829e-02, 1.60771704e-04,
       1.60771704e-04, 1.60771705e-04, 1.60771706e-04, 1.62854314e-01])

In [12]:
# similarity of artist vectors
scipy.spatial.distance.jensenshannon(lda_output[0], lda_output[1])

0.8234229503227553

In [26]:
# entropy (diverseness) of each artist
scipy.stats.entropy(lda_output[0])

1.028410307429089

In [35]:
# Define Search Param
search_params = {'n_components': [3, 4, 5, 6, 7, 8, 9, 10, 12], 'learning_decay': [.7, .8, 0.85, .9, 0.95]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online',random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)


GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [3, 4, 5, 6, 7, 8, 9, 10, 12], 'learning_decay': [0.7, 0.8, 0.85, 0.9, 0.95]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.95, 'n_components': 3}
Best Log Likelihood Score:  -7378392.735243574
Model Perplexity:  355429.7258744338
