In [1]:
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import glob
import datetime
import time
import sys
import os
import tables
import io
from zipfile import ZipFile
from gensim.models.ldamodel import LdaModel
import gensim as gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss, accuracy_score
from itertools import chain




*Attempts to group song lyrics into topics and matching hot songs' words back to songs*

In [2]:
# Path to the Million Song Dataset subset (uncompressed) - change to the location on your laptop
# Cannot store this on github as it is too large
msd_subset_path = '../../MSD_data/MillionSongSubset/'

# Keep these - folders match the structure of the uncompressed file
msd_subset_data_path = os.path.join(msd_subset_path, 'data')
msd_subset_addf_path = os.path.join(msd_subset_path, 'AdditionalFiles')

In [3]:
# Uses CountVectorizer to extract words from texts
def getCntVect(what, stopwords) :
    count_vect = CountVectorizer(input='fname',
                                 stop_words=stopwords)
    dataset = count_vect.fit_transform(what)
    return (count_vect, dataset)

# Parses MXM file (from MSD website) and returns track - reconstructed song text (obviously, without order)
def getText(fname, check_id = None) :
    track2i = {}
    i = 0
    allStr = []
    text = []
    with ZipFile('../../MSD_data/' + fname + '.zip') as z:
        with z.open(fname) as src:
            for line in src:
                line = line.decode("utf-8")
                if line[0] == '#' :
                    continue
                if line[0] == '%' :
                    all_words = line[1:].split(',')
                    continue
                parts = line.split(',')
                trackid = parts[0]
                if check_id != None and trackid not in check_id :
                    continue
                track2i[trackid] = i
                i += 1
                str = ''
                strOnly = set()
                for i in range(2, len(parts)) :
                    code = parts[i].split(':')
                    w = all_words[int(code[0]) - 1]
                    strOnly.add(w)
                    for j in range(int(code[1])) :
                        str += w
                        str += ' '
                text.append(str)
                allStr.append(strOnly)

    return (track2i, text, all_words, allStr)

(track2i, text, all_words, allStr) = getText('mxm_dataset.txt')

In [4]:
# Runs LdaModel on MXM texts
def doLda(text) :
    (cnt, data) = getCntVect(text, [])
    corpus = gensim.matutils.Sparse2Corpus(data, documents_columns=False)
    id_map = dict((v, k) for k, v in cnt.vocabulary_.items())
    k = 4
    model = LdaModel(corpus, num_topics=k, id2word = id_map, passes=10)
    top_words = [[word for word,_ in model.show_topic(topicno, topn=50)] for topicno in range(model.num_topics)]
    top_betas = [[beta for _,beta in model.show_topic(topicno, topn=50)] for topicno in range(model.num_topics)]
    topn = 10
    for t in range(k) :
        print("Topic", t + 1)
        for w, b in zip(top_words[t][:topn], top_betas[t][:topn]) :
            print("\t", w, b)
    return (model, corpus, id_map, top_words, top_betas, topn)

In [5]:
(model, corpus, id_map, top_words, top_betas, topn) = doLda(text)

Topic 1
	 de 0.022160897
	 ich 0.018060055
	 du 0.01805798
	 la 0.016165841
	 je 0.015607751
	 und 0.014444811
	 le 0.014070877
	 na 0.01399028
	 et 0.013456377
	 les 0.012770066
Topic 2
	 que 0.04897225
	 la 0.043464936
	 de 0.03538346
	 no 0.031064974
	 me 0.024077276
	 el 0.022054244
	 mi 0.019173343
	 te 0.01858312
	 en 0.017377324
	 se 0.015482152
Topic 3
	 the 0.084077306
	 and 0.03258288
	 of 0.024401527
	 in 0.023476878
	 to 0.0221634
	 is 0.015325431
	 my 0.013465866
	 we 0.012949138
	 on 0.012298603
	 it 0.009627162
Topic 4
	 you 0.068860695
	 not 0.027703244
	 to 0.02769147
	 it 0.02625265
	 me 0.026121996
	 and 0.019990988
	 do 0.018758655
	 the 0.016844038
	 my 0.015072892
	 that 0.013839817


In [6]:
subset_full = pd.read_pickle(msd_subset_path+'subset_full_clean.pkl')

In [7]:
# For classification
# Simplify by only setting top 25% of song_hotness to be "hot" and the rest "not"

def convert_y_to_categorical(cutoff = 0.75):
    threshold = y.quantile(cutoff)
    Y = [0 if i < threshold else 1 for i in y]
    return np.array(Y)

In [8]:
# Do LDA for only hit songs
y = subset_full['song_hotttnesss']
y = convert_y_to_categorical()
track_hit = subset_full[y == 1].track_id
tr_hit_set = set()
for tr in track_hit :
    tr_hit_set.add(tr)
(track2iH, textH, all_wordsH, allStrH) = getText('mxm_dataset.txt', tr_hit_set)

In [9]:
(modelH, corpusH, id_mapH, top_wordsH, top_betasH, topnH) = doLda(textH)

Topic 1
	 you 0.051109936
	 the 0.03854662
	 to 0.026324727
	 it 0.025889928
	 and 0.02138989
	 not 0.018996118
	 my 0.018835433
	 me 0.018000921
	 is 0.015719961
	 that 0.012735757
Topic 2
	 the 0.058687788
	 and 0.031193392
	 of 0.023487253
	 to 0.02131898
	 we 0.020014198
	 in 0.014852025
	 is 0.01413148
	 will 0.01336591
	 your 0.013263011
	 you 0.013147092
Topic 3
	 it 0.025667327
	 me 0.022456665
	 you 0.020410355
	 the 0.019318076
	 and 0.019004613
	 my 0.015383999
	 oh 0.014782661
	 am 0.014333639
	 to 0.013418681
	 get 0.0132293
Topic 4
	 que 0.043104775
	 la 0.028041458
	 de 0.02220759
	 me 0.017549973
	 no 0.01712435
	 en 0.01573908
	 el 0.014656312
	 tu 0.013585265
	 mi 0.013338813
	 te 0.012018661


In [19]:
# Extract words which are present in topics of hit songs but not all songs
allW = set(chain.from_iterable(top_words))
hotW = set(chain.from_iterable(top_wordsH))
keyWords = hotW - allW
print('Words in hot songs:', keyWords)

Words in hot songs: {'saturday', 'dos', 'parti', 'cada', 'turn', 'nos', 'white', 'ser', 'here', 'tanto', 'see', 'est', 'again', 'whoa', 'tengo', 'ven', 'nah', 'wait', 'togeth', 'start', 'call', 'light', 'freez'}


In [20]:
# Make sure subset_full only has tracks we have lyrics for
track_ids = subset_full.values[:, 0]
for i in range(subset_full.shape[0]) :
    track = track_ids[i]
    if track not in track2i :
        subset_full.drop(subset_full[subset_full.track_id == track].index, inplace=True)

In [21]:
# Predict a track to be hit if it contains key words of hit songs
y_pred = np.zeros(subset_full.shape[0])
track_ids = subset_full.values[:, 0]
c = 0
for i in range(subset_full.shape[0]) :
    track = track_ids[i]
    if track in track2i :
        j = track2i[track]
        for w in keyWords :
            if w in allStr[j] :
                y_pred[i] = 1
    else :
        print ('track not found', track)
        c += 1

In [22]:
y = subset_full['song_hotttnesss']
y = convert_y_to_categorical()

In [23]:
# Check that nothing is skipped - set was already cleaned 
assert c == 0

In [24]:
print('Accuracy', accuracy_score(y, y_pred))

Accuracy 0.42206235011990406


*42% accuracy - not bad!*