In [1]:
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import glob
import datetime
import time
import sys
import os
import tables
import io
from zipfile import ZipFile
from gensim.models.ldamodel import LdaModel
import gensim as gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss, accuracy_score
from itertools import chain




*Attempts to group song lyrics into topics and matching hot songs' words back to songs*

In [2]:
# Path to the Million Song Dataset subset (uncompressed) - change to the location on your laptop
# Cannot store this on github as it is too large
msd_subset_path = '../../MSD_data/MillionSongSubset/'

# Keep these - folders match the structure of the uncompressed file
msd_subset_data_path = os.path.join(msd_subset_path, 'data')
msd_subset_addf_path = os.path.join(msd_subset_path, 'AdditionalFiles')

In [3]:
# Uses CountVectorizer to extract words from texts
def getCntVect(what, stopwords) :
    count_vect = CountVectorizer(input='fname',
                                 stop_words=stopwords)
    dataset = count_vect.fit_transform(what)
    return (count_vect, dataset)

# Parses MXM file (from MSD website) and returns track - reconstructed song text (obviously, without order)
def getText(fname, check_id = None) :
    track2i = {}
    i = 0
    allStr = []
    text = []
    with ZipFile('../../MSD_data/' + fname + '.zip') as z:
        with z.open(fname) as src:
            for line in src:
                line = line.decode("utf-8")
                if line[0] == '#' :
                    continue
                if line[0] == '%' :
                    all_words = line[1:].split(',')
                    continue
                parts = line.split(',')
                trackid = parts[0]
                if check_id != None and trackid not in check_id :
                    continue
                track2i[trackid] = i
                i += 1
                str = ''
                strOnly = set()
                for i in range(2, len(parts)) :
                    code = parts[i].split(':')
                    w = all_words[int(code[0]) - 1]
                    strOnly.add(w)
                    for j in range(int(code[1])) :
                        str += w
                        str += ' '
                text.append(str)
                allStr.append(strOnly)

    return (track2i, text, all_words, allStr)

(track2i, text, all_words, allStr) = getText('mxm_dataset.txt')

In [4]:
# Runs LdaModel on MXM texts
def doLda(text) :
    (cnt, data) = getCntVect(text, [])
    corpus = gensim.matutils.Sparse2Corpus(data, documents_columns=False)
    id_map = dict((v, k) for k, v in cnt.vocabulary_.items())
    k = 4
    model = LdaModel(corpus, num_topics=k, id2word = id_map, passes=10)
    top_words = [[word for word,_ in model.show_topic(topicno, topn=50)] for topicno in range(model.num_topics)]
    top_betas = [[beta for _,beta in model.show_topic(topicno, topn=50)] for topicno in range(model.num_topics)]
    topn = 10
    for t in range(k) :
        print("Topic", t + 1)
        for w, b in zip(top_words[t][:topn], top_betas[t][:topn]) :
            print("\t", w, b)
    return (model, corpus, id_map, top_words, top_betas, topn)

In [5]:
(model, corpus, id_map, top_words, top_betas, topn) = doLda(text)

Topic 1
	 you 0.06896217
	 not 0.02772753
	 to 0.027723389
	 it 0.02626611
	 me 0.026186557
	 and 0.01999828
	 do 0.018791465
	 the 0.016847981
	 my 0.015083564
	 that 0.013842474
Topic 2
	 ich 0.039700765
	 und 0.031753503
	 die 0.029994661
	 du 0.025936678
	 da 0.023506653
	 der 0.01786892
	 nicht 0.016851075
	 das 0.015330857
	 ist 0.014606803
	 in 0.014363281
Topic 3
	 the 0.084027715
	 and 0.03257721
	 of 0.02437285
	 in 0.023455951
	 to 0.022142988
	 is 0.015319636
	 my 0.013463411
	 we 0.0129176825
	 on 0.012376971
	 it 0.009642142
Topic 4
	 la 0.039082862
	 que 0.038148064
	 de 0.036884256
	 no 0.02104369
	 me 0.019135416
	 en 0.017834427
	 el 0.015212119
	 un 0.014472179
	 te 0.014052592
	 tu 0.013633844


In [6]:
subset_full = pd.read_pickle(msd_subset_path+'subset_full_clean.pkl')

In [7]:
# For classification
# Simplify by only setting top 25% of song_hotness to be "hot" and the rest "not"

def convert_y_to_categorical(cutoff = 0.75):
    threshold = y.quantile(cutoff)
    Y = [0 if i < threshold else 1 for i in y]
    return np.array(Y)

In [8]:
# Do LDA for only hit songs
y = subset_full['song_hotttnesss']
y = convert_y_to_categorical()
track_hit = subset_full[y == 1].track_id
tr_hit_set = set()
for tr in track_hit :
    tr_hit_set.add(tr)
(track2iH, textH, all_wordsH, allStrH) = getText('mxm_dataset.txt', tr_hit_set)

In [9]:
(modelH, corpusH, id_mapH, top_wordsH, top_betasH, topnH) = doLda(textH)

Topic 1
	 que 0.04219505
	 la 0.022654649
	 de 0.022192998
	 me 0.016727462
	 no 0.015936652
	 en 0.015661983
	 el 0.014473983
	 tu 0.013031961
	 mi 0.013021611
	 te 0.011659139
Topic 2
	 you 0.046358056
	 and 0.03077111
	 not 0.018667452
	 it 0.017111741
	 the 0.016676141
	 to 0.01463232
	 is 0.013307138
	 my 0.011992037
	 have 0.011016317
	 do 0.010618285
Topic 3
	 the 0.07309293
	 and 0.026463933
	 of 0.021253124
	 to 0.02118038
	 we 0.019077085
	 it 0.017731711
	 in 0.017420586
	 is 0.015665404
	 your 0.014126421
	 you 0.013375721
Topic 4
	 you 0.05128066
	 the 0.031130154
	 to 0.026808992
	 it 0.024223857
	 me 0.021972094
	 and 0.021395301
	 not 0.019841786
	 my 0.017942755
	 is 0.0146789
	 am 0.012874113


In [10]:
# Extract words which are present in topics of hit songs but not all songs
allW = set(chain.from_iterable(top_words))
hotW = set(chain.from_iterable(top_wordsH))
keyWords = hotW - allW
print('Words in hot songs:', keyWords)

Words in hot songs: {'believ', 'solo', 'whi', 'danc', 'away', 'more', 'see', 'stop', 'tanto', 'start', 'dos', 'think', 'mas', 'cada', 'todo', 'eu', 'nos', 'again', 'hay', 'round', 'parti', 'better', 'não', 'ven', 'would'}


In [11]:
# Make sure subset_full only has tracks we have lyrics for
track_ids = subset_full.values[:, 0]
for i in range(subset_full.shape[0]) :
    track = track_ids[i]
    if track not in track2i :
        subset_full.drop(subset_full[subset_full.track_id == track].index, inplace=True)

In [12]:
# Predict a track to be hit if it contains key words of hit songs
y_pred = np.zeros(subset_full.shape[0])
track_ids = subset_full.values[:, 0]
c = 0
for i in range(subset_full.shape[0]) :
    track = track_ids[i]
    if track in track2i :
        j = track2i[track]
        for w in keyWords :
            if w in allStr[j] :
                y_pred[i] = 1
    else :
        print ('track not found', track)
        c += 1

In [13]:
y = subset_full['song_hotttnesss']
y = convert_y_to_categorical()

In [14]:
# Check that nothing is skipped - set was already cleaned 
assert c == 0

In [15]:
print('Accuracy', accuracy_score(y, y_pred))

Accuracy 0.39148681055155876


*~40% accuracy - much lower than baseline*