In [5]:
import pandas as pd
import numpy as np
import feather
import pprint

#source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [2]:
mxm_dataset = pd.read_feather('mxm_dataset.feather')
stop_words_tidytext = pd.read_feather('stop_words_tidytext')

In [5]:
#sample the data for quick initial analysis
tf_data = mxm_dataset.sample(frac= 1, random_state = 0).reset_index()
features = tf_data.columns

In [6]:
#remove english stopwords from tidytext list
stop_words = []
for i in stop_words_tidytext.word:
    if i in features:
        stop_words.append(i)

tf_data = tf_data.drop(stop_words, axis=1)
tf_data = tf_data.drop(['track_id', 'index'], axis=1)

In [7]:
#convery to tfidf to emphasize words that occur less frequently
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
features = tf_data.columns
tf_data = csr_matrix(tf_data)
tfidf = TfidfTransformer()
tfidf_data = tfidf.fit_transform(tf_data)
tf_data = pd.DataFrame(tf_data.toarray(), columns=features)
tfidf_data = pd.DataFrame(tfidf_data.toarray(), columns=features)

In [8]:
#extract topics
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

#lda with tfidf
lda_tfidf = LatentDirichletAllocation(n_topics=10, random_state=0)
lda_tfidf.fit(tfidf_data)

#lda with  tf
lda_tf = LatentDirichletAllocation(n_topics=10, random_state=0)
lda_tf.fit(tf_data)

#nmf with if
nmf_tfidf = NMF(n_components=10, random_state=0)
nmf_tfidf.fit(tfidf_data)

#nmf with if
nmf_tf = NMF(n_components=10, random_state=0)
nmf_tf.fit(tf_data)



NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=0, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [9]:
#returns the top words for each topic
def corpus_topics_top_words(model, features, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[topic_idx] = [features[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [10]:
#shows topic weights for each song
def song_topics(model, song):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[topic_idx] = sum(topic*song)
    return topic_dict

In [11]:
#tfidf, lda topic words
top_per_topic_words = corpus_topics_top_words(lda_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'babi', 'time', 'feel', 'yeah', 'ca', 'gonna', 'heart', 'wanna', 'girl']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'es', 'ich', 'tu', 'se']
2 ['che', 'di', 'na', 'il', 'ja', 'la', 'se', 'mi', 'è', 'ma']
3 ['i’m', 'don’t', 'it’', 'mari', 'refrain', 'you’r', 'warrior', '–', 'can’t', 'ye']
4 ['love', 'time', 'feel', 'day', 'life', 'ca', 'eye', 'world', 'live', 'heart']
5 ['nigga', 'ya', 'shit', 'fuck', 'rock', 'yo', 'em', 'yeah', 'bitch', 'wanna']
6 ['jag', 'da', 'det', 'och', 'som', 'du', 'og', 'ba', 'på', 'är']
7 ['la', 'je', 'de', 'les', 'le', 'pas', 'dan', 'des', 'qui', 'cest']
8 ['god', 'death', 'lord', 'blood', 'soul', 'die', 'jesus', 'burn', 'dark', 'earth']
9 ['christma', 'don', 'whoa', 'll', 'yea', 've', 'hallelujah', 'ni', 'wa', 'woah']


In [12]:
#tf, lda topic words
top_per_topic_words = corpus_topics_top_words(lda_tf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'day', 'heart', 'night', 'feel', 'time', 'dream', 'eye', 'fall', 'alway']
1 ['la', 'de', 'le', 'je', 'les', 'da', 'di', 'il', 'tu', 'che']
2 ['love', 'babi', 'yeah', 'gonna', 'wanna', 'girl', 'hey', 'ooh', 'littl', 'gotta']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'es', 'ein']
4 ['nigga', 'ya', 'caus', 'rock', 'shit', 'boy', 'play', 'fuck', 'money', 'everybodi']
5 ['na', 'de', 'eu', 'push', 'não', 'é', 'ik', 'um', 'doo', 'gimm']
6 ['burn', 'run', 'dead', 'kill', 'fire', 'blood', 'die', 'black', 'head', 'death']
7 ['ca', 'time', 'whi', 'tri', 'life', 'feel', 'caus', 'noth', 'wo', 'mind']
8 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'yo']
9 ['world', 'god', 'soul', 'lord', 'live', 'free', 'life', 'heaven', 'war', 'save']


In [13]:
#tfidf, NMF topic words
top_per_topic_words = corpus_topics_top_words(nmf_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['ca', 'feel', 'whi', 'tri', 'believ', 'wo', 'caus', 'someth', 'noth', 'everyth']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'por']
2 ['love', 'heart', 'true', 'girl', 'onli', 'kiss', 'forev', 'alway', 'hold', 'sweet']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'ein', 'mich']
4 ['je', 'de', 'la', 'les', 'le', 'pas', 'des', 'dan', 'qui', 'à']
5 ['babi', 'girl', 'ooh', 'night', 'pleas', 'littl', 'cri', 'tonight', 'babe', 'honey']
6 ['yeah', 'gonna', 'wanna', 'girl', 'hey', 'nigga', 'ya', 'gotta', 'caus', 'fuck']
7 ['che', 'di', 'la', 'il', 'è', 'mi', 'ma', 'da', 'ti', 'io']
8 ['life', 'day', 'world', 'night', 'eye', 'dream', 'live', 'light', 'heart', 'fall']
9 ['time', 'mind', 'wait', 'gonna', 'chang', 'wast', 'everi', 'day', 'mine', 'tri']


In [14]:
#tf, NMF topic words
top_per_topic_words = corpus_topics_top_words(nmf_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['ca', 'feel', 'whi', 'tri', 'believ', 'wo', 'caus', 'someth', 'noth', 'everyth']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'por']
2 ['love', 'heart', 'true', 'girl', 'onli', 'kiss', 'forev', 'alway', 'hold', 'sweet']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'ein', 'mich']
4 ['je', 'de', 'la', 'les', 'le', 'pas', 'des', 'dan', 'qui', 'à']
5 ['babi', 'girl', 'ooh', 'night', 'pleas', 'littl', 'cri', 'tonight', 'babe', 'honey']
6 ['yeah', 'gonna', 'wanna', 'girl', 'hey', 'nigga', 'ya', 'gotta', 'caus', 'fuck']
7 ['che', 'di', 'la', 'il', 'è', 'mi', 'ma', 'da', 'ti', 'io']
8 ['life', 'day', 'world', 'night', 'eye', 'dream', 'live', 'light', 'heart', 'fall']
9 ['time', 'mind', 'wait', 'gonna', 'chang', 'wast', 'everi', 'day', 'mine', 'tri']


In [15]:
lda_tf_2 = LatentDirichletAllocation(n_topics=2, random_state=0)
lda_tf_2.fit(tf_data)



0 ['love', 'time', 'feel', 'ca', 'babi', 'yeah', 'day', 'life', 'caus', 'heart']
1 ['la', 'de', 'en', 'el', 'tu', 'te', 'se', 'mi', 'es', 'ich']


In [18]:
top_per_topic_words = corpus_topics_top_words(lda_tf_2, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'time', 'feel', 'ca', 'babi', 'yeah', 'day', 'life', 'caus', 'heart']
1 ['la', 'de', 'en', 'el', 'tu', 'te', 'se', 'mi', 'es', 'ich']


In [21]:
lda_tf_5 = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_tf_5.fit(tf_data)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [19]:
top_per_topic_words = corpus_topics_top_words(lda_tf_5, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'world', 'life', 'eye', 'light', 'heart', 'dream', 'god', 'die', 'soul']
1 ['la', 'de', 'en', 'el', 'tu', 'te', 'se', 'mi', 'le', 'si']
2 ['babi', 'yeah', 'hey', 'ya', 'wanna', 'girl', 'nigga', 'rock', 'ooh', 'gonna']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'da', 'das', 'ist', 'es']
4 ['love', 'time', 'ca', 'feel', 'day', 'caus', 'tri', 'whi', 'gonna', 'life']


In [17]:
lda_tf_25 = LatentDirichletAllocation(n_topics=25, random_state=0)
lda_tf_25.fit(tf_data)



0 ['love', 'heart', 'alway', 'hold', 'feel', 'onli', 'kiss', 'true', 'pleas', 'mine']
1 ['na', 'da', 'di', 'che', 'la', 'il', 'se', 'mi', 'eu', 'ma']
2 ['babi', 'ooh', 'woman', 'jag', 'det', 'babe', 'du', 'alright', 'crazi', 'och']
3 ['world', 'god', 'heaven', 'war', 'angel', 'live', 'earth', 'king', 'fight', 'born']
4 ['walk', 'blue', 'black', 'town', 'rememb', 'white', 'red', 'watch', 'morn', 'shine']
5 ['run', 'call', 'friend', 'nobodi', 'beauti', 'lover', 'care', 'fool', 'river', 'push']
6 ['die', 'burn', 'fire', 'ah', 'dead', 'hell', 'flame', 'citi', 'kill', 'devil']
7 ['night', 'light', 'dream', 'sky', 'star', 'fli', 'wind', 'dark', 'alon', 'sleep']
8 ['time', 'feel', 'life', 'ca', 'live', 'tri', 'mind', 'believ', 'day', 'chang']
9 ['everyth', 'talk', 'anyth', 'easi', 'somebodi', 'honey', 'sorri', 'cos', 'drive', 'dem']
10 ['yeah', 'everybodi', 'uh', 'mama', 'yea', 'bout', 'feelin', 'parti', 'brother', 'babi']
11 ['ya', 'de', 'ik', 'en', 'van', 'je', 'bien', 'pa', 'dat', 'het']
1

In [20]:
top_per_topic_words = corpus_topics_top_words(lda_tf_25, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'heart', 'alway', 'hold', 'feel', 'onli', 'kiss', 'true', 'pleas', 'mine']
1 ['na', 'da', 'di', 'che', 'la', 'il', 'se', 'mi', 'eu', 'ma']
2 ['babi', 'ooh', 'woman', 'jag', 'det', 'babe', 'du', 'alright', 'crazi', 'och']
3 ['world', 'god', 'heaven', 'war', 'angel', 'live', 'earth', 'king', 'fight', 'born']
4 ['walk', 'blue', 'black', 'town', 'rememb', 'white', 'red', 'watch', 'morn', 'shine']
5 ['run', 'call', 'friend', 'nobodi', 'beauti', 'lover', 'care', 'fool', 'river', 'push']
6 ['die', 'burn', 'fire', 'ah', 'dead', 'hell', 'flame', 'citi', 'kill', 'devil']
7 ['night', 'light', 'dream', 'sky', 'star', 'fli', 'wind', 'dark', 'alon', 'sleep']
8 ['time', 'feel', 'life', 'ca', 'live', 'tri', 'mind', 'believ', 'day', 'chang']
9 ['everyth', 'talk', 'anyth', 'easi', 'somebodi', 'honey', 'sorri', 'cos', 'drive', 'dem']
10 ['yeah', 'everybodi', 'uh', 'mama', 'yea', 'bout', 'feelin', 'parti', 'brother', 'babi']
11 ['ya', 'de', 'ik', 'en', 'van', 'je', 'bien', 'pa', 'dat', 'het']
1

# Assigning topics

After exploring a few basic models, here are my initial takeaways from the model I believe performed the best:
* lda model
* term frequency as the word representation
* 25 topic counts.

There is still a lot of room for improvement, but I think there is  quite a bit we could do with these topics as a supplemental part of this project. Each word is assigned a weight to each topic, so we can provide a score for each song (therefore also each artist, location, etc..) for how prevelent each topic is. Below are the top 10 weighted words in each topic, with the topic name I selected (I'm only selecting topics which appeared clear, there are another 13 topics at the bottom of this notebook). 

Would love to hear any feedback.

Clear Topics:
* **Love**: 'love', 'heart', 'alway', 'hold', 'feel', 'onli', 'kiss', 'true', 'pleas', 'mine'
* **Pain/Loss/Fear**: 'blind', 'death', 'lost', 'fear', 'control', 'learn', 'build', 'scream', 'becom', 'bird'
* **Religion**: 'world', 'god', 'heaven', 'war', 'angel', 'live', 'earth', 'king', 'fight', 'born'
* **Death**: 'die', 'burn', 'fire', 'ah', 'dead', 'hell', 'flame', 'citi', 'kill', 'devil'
* **Dancing**: 'wanna', 'danc', 'caus', 'ride', 'bad', 'beat', 'readi', 'shake', 'move', 'gonna'


Clear Groupings that Don't Indicate a Topic:
*  **"sing/songy words**: 'na', 'da', 'di', 'che', 'la', 'il', 'se', 'mi', 'eu', 'ma'
*  **Sleep/Night**: 'night', 'light', 'dream', 'sky', 'star', 'fli', 'wind', 'dark', 'alon', 'sleep'
*  **Dutch?**: 'ya', 'de', 'ik', 'en', 'van', 'je', 'bien', 'pa', 'dat', 'het'
*  **Hip/hop genre**:'fuck', 'nigga', 'shit', 'yo', 'em', 'ya', 'bitch', 'ai', 'yall', 'ass'
*  **German**:'ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'es', 'ein'
*  **Spanish**: 'de', 'la', 'el', 'en', 'te', 'mi', 'tu', 'se', 'es', 'yo'


Topics I'd say are "a bit of a stretch":
* **motivation/inpiration**: 'time', 'feel', 'life', 'ca', 'live', 'tri', 'mind', 'believ', 'day', 'chang'