In [1]:
import pandas as pd
import numpy as np
import feather
import pprint

#source: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [2]:
mxm_dataset = pd.read_feather('mxm_dataset.feather')
stop_words_tidytext = pd.read_feather('stop_words_tidytext')

In [5]:
#sample the data for quick initial analysis
tf_data = mxm_dataset.sample(frac= 1, random_state = 0).reset_index()
features = tf_data.columns

In [6]:
#remove english stopwords from tidytext list
stop_words = []
for i in stop_words_tidytext.word:
    if i in features:
        stop_words.append(i)

tf_data = tf_data.drop(stop_words, axis=1)
tf_data = tf_data.drop(['track_id', 'index'], axis=1)

In [7]:
#convery to tfidf to emphasize words that occur less frequently
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix
features = tf_data.columns
tf_data = csr_matrix(tf_data)
tfidf = TfidfTransformer()
tfidf_data = tfidf.fit_transform(tf_data)
tf_data = pd.DataFrame(tf_data.toarray(), columns=features)
tfidf_data = pd.DataFrame(tfidf_data.toarray(), columns=features)

In [8]:
#extract topics
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

#lda with tfidf
lda_tfidf = LatentDirichletAllocation(n_topics=10, random_state=0)
lda_tfidf.fit(tfidf_data)

#lda with  tf
lda_tf = LatentDirichletAllocation(n_topics=10, random_state=0)
lda_tf.fit(tf_data)

#nmf with if
nmf_tfidf = NMF(n_components=10, random_state=0)
nmf_tfidf.fit(tfidf_data)

#nmf with if
nmf_tf = NMF(n_components=10, random_state=0)
nmf_tf.fit(tf_data)



NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=0, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [9]:
#returns the top words for each topic
def corpus_topics_top_words(model, features, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[topic_idx] = [features[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topic_dict

In [10]:
#shows topic weights for each song
def song_topics(model, song):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[topic_idx] = sum(topic*song)
    return topic_dict

In [11]:
#tfidf, lda topic words
top_per_topic_words = corpus_topics_top_words(lda_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'babi', 'time', 'feel', 'yeah', 'ca', 'gonna', 'heart', 'wanna', 'girl']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'es', 'ich', 'tu', 'se']
2 ['che', 'di', 'na', 'il', 'ja', 'la', 'se', 'mi', 'è', 'ma']
3 ['i’m', 'don’t', 'it’', 'mari', 'refrain', 'you’r', 'warrior', '–', 'can’t', 'ye']
4 ['love', 'time', 'feel', 'day', 'life', 'ca', 'eye', 'world', 'live', 'heart']
5 ['nigga', 'ya', 'shit', 'fuck', 'rock', 'yo', 'em', 'yeah', 'bitch', 'wanna']
6 ['jag', 'da', 'det', 'och', 'som', 'du', 'og', 'ba', 'på', 'är']
7 ['la', 'je', 'de', 'les', 'le', 'pas', 'dan', 'des', 'qui', 'cest']
8 ['god', 'death', 'lord', 'blood', 'soul', 'die', 'jesus', 'burn', 'dark', 'earth']
9 ['christma', 'don', 'whoa', 'll', 'yea', 've', 'hallelujah', 'ni', 'wa', 'woah']


In [12]:
#tf, lda topic words
top_per_topic_words = corpus_topics_top_words(lda_tf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['love', 'day', 'heart', 'night', 'feel', 'time', 'dream', 'eye', 'fall', 'alway']
1 ['la', 'de', 'le', 'je', 'les', 'da', 'di', 'il', 'tu', 'che']
2 ['love', 'babi', 'yeah', 'gonna', 'wanna', 'girl', 'hey', 'ooh', 'littl', 'gotta']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'es', 'ein']
4 ['nigga', 'ya', 'caus', 'rock', 'shit', 'boy', 'play', 'fuck', 'money', 'everybodi']
5 ['na', 'de', 'eu', 'push', 'não', 'é', 'ik', 'um', 'doo', 'gimm']
6 ['burn', 'run', 'dead', 'kill', 'fire', 'blood', 'die', 'black', 'head', 'death']
7 ['ca', 'time', 'whi', 'tri', 'life', 'feel', 'caus', 'noth', 'wo', 'mind']
8 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'yo']
9 ['world', 'god', 'soul', 'lord', 'live', 'free', 'life', 'heaven', 'war', 'save']


In [13]:
#tfidf, NMF topic words
top_per_topic_words = corpus_topics_top_words(nmf_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['ca', 'feel', 'whi', 'tri', 'believ', 'wo', 'caus', 'someth', 'noth', 'everyth']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'por']
2 ['love', 'heart', 'true', 'girl', 'onli', 'kiss', 'forev', 'alway', 'hold', 'sweet']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'ein', 'mich']
4 ['je', 'de', 'la', 'les', 'le', 'pas', 'des', 'dan', 'qui', 'à']
5 ['babi', 'girl', 'ooh', 'night', 'pleas', 'littl', 'cri', 'tonight', 'babe', 'honey']
6 ['yeah', 'gonna', 'wanna', 'girl', 'hey', 'nigga', 'ya', 'gotta', 'caus', 'fuck']
7 ['che', 'di', 'la', 'il', 'è', 'mi', 'ma', 'da', 'ti', 'io']
8 ['life', 'day', 'world', 'night', 'eye', 'dream', 'live', 'light', 'heart', 'fall']
9 ['time', 'mind', 'wait', 'gonna', 'chang', 'wast', 'everi', 'day', 'mine', 'tri']


In [14]:
#tf, NMF topic words
top_per_topic_words = corpus_topics_top_words(nmf_tfidf, features, 10)
for i in list(top_per_topic_words.keys()):
    print(str(i) + ' '+ str(top_per_topic_words[i]))

0 ['ca', 'feel', 'whi', 'tri', 'believ', 'wo', 'caus', 'someth', 'noth', 'everyth']
1 ['de', 'el', 'la', 'en', 'te', 'mi', 'tu', 'se', 'es', 'por']
2 ['love', 'heart', 'true', 'girl', 'onli', 'kiss', 'forev', 'alway', 'hold', 'sweet']
3 ['ich', 'und', 'die', 'du', 'der', 'nicht', 'das', 'ist', 'ein', 'mich']
4 ['je', 'de', 'la', 'les', 'le', 'pas', 'des', 'dan', 'qui', 'à']
5 ['babi', 'girl', 'ooh', 'night', 'pleas', 'littl', 'cri', 'tonight', 'babe', 'honey']
6 ['yeah', 'gonna', 'wanna', 'girl', 'hey', 'nigga', 'ya', 'gotta', 'caus', 'fuck']
7 ['che', 'di', 'la', 'il', 'è', 'mi', 'ma', 'da', 'ti', 'io']
8 ['life', 'day', 'world', 'night', 'eye', 'dream', 'live', 'light', 'heart', 'fall']
9 ['time', 'mind', 'wait', 'gonna', 'chang', 'wast', 'everi', 'day', 'mine', 'tri']
