In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Load the per-track word-count table and the track listing from feather files.
mxm_dataset = pd.read_feather('../../data/transform/mxm_dataset.feather')
mxm_tracks = pd.read_feather('../../data/transform/unique_tracks.feather')

# Shuffle every row with a fixed seed. reset_index() keeps the pre-shuffle
# position as an 'index' column, so tf_data rows are NOT in mxm_dataset order.
tf_data = (
    mxm_dataset
    .sample(frac=1, random_state=0)
    .reset_index()
)

In [4]:
def corpus_topics_top_words(model, features, no_top_words):
    """Map each topic index to its highest-weighted feature names.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort``).
        features: Sequence of feature names, indexed by the same positions
            as the entries of each weight array.
        no_top_words: How many names to keep per topic.

    Returns:
        dict: ``{topic_index: [name, ...]}`` with names ordered from the
        largest weight downwards.
    """
    return {
        topic_number: [
            features[position]
            # argsort() is ascending; reverse it, then keep the leading entries.
            for position in weights.argsort()[::-1][:no_top_words]
        ]
        for topic_number, weights in enumerate(model.components_)
    }

def song_topics(model, song):
    """Score one song against every topic of ``model``.

    Each score is the sum of the element-wise product between a topic's
    weight array and ``song``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays.
        song: Array-like of per-feature values, aligned position-by-position
            with each topic's weights.

    Returns:
        list: One score per topic, in ``components_`` order.
    """
    return [sum(weights * song) for weights in model.components_]

In [5]:
# Remove Stopwords From Dataset
features = tf_data.columns
stop_words_tidytext = pd.read_feather('../../data/transform/stop_words_tidytext')

# Keep only the stop words that actually appear as columns, so the drop below
# never asks for a label that is missing.
stop_words = [word for word in stop_words_tidytext.word if word in features]

tf_data = tf_data.drop(stop_words, axis=1)
# Drop the identifier columns so only word-count columns remain.
# errors='ignore' keeps this cell re-runnable: on a second execution the two
# columns are already gone and a plain drop would raise KeyError.
tf_data = tf_data.drop(['track_id', 'index'], axis=1, errors='ignore')

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 25-topic LDA model on the word-count matrix.
# The topic count is passed as n_components: the older n_topics keyword was
# renamed and is no longer accepted by current scikit-learn releases.
lda_tf_25 = LatentDirichletAllocation(n_components=25, random_state=0)
lda_tf_25.fit(tf_data)

top_per_topic_words = corpus_topics_top_words(lda_tf_25, tf_data.columns.values, 10)

#save per/song topic results to df
# One row per song, one column per fitted topic; the width is taken from the
# model so it cannot drift from the topic count used above.
song_topic_weights = np.zeros([len(tf_data), lda_tf_25.components_.shape[0]])
for i in tqdm(range(len(tf_data))):
    # NOTE(review): song_topics() is a raw dot product with components_, not
    # lda_tf_25.transform(); confirm that unnormalised scores are intended.
    song_topic_weights[i] = song_topics(lda_tf_25, tf_data.iloc[i])


100%|██████████| 238/238 [00:01<00:00, 121.65it/s]


In [17]:
# Wrap the per-song topic scores in a DataFrame with one integer-named column per topic.
initial_topic_names = list(range(25))
song_topic_weights_df = pd.DataFrame(data=song_topic_weights, columns=initial_topic_names)

# The score rows follow the order of tf_data, which was built from
# mxm_dataset.sample(frac=1, random_state=0). Taking track_id straight from
# mxm_dataset would pair every score row with the id of a different song, so
# the same seeded shuffle is reproduced here to get the ids in matching order.
# to_numpy() assigns by position rather than by index label.
song_topic_weights_df['track_id'] = (
    mxm_dataset.sample(frac=1, random_state=0)['track_id'].to_numpy()
)

# Keep the three topics of interest and give them readable names.
df_topic_weights_reduced_df = (
    song_topic_weights_df[['track_id', 0, 3, 6]]
    .rename(columns={0: 'love', 3: 'religion', 6: 'death'})
)

In [25]:
# Top words for the three named topics (topic indices 0, 3 and 6).
topic_words = {
    'love': top_per_topic_words[0],
    'religion': top_per_topic_words[3],
    'death': top_per_topic_words[6],
}



In [26]:
# Write the per-topic word lists (one column per named topic) to feather.
pd.DataFrame(topic_words).to_feather(
    '../../data/clean/topic_words.feather'
)
# Write the per-song scores for the named topics to feather.
df_topic_weights_reduced_df.to_feather(
    '../../data/clean/song_topic_weights.feather'
)