In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import sklearn
import yellowbrick
import re
import mglearn
import boto3
from s3 import get_file

In [2]:
s3 = boto3.resource('s3')
lyrics = get_file(s3,'s3ssp',download_file='NLP_Data/new_master_lyrics_audio_features.csv',rename_file='nlp.csv')

In [3]:
df = pd.read_csv(lyrics,sep='|',encoding='utf-8')
df_demo = df.copy().dropna()

In [4]:
df_sample = df_demo.sample(3000)

In [16]:
df_sample['new_lyrics'] = df_sample['lyrics'].apply(lambda x: x.replace("love", "").replace("chorus", ""))

In [6]:
def LDA_hyper_params(df):
    
    df_topic_rows = df.shape[0] #Dataframe Row count
    
    if(df_topic_rows in range(100,1000)):
        return {'max_features':100, 'n_components':3, 'n_words':10}
    
    elif(df_topic_rows in range(1001,4000)):
        return {'max_features':200, 'n_components' :5, 'n_words':10} 
        
    elif(df_topic_rows in range(4001,6000)):
        return {'max_features':10000, 'n_components' :25, 'n_words':10} 
    
    elif(df_topic_rows in range(6001,8000)):
        return {'max_features':400, 'n_components' :5, 'n_words':10} 
    
    elif(df_topic_rows in range(8001,10000)):
        return {'max_features':500, 'n_components' :5, 'n_words':10}
    
    elif(df_topic_rows >= 10000):
        return {'max_features':600, 'n_components' :5, 'n_words':10}
    else:
        return None

In [8]:
#Control Panel - Hyper Parameters

hp = LDA_hyper_params(df_sample)

#Count Vectorizer
max_features = 3000  #possibly a percentage of vect.vocabulary_
max_df = .5
min_df = 10



#LDA
n_components = 30  #10


#LDA Display
display_n_chunks = 5 #5
#n_components = hp['n_components']        #20
n_words = 10       #5


print(f'Max Features: {max_features} \nTopics: {n_components} \nWords: {n_words}')

Max Features: 3000 
Topics: 30 
Words: 10


In [10]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=min_df, max_df=max_df,     # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             max_features=max_features)             # max number of uniq words

data_vectorized = vectorizer.fit_transform(df_sample['lyrics'])



In [13]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=n_components, learning_method="online",
                                max_iter=25, random_state=0, doc_topic_prior=.06) #topic_word_prior=10000)

# Build LDA Model
lda_model.fit(data_vectorized)

# We build the model and transform the data in one step
# Computing transform takes some time,
# and we can save time by doing both at once

#document_topics = lda_model.fit_transform(docs)


#print(lda_model)  # Model attributes

# Log Likelyhood: Higher the better
#print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
#print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters



#print("lda.components_.shape: {}".format(lda.components_.shape))

# For each topic (a row in the components_), sort the features (ascending)
# Invert rows with [:, ::-1] to make sorting descending
#sorting = np.argsort(lda_model.components_, axis=1)#[:, ::-1]




# Get the feature names from the vectorizer
#feature_names = np.array(vectorizer.get_feature_names())

# Print out the 10 topics:
#mglearn.tools.print_topics(topics=range(n_components), feature_names=feature_names,
                           #sorting=sorting, topics_per_chunk=display_n_chunks, n_words=n_words)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.06,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=25, mean_change_tol=0.001,
             n_components=30, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [14]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [17]:
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='mmds')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [57]:
# Create Document - Topic Matrix

lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]


# index names
docnames = ["Doc" + str(i) for i in range(len(df_sample['lyrics']))]



# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.array(lda_output), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)


df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .6 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .6 else 400
    return 'font-weight: {weight}'.format(weight=weight)


df_document_topic = df_document_topic[df_document_topic['Topic1']>=.65]
# Apply Style
#df_document_topics = df_document_topic.style.applymap(color_green).applymap(make_bold)

for col in df_document_topic.columns:
    topic_length=df_document_topic[df_document_topic[col]>=.65].nlargest(n=20, columns=col)
    if len(topic_length)>=10:
        #print(df_document_topic[col].nlargest(n=20))
        print(col)

Topic1
dominant_topic


In [45]:
type(df_document_topic)

pandas.core.series.Series

In [44]:
df_sample['lyrics'].iloc[1296]

'command warrior ready attack destroy robot destroy inhuman form life destroy destroy destroy'

'strike fast hard mercy vermin christ prophet lie disciple seek hunt break spirit crush heart death set pain charge ride flee steel draw blood suffer field sacrifice wipe burn field feed wolf offspring annihilate master torment soul rape whore carry cross burn burn alive send soul deathqueen hall land cold burning flame send land famine despair eternally starve freeze'

lot
set
strange
age
fun
hero
death
final
circle
air
water
march
sigh
tide
tie
people
wrap
bass
guitar
box
break
gate
form
church
machine
worship
