# Topic Modeling with gensim

In [1]:
import numpy as np
import pandas as pd
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import nltk
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings('ignore')

In [2]:
# read the per-genre lyrics tables; the first CSV column becomes the index
(
    country_df,
    hip_hop_df,
    pop_df,
    christian_df,
    electro_df,
    rock_df,
) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/Country-lyrics.csv',
        'data/Hip-Hop-Rnb-lyrics.csv',
        'data/pop-lyrics.csv',
        'data/Christian-lyrics.csv',
        'data/Electro-lyrics.csv',
        'data/Rock-lyrics.csv',
    )
)

In [3]:
# empty list that will hold the documents
documents = []

In [4]:
# choose the genre to analyze by assigning its dataframe,
# for example 'country_df' for country music
genre = rock_df

In [5]:
# Move the words out of the pandas dataframe into a list of documents:
# the rows are grouped by (artist, title, year) and each group's 'word'
# values, cast to str, are joined with single spaces into one string.
# The list is built from scratch instead of appended to, so re-running this
# cell cannot re-join the strings produced by a previous run character by
# character.
documents = [
    ' '.join(song_rows['word'].astype('str').values)
    for _, song_rows in genre.groupby(['artist', 'title', 'year'])
]

In [6]:
# Strip residue from every document. The alternatives of the pattern are:
#   \d{1,4}Embed | Embed  - the 'Embed' marker, together with up to four
#                           digits directly in front of it
#   ^.*?(Lyrics)          - everything from the start of the document up to
#                           and including the first 'Lyrics' (the song title)
#   [^\w\d'\s]+           - punctuation; apostrophes are kept
# The pattern is written as a raw string so that \d, \w and \s reach the
# regex engine without being treated as (invalid) string escapes, and it is
# compiled once instead of being looked up for every document.
_lyrics_noise = re.compile(r"\d{1,4}Embed|Embed|^.*?(Lyrics)|[^\w\d'\s]+")
documents = [_lyrics_noise.sub('', doc) for doc in documents]

In [7]:
# English stop-word list from NLTK.
# The corpus reader is imported under a private alias: binding the list to
# the name `stopwords` would otherwise replace the reader itself, and running
# this cell a second time would fail because a list has no `.words` method.
from nltk.corpus import stopwords as _stopwords_reader

stopwords = _stopwords_reader.words('english')

In [8]:
# lemmatize the documents
def lemmatization(documents, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Reduce each document to the lemmas of its tokens with an allowed POS tag.

    Args:
        documents: iterable of text strings.
        allowed_postags: part-of-speech tags, compared against spaCy's
            ``token.pos_``; tokens carrying any other tag are dropped.

    Returns:
        A list with one string per input document, in input order: the
        lemmas of the kept tokens joined by single spaces.
    """
    # parser and NER are switched off; only token.pos_ and token.lemma_ are read
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the pipeline once per document
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(documents)
    ]


lemmatized_documents = lemmatization(documents)

In [9]:
# convert words into tokens
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(entry, deacc=True) for entry in texts]


data_words = gen_words(lemmatized_documents)

In [10]:
# ngrams
# train the phrase models: bigrams on the plain tokens, trigrams on the
# bigram-transformed tokens
bigrams_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigrams_phrases = gensim.models.Phrases(bigrams_phrases[data_words], threshold=100)

# Phraser wrappers around the trained models; these are what the helpers
# below index into to transform a document
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigrams_phrases)


def make_bigrams(documents):
    """Return every tokenized document transformed by the bigram model."""
    transformed = []
    for doc in documents:
        transformed.append(bigram[doc])
    return transformed


def make_trigrams(documents):
    """Return every tokenized document transformed by the bigram model and then the trigram model."""
    transformed = []
    for doc in documents:
        with_bigrams = bigram[doc]
        transformed.append(trigram[with_bigrams])
    return transformed


data_bigrams = make_bigrams(data_words)
# NOTE(review): make_trigrams applies `bigram` once more to documents that
# were already passed through it by make_bigrams -- confirm this is intended
data_bigrams_trigrams = make_trigrams(data_bigrams)

### tf-idf removal

In [11]:
from gensim.models import TfidfModel

# the tokenized documents with merged bigrams/trigrams are the modelling texts
texts = data_bigrams_trigrams

# dictionary built from the tokens that occur in the texts
id2word = corpora.Dictionary(texts)

# bag-of-words form of every text, in the same order as `texts`
corpus = list(map(id2word.doc2bow, texts))

In [12]:
# Remove low-information tokens from every bag-of-words document, in place.
#
# For each document two groups of token ids are dropped:
#   * ids whose tf-idf weight is below `low_value`
#   * ids present in the bag-of-words but absent from the tf-idf output
#     (presumably tokens whose weight is effectively zero -- confirm against
#     the TfidfModel documentation)
# `words` collects the token strings of everything that was dropped, and
# `words_missing_in_tfidf` holds the missing ids of the most recently
# processed document.
tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []

# only existing positions are reassigned, so the list length never changes
# while it is being iterated
for i, bow in enumerate(corpus):
    weights = tfidf[bow]  # evaluated once per document
    tfidf_ids = {token_id for token_id, _ in weights}

    low_value_words = [token_id for token_id, value in weights if value < low_value]
    # computed BEFORE building `drops`, so the log in `words` refers to this
    # document and not to the previous one
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    dropped_ids = set(drops)  # set for O(1) membership tests
    corpus[i] = [entry for entry in bow if entry[0] not in dropped_ids]

In [None]:
# fit a 3-topic LDA model on the bag-of-words corpus; random_state is fixed
# so that repeated runs start from the same seed
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [None]:
# switch pyLDAvis to notebook output
pyLDAvis.enable_notebook()
# prepare the visualization data from the fitted model, corpus and dictionary
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# NOTE(review): the output name is hard-coded to 'rock' although `genre` is
# selectable in an earlier cell -- change the path when analyzing another
# genre. Presumably the 'models-genre' directory must already exist; confirm.
pyLDAvis.save_html(vis, 'models-genre/rock.html')
# bare expression as the last line of the cell so the notebook displays it
vis
