# TF-IDF with scikit-learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
import nltk
import glob
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-genre lyrics tables, one DataFrame per genre.
# index_col=0 makes the first CSV column the DataFrame index.
# Later cells read the 'artist', 'title', 'year' and 'word' columns of
# whichever frame is selected as `genre`.
country_df = pd.read_csv('data/Country-lyrics.csv', index_col=0)
hip_hop_df = pd.read_csv('data/Hip-Hop-Rnb-lyrics.csv', index_col=0)
pop_df = pd.read_csv('data/pop-lyrics.csv', index_col=0)
christian_df = pd.read_csv('data/Christian-lyrics.csv', index_col=0)
electro_df = pd.read_csv('data/Electro-lyrics.csv', index_col=0)
rock_df = pd.read_csv('data/Rock-lyrics.csv', index_col=0)

In [3]:
# Empty list that will collect the documents (one entry per song).
documents = []

In [4]:
# Choose the genre to analyze by assigning one of the DataFrames loaded
# above, e.g. `country_df` for country music or `rock_df` for rock.
# Every later cell works on whatever `genre` points to.
genre = country_df

In [5]:
# Move the words out of the pandas DataFrame into a list of documents:
# one space-joined string per (artist, title, year) group, i.e. per song.
#
# The list is rebuilt from scratch instead of appended to, so re-running
# this cell cannot duplicate songs or re-join strings that were already
# joined (which would split them into single characters).
documents = [
    ' '.join(song_rows['word'].astype('str').values)
    for _, song_rows in genre.groupby(['artist', 'title', 'year'])
]

In [6]:
# Strip scraping artefacts from every document. The alternatives are:
#   \d{1,4}Embed | Embed  -> the 'Embed' marker, optionally preceded by
#                            1-4 digits (removed wherever it occurs)
#   ^.*?(Lyrics)          -> everything from the start of the document up
#                            to and including the first 'Lyrics'
#                            (the song title header)
#   [^\w\d'\s]+           -> punctuation, but apostrophes are kept
# The pattern is a raw string so that \d, \w and \s reach the regex engine
# unchanged instead of being invalid string escape sequences.
documents = [
    re.sub(r"\d{1,4}Embed|Embed|^.*?(Lyrics)|[^\w\d'\s]+", '', doc)
    for doc in documents
]

In [7]:
def lemmatization(documents, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Args:
        documents: Iterable of text strings, one per document.
        allowed_postags: Coarse-grained spaCy part-of-speech tags
            (compared against ``token.pos_``) whose tokens are kept.
            Any container supporting ``in`` works (list, tuple, set).

    Returns:
        A list with one string per input document, in input order: the
        lemmas of the kept tokens joined by single spaces.
    """
    # Only POS tags and lemmas are read below, so the parser and NER
    # components are switched off.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # nlp.pipe streams the texts through the pipeline in batches, which is
    # the recommended way to process many documents; the resulting Doc
    # objects are the same as from calling nlp(text) one at a time.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(documents)
    ]

In [8]:
# TF-IDF vectorizer used to pick the keywords of each document.
vectorizer = TfidfVectorizer(
    lowercase=True,  # convert text to lower case before tokenizing
    max_features=100,  # keep only the top 100 terms by corpus term frequency
    max_df=0.8,  # ignore terms that occur in more than 80% of the documents
    min_df=5,  # ignore terms that occur in fewer than 5 documents
    ngram_range=(1, 5),  # extract unigrams up to 5-grams
    stop_words='english'  # drop scikit-learn's built-in English stop words
)

In [9]:
# Reduce every document to the lemmas of its nouns, adjectives, verbs and
# adverbs (the default POS filter of `lemmatization`).
lemmatized_documents = lemmatization(documents)

# Learn the vocabulary / IDF weights and build the document-term matrix
# (one row per document, one column per retained term).
vectors = vectorizer.fit_transform(lemmatized_documents)

# Term belonging to each column of `vectors`.
feature_names = vectorizer.get_feature_names_out()

# Convert the matrix to plain nested Python lists so that the next cell can
# loop over the weights of every document.
dense = vectors.todense()
denselist = dense.tolist()

# Filled by the next cell with one list of keywords per document.
all_keywords = []

In [10]:
# For every document keep the terms whose TF-IDF weight is positive, i.e.
# the vocabulary terms that actually occur in that document.
#
# Each weight is paired with its term via zip instead of a hand-maintained
# index, and the list is rebuilt from scratch so that re-running this cell
# does not append a second copy of every row.
all_keywords = [
    [term for term, weight in zip(feature_names, description) if weight > 0]
    for description in denselist
]

In [11]:
from gensim.models import TfidfModel

# Every document is represented by its list of TF-IDF keywords.
texts = all_keywords

# Dictionary assigning an integer id to each distinct keyword.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

In [12]:
# Train an LDA topic model on the bag-of-words corpus built from the
# TF-IDF keywords.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,  # token id -> keyword, used to label topics
                                           num_topics=10,  # number of latent topics to extract
                                           random_state=100,  # fixed seed for reproducible topics
                                           update_every=1,  # online learning: update the model after every chunk
                                           chunksize=100,  # documents per training chunk
                                           passes=10,  # full passes over the corpus
                                           alpha="auto")  # learn an asymmetric document-topic prior from the data

In [13]:
# Make pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualisation: mds="mmds" lays the topics
# out with metric multidimensional scaling, R=30 is the number of terms
# shown per topic in the bar chart.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# A bare expression on the last line of a cell displays the object.
vis