# Text Processing

This notebook begins with raw text data and cleans it, resulting in a Document-Term matrix and a TFIDF matrix, saved for later use.

In [1]:
import json
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.decomposition import LatentDirichletAllocation
import sys
sys.path.append('../')
import lyrics_grab

In [2]:
# Load the previously scraped song records for each genre.
# NOTE: pickle.load can execute arbitrary code -- only open files you produced yourself.
with open('../data/metal_artists.pickle','rb') as metal_file:
    metal = pickle.load(metal_file)

with open('../data/rock_artists.pickle','rb') as rock_file:
    rock = pickle.load(rock_file)

In [3]:
# Sanity check: number of entries loaded from the metal pickle.
len(metal)

50041

In [4]:
# Wrap both loaded record collections in DataFrames (metal first, then rock).
df_metal, df_rock = (pd.DataFrame(records) for records in (metal, rock))

In [5]:
# Preview the first five metal rows.
df_metal.head(5)

Unnamed: 0,artist_name,release_date,page_views,song_title,album_name,spotify_url,lyrics
0,Kreator,2016-12-16,13775,Satan Is Real,Gods of Violence,,Martyrs\nYou cannot kill us all\nVengeance wil...
1,Kreator,1986-11-01,8745,Pleasure to Kill,Pleasure to Kill,,Day turns to night as I rise from my grave\nBl...
2,Kreator,,5037,Enemy Of God,Enemy Of God,,"Shocked Orwellian races, gather united in grie..."
3,Kreator,2017-01-27,0,Fallen Brother,Gods of Violence,,[Instrumental Intro]\n\nMuch too young you had...
4,Kreator,2017-01-27,0,Totalitarian Terror,Gods of Violence,,Come experience hate like you never have befor...


In [6]:
# Discard songs without lyrics. Reassigning (rather than inplace=True) keeps the
# step explicit; note that the surviving rows keep their original index labels.
df_metal = df_metal.dropna(subset=['lyrics'])
df_rock = df_rock.dropna(subset=['lyrics'])

In [7]:
# Pull out the raw lyrics text of each genre for vectorizing.
lyrics_metal, lyrics_rock = df_metal['lyrics'], df_rock['lyrics']

In [8]:
# Hand-picked additions to the standard stop-word list: filler words, vocal
# interjections and song-structure labels that kept surfacing as top topic words
# without helping to tell topics apart. Some words are listed more than once.
extra_stop_words = [
    "ooh", "something", "chorus", "verse", "go", "back", "yeah", "hey",
    "get", "gonna", "let", "wanna", "take", "got", "yo", "uh",
    "got", "come", "let", "day", "man", "away", "close", "stay",
    "em", "set", "said", "words", "ones", "oh", "no", "away",
    "whoah", "whoa", "run", "around", "take", "something", "know", "like",
    "never", "one", "see", "way", "want", "say", "make", "instrumental",
    "right", "could", "inside", "pre", "think", "still", "find", "tell",
    "everything", "keep", "left", "every", "long", "would", "head", "bridge",
    "look", "ever", "well", "another", "woah", "walk", "give", "cause",
    "maybe", "help", "lay", "open", "stop", "start", "place", "us",
    "born", "going", "wake", "high", "ride", "raise", "low", "stand",
    "upon", "hands", "land", "shall", "within", "deep", "new", "ah",
    "ready", "alright", "side", "name", "two", "three", "round", "song",
    "seven", "us", "call", "old", "six", "white", "la", "ya",
    "everybody", "big", "hit", "give", "cause", "ha", "gotta", "stop",
    "play", "put", "boy", "top", "bout", "de", "en", "sin",
    "solo", "refrain", "til", "till",
]

In [9]:
# Full stop-word list: NLTK's English list followed by the custom additions.
stop_words = [*nltk.corpus.stopwords.words('english'), *extra_stop_words]

In [10]:
# Persist the combined stop-word list as one space-separated line.
# A context manager guarantees the file is flushed and closed; the original
# bare open(...).write(...) left the handle open until garbage collection.
with open('../data/stopwords.txt','w') as stopwords_file:
    stopwords_file.write(' '.join(stop_words))

1615

In [11]:
def vectorize(text,vectorizer='cv',min_df=0.01,stop_words=stop_words):
    """Turn raw documents into a dense term matrix wrapped in a DataFrame.

    Cleaning is delegated to the scikit-learn vectorizer: its default
    tokenizer handles punctuation, ``stop_words`` are removed, and ``min_df``
    drops uncommon terms (which also filters most non-English words).

    Parameters
    ----------
    text : iterable of str
        One document (song lyric) per element.
    vectorizer : {'cv', 'tfidf'}
        'cv' builds raw term counts with ``CountVectorizer``; 'tfidf' builds
        TF-IDF weights with ``TfidfVectorizer``.
    min_df : float or int
        Minimum document frequency passed straight to the vectorizer.
    stop_words : list of str
        Words to exclude; defaults to the notebook-level ``stop_words`` list
        as it existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per retained term.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not 'cv' or 'tfidf'. (Previously this fell
        through both branches and failed with an UnboundLocalError.)
    """
    vectorizer_classes = {'cv': CountVectorizer, 'tfidf': TfidfVectorizer}
    if vectorizer not in vectorizer_classes:
        raise ValueError(
            "vectorizer must be 'cv' or 'tfidf', got {!r}".format(vectorizer)
        )

    v = vectorizer_classes[vectorizer](stop_words=stop_words,min_df=min_df)
    v_matrix = v.fit_transform(text)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(v, 'get_feature_names_out'):
        feature_names = v.get_feature_names_out()
    else:
        feature_names = v.get_feature_names()

    # NOTE: toarray() densifies the sparse matrix, so memory grows with
    # n_documents * n_terms.
    return pd.DataFrame(v_matrix.toarray(),columns=feature_names)

In [14]:
# Document-term counts for the metal lyrics (default min_df and stop words).
cv_df_metal = vectorize(lyrics_metal, vectorizer='cv')

In [12]:
# Document-term counts for the rock lyrics (default min_df and stop words).
cv_df_rock = vectorize(lyrics_rock, vectorizer='cv')

In [15]:
# TF-IDF matrices for both genres, built with identical settings.
tfidf_options = dict(vectorizer='tfidf', min_df=0.01, stop_words=stop_words)
tfidf_df_metal = vectorize(lyrics_metal, **tfidf_options)
tfidf_df_rock = vectorize(lyrics_rock, **tfidf_options)

Fit a Latent Dirichlet Allocation (LDA) topic model and visualize it with pyLDAvis. Interestingly, LDA worked better on the plain count-vectorizer output than on the TF-IDF matrix.

In [18]:
# Fit a 4-topic LDA model on the metal term counts; the fixed seed makes the
# topics reproducible. fit() returns the estimator itself, so chaining is safe.
lda_tf = LatentDirichletAllocation(n_components=4, random_state=23).fit(cv_df_metal)
lda_tf

LatentDirichletAllocation(n_components=4, random_state=23)

In [24]:
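# Kept for reference, not run: LDA on the TF-IDF features.
# `tfidf_matrix` is not defined in the cells shown here; create it before re-enabling.
# lda_tfidf = LatentDirichletAllocation(n_components=8,random_state=23)
# lda_tfidf.fit(tfidf_matrix)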

In [19]:
import pyLDAvis
import pyLDAvis.sklearn

In [22]:
# pyLDAvis needs the fitted vectorizer and its document-term matrix, but
# vectorize() only returns a DataFrame, so `cv` and `cv_matrix` were never
# defined and this cell failed with NameError on a fresh kernel.
# Rebuild them here with the same settings vectorize() used for cv_df_metal
# (same stop words, min_df=0.01) so the vocabulary matches what lda_tf was fit on.
cv = CountVectorizer(stop_words=stop_words, min_df=0.01)
cv_matrix = cv.fit_transform(lyrics_metal)

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_tf, cv_matrix, cv)
# pyLDAvis.sklearn.prepare(lda_tfidf,tfidf_matrix,tfidf)

# Write a standalone HTML copy of the interactive visualization.
pyLDAvis.save_html(vis,'lda.html')

  and should_run_async(code)


In [16]:
# Save the metal document-term counts for later use.
with open('../data/cv_df_metal.pickle','wb') as sink:
    pickle.dump(cv_df_metal, sink)

In [17]:
# Save the rock document-term counts for later use.
with open('../data/cv_df_rock.pickle','wb') as sink:
    pickle.dump(cv_df_rock, sink)

In [18]:
# Save both TF-IDF matrices for later use, metal first and then rock.
for path, frame in (('../data/tfidf_df_metal.pickle', tfidf_df_metal),
                    ('../data/tfidf_df_rock.pickle', tfidf_df_rock)):
    with open(path, 'wb') as sink:
        pickle.dump(frame, sink)

In [23]:
# Read the per-song topic labels back in and wrap them in a Series.
# NOTE(review): no cell shown here writes this pickle -- presumably it is
# produced by a separate step; confirm before relying on Restart & Run All.
with open('../data/topic_assignments.pickle','rb') as topics_file:
    raw_topics = pickle.load(topics_file)

topic_assignments = pd.Series(raw_topics)

In [24]:
# Attach each song's topic by row POSITION rather than by index label.
# df_metal keeps gapped labels after dropna(), so a label-aligned concat
# pairs topics with the wrong songs and pads the frame with NaN rows.
# Assumes the pickle holds one topic per remaining song, in df_metal row
# order -- TODO confirm against the code that writes topic_assignments.pickle.
final_df = pd.concat(
    [df_metal.reset_index(drop=True), topic_assignments.reset_index(drop=True)],
    axis=1,
)

In [25]:
# Preview the first five rows of the merged frame.
final_df.head(5)

Unnamed: 0,artist_name,release_date,page_views,song_title,album_name,spotify_url,lyrics,0
0,Kreator,2016-12-16,13775.0,Satan Is Real,Gods of Violence,,Martyrs\nYou cannot kill us all\nVengeance wil...,3.0
1,Kreator,1986-11-01,8745.0,Pleasure to Kill,Pleasure to Kill,,Day turns to night as I rise from my grave\nBl...,2.0
2,Kreator,,5037.0,Enemy Of God,Enemy Of God,,"Shocked Orwellian races, gather united in grie...",0.0
3,Kreator,2017-01-27,0.0,Fallen Brother,Gods of Violence,,[Instrumental Intro]\n\nMuch too young you had...,2.0
4,Kreator,2017-01-27,0.0,Totalitarian Terror,Gods of Violence,,Come experience hate like you never have befor...,2.0


In [28]:
# Relabel all columns positionally; the last one (the concatenated series,
# previously shown as 0) becomes 'topic'.
final_df.columns = [
    'artist_name',
    'release_date',
    'page_views',
    'song_title',
    'album_name',
    'spotify_url',
    'lyrics',
    'topic',
]

In [32]:
# Keep only rows that received a topic label.
final_df = final_df.dropna(subset=['topic'])

In [38]:
# Remove the lyrics text column from the merged frame.
final_df = final_df.drop(columns='lyrics')

In [34]:
# Distinct topic labels present after cleaning.
final_df['topic'].unique()

array([3., 2., 0., 1.])

In [41]:
# Save topic assignments for later visualization; the row index is not written.
final_df.to_csv(path_or_buf='../data/topic_assignments.csv', index=False)