# Topic Modeling on Pitchfork Reviews

https://towardsdatascience.com/a-guide-to-collaborative-topic-modeling-recommender-systems-49fd576cc871

In [1]:
from pprint import pprint
import numpy as np

import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import gensim.models.ldamodel as lda
import spacy
import itertools

import pandas as pd
import sqlite3

import re
import unidecode as ud
import pickle
import datetime

from ArtistReviewAnalyzer import ArtistReviewAnalyzer



Initializing analyzer

In [2]:
pf_file = "../../data/processed/artist_reviews_pf_only.json"
more_sw = ['band', 'make', 'record', 'get', 'even', 'time', 'year', 'good', 'new', 'come', \
                     'go', 'well', 'first', 'take', 'still', 'way', 'much', 'feel', 'work', \
                     'release', 'seem', 'know', 'back', 'thing', 'also', 'album', 'song', 'None']
pf = ArtistReviewAnalyzer()

In [3]:
t0 = datetime.datetime.now()
pf.build(pf_file, more_stopwords=more_sw, min_df=300, max_df=0.6)
t1 = datetime.datetime.now()
print((t1 - t0).total_seconds())

131.225737


In [4]:
print("tf matrix shape:", pf.count_matrix.shape)
print("tfidf matrix shape:", pf.tfidf_matrix.shape)

tf matrix shape: (964211, 548)
tfidf matrix shape: (964211, 548)


## Run LDA Topic Model

In [5]:
pf.run_and_set_lda_model(num_topics=13)

<gensim.models.ldamodel.LdaModel at 0x122b57550>

In [6]:
pd.DataFrame(pf.print_lda_topics())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,young,rap,rock,say,wayne,rapper,show,live,man,
1,rock,guitar,love,pop,sonic,say,lyric,long,vocal,
2,pop,love,rock,single,guitar,day,vocal,great,end,
3,rock,mile,play,set,live,pop,single,version,love,
4,rock,guitar,white,love,pop,find,mogwai,vocal,give,
5,rock,cash,wait,pop,guitar,live,show,vocal,early,
6,pop,love,rock,guitar,single,give,play,long,say,
7,bowie,rock,pop,smith,love,never,guitar,single,morrissey,
8,darnielle,wu,kanye,life,west,rza,man,goat,chip,
9,cube,killer,haggard,foal,certificate,black,hit,ice,fuss,death


In [7]:
lda_13 = []
lda_13.append(pf.lda_model)

In [8]:
for i in range(3):
    lda_13.append(pf.run_lda_model(num_topics=13))

In [12]:
def print_lda_topics(model):
    topics = model.print_topics()
    all_topics = []
    for _, s in topics:
        all_topics.append(re.findall(r'(?<=\")[a-z]+(?=\")', s))
    return all_topics

In [13]:
pd.DataFrame(print_lda_topics(lda_13[1]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,young,rap,rock,say,wayne,rapper,show,live,man,
1,rock,guitar,love,pop,sonic,say,lyric,long,vocal,
2,pop,love,rock,single,guitar,day,vocal,great,end,
3,rock,mile,play,set,live,pop,single,version,love,
4,rock,guitar,white,love,pop,find,mogwai,vocal,give,
5,rock,cash,wait,pop,guitar,live,show,vocal,early,
6,pop,love,rock,guitar,single,give,play,long,say,
7,bowie,rock,pop,smith,love,never,guitar,single,morrissey,
8,darnielle,wu,kanye,life,west,rza,man,goat,chip,
9,cube,killer,haggard,foal,certificate,black,hit,ice,fuss,death


In [14]:
pd.DataFrame(print_lda_topics(lda_13[2]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,young,rap,rock,say,wayne,rapper,show,live,man,
1,rock,guitar,love,pop,sonic,say,lyric,long,vocal,
2,pop,love,rock,single,guitar,day,vocal,great,end,
3,rock,mile,play,set,live,pop,single,version,love,
4,rock,guitar,white,love,pop,find,mogwai,vocal,give,
5,rock,cash,wait,pop,guitar,live,show,vocal,early,
6,pop,love,rock,guitar,single,give,play,long,say,
7,bowie,rock,pop,smith,love,never,guitar,single,morrissey,
8,darnielle,wu,kanye,life,west,rza,man,goat,chip,
9,cube,killer,haggard,foal,certificate,black,hit,ice,fuss,death


In [15]:
pd.DataFrame(print_lda_topics(lda_13[3]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,young,rap,rock,say,wayne,rapper,show,live,man,
1,rock,guitar,love,pop,sonic,say,lyric,long,vocal,
2,pop,love,rock,single,guitar,day,vocal,great,end,
3,rock,mile,play,set,live,pop,single,version,love,
4,rock,guitar,white,love,pop,find,mogwai,vocal,give,
5,rock,cash,wait,pop,guitar,live,show,vocal,early,
6,pop,love,rock,guitar,single,give,play,long,say,
7,bowie,rock,pop,smith,love,never,guitar,single,morrissey,
8,darnielle,wu,kanye,life,west,rza,man,goat,chip,
9,cube,killer,haggard,foal,certificate,black,hit,ice,fuss,death


In [10]:
pf.build(more_stopwords=[], min_df=300, max_df=0.6)

KeyboardInterrupt: 

In [None]:
pf.build_lda_model(num_topics=14)

In [8]:
pd.DataFrame(pf.print_lda_topics())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,be,have,album,song,do,s,more,get,track,sound
1,be,have,s,do,song,album,music,sound,make,when
2,be,have,song,sound,do,more,s,album,make,music
3,be,have,bowie,album,do,s,song,pop,more,make
4,be,album,song,have,springsteen,s,more,cohen,do,pop
5,be,have,album,song,more,do,s,band,make,sound
6,be,s,have,album,rap,do,song,track,make,more
7,be,have,s,jeezy,album,do,rap,bun,doom,boosie
8,be,cole,albarn,patrol,snow,gorillaz,dexys,rowland,have,soul
9,be,band,have,song,album,do,more,rock,record,sound
