In [36]:
import sys
print("Python Version:", sys.version)

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [37]:
#######################
# standard code block #
#######################

%config InlineBackend.figure_format = 'svg'

# auto reload imports that change
%load_ext autoreload
# only set to auto reload for marked imports
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import pandas as pd

### Connect to Database

In [39]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


config = {
  'host': cred.mongo_host,
  'username': cred.mongo_user,
  'password': cred.mongo_pass,
  'authSource': cred.mongo_auth_db
}

# get a mongo client
client = MongoClient(**config)

# use the raw database
jre_raw = client.jre_raw
podcasts_raw = jre_raw.podcasts

# use the clean database
jre_clean = client.jre_clean
podcasts_clean = jre_clean.podcasts

### Get Podcasts from Podscribe

In [40]:
# query
ps_ep_mongo = list(
    podcasts_raw
    .find({'source':'podscribe'},{'_id':0, 'text':1, 'name':1})
    .limit(100)
)

In [41]:
# pull out text and names
ps_ep_list = [x['text'] for x in ps_ep_mongo]
ps_ep_names = [x['name'] for x in ps_ep_mongo]

### Clean

In [42]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [43]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [44]:
ps_ep_list = [x.lower() for x in ps_ep_list]

### Stop Words

In [45]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity
profanity = (
    open('stop_words/profanity.txt', newline='\n')
    .read()
    .splitlines()
)

new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'right', 'think', 'know',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say'
]
new_stop_words += common_jre_words

# append to english stop words
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [46]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for i, ep in enumerate(ps_ep_list):
    ps_ep_list[i] = ' '.join([lemmatizer.lemmatize(x) for x in ep.split(' ')])
    
jre_stop_words = [lemmatizer.lemmatize(x) for x in jre_stop_words]

### TF-IDF

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
ps_ep_list_tfidf = TfidfVectorizer(
    ngram_range=(1,2), binary=True, stop_words=jre_stop_words,
    token_pattern="\\b[a-z][a-z]+\\b", max_df=0.8
)

# fit
ps_ep_list_tfidf.fit(ps_ep_list)

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.8, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['assfukka', 'toward', 'very', 'fill', 'them',
                            'hoer', 'ejaculated', 'all', 'bastard', 'wanker',
                            'please', 'dogging', 'have', 'fanny', 'meanwhile',
                            'latterly', 'during', 'either', 'shitfull',
                            'twatty', 'moreover', 'everywhere', 'dink',
                            'breast', 'of', 'pisser', 'per', 'fuckwhit', 'been',
                            'fuckhead', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

### Count Vectorize

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer for parsing/counting words
ps_ep_list_cvec = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words=jre_stop_words, token_pattern="\\b[a-z][a-z]+\\b")
# fit
ps_ep_list_cvec.fit(ps_ep_list)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 3), preprocessor=None,
                stop_words=['assfukka', 'toward', 'very', 'fill', 'them',
                            'hoer', 'ejaculated', 'all', 'bastard', 'wanker',
                            'please', 'dogging', 'have', 'fanny', 'meanwhile',
                            'latterly', 'during', 'either', 'shitfull',
                            'twatty', 'moreover', 'everywhere', 'dink',
                            'breast', 'of', 'pisser', 'per', 'fuckwhit', 'been',
                            'fuckhead', ...],
                strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
                tokenizer=None, vocabulary=None)

### Choose Tokenizer

In [49]:
ps_ep_list_tokenizer = ps_ep_list_tfidf

### Model Setup

In [64]:
# number of topics
num_topics = 5

# column names
col_names = ['component_'+str(x) for x in range(1,num_topics+1)]

### LDA

In [50]:
# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [51]:
# Create the term-document matrix
# Transpose it so the terms are the rows
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).transpose()

In [52]:
import pandas as pd

pd.DataFrame(doc_word.toarray(), ps_ep_list_tokenizer.get_feature_names()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006962,0.0,...,0.0,0.0,0.006068,0.0,0.0,0.0,0.006922,0.0,0.0,0.0
aa charge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010651,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aa couple,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aa cross,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aa denver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word)

In [54]:
id2word = dict((v, k) for k, v in ps_ep_list_tokenizer.vocabulary_.items())

In [55]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=50)

2020-02-19 19:33:21,272 : INFO : using symmetric alpha at 0.3333333333333333
2020-02-19 19:33:21,274 : INFO : using symmetric eta at 0.3333333333333333
2020-02-19 19:33:21,384 : INFO : using serial LDA version on this node
2020-02-19 19:33:21,620 : INFO : running online (multi-pass) LDA training, 3 topics, 5 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence threshold of 0.001000
2020-02-19 19:33:28,988 : INFO : -95.322 per-word bound, 49505244778645621142204710912.0 perplexity estimate based on a held-out corpus of 100 documents with 9759 words
2020-02-19 19:33:28,989 : INFO : PROGRESS: pass 0, at document #100/100
2020-02-19 19:33:30,670 : INFO : topic #0 (0.333): 0.000*"paying" + 0.000*"fantastic" + 0.000*"jesus christ" + 0.000*"plane" + 0.000*"shouldn" + 0.000*"news" + 0.000*"worried" + 0.000*"aspect" + 0.000*"mom" + 0.000*"fly"
2020-02-19 19:33:30,682 : INFO : topic 

In [56]:
lda.print_topics()

2020-02-19 19:34:07,696 : INFO : topic #0 (0.333): 0.000*"aspect" + 0.000*"mom" + 0.000*"fantastic" + 0.000*"essentially" + 0.000*"don feel" + 0.000*"welcome" + 0.000*"energy" + 0.000*"standing" + 0.000*"fly" + 0.000*"computer"
2020-02-19 19:34:07,709 : INFO : topic #1 (0.333): 0.000*"horrible" + 0.000*"career" + 0.000*"exact" + 0.000*"designed" + 0.000*"asking" + 0.000*"gigantic" + 0.000*"written" + 0.000*"ran" + 0.000*"healthy" + 0.000*"television"
2020-02-19 19:34:07,718 : INFO : topic #2 (0.333): 0.000*"rare" + 0.000*"look look" + 0.000*"comedy" + 0.000*"did did" + 0.000*"drunk" + 0.000*"beat" + 0.000*"time time" + 0.000*"expensive" + 0.000*"blow" + 0.000*"goal"


[(0,
  '0.000*"aspect" + 0.000*"mom" + 0.000*"fantastic" + 0.000*"essentially" + 0.000*"don feel" + 0.000*"welcome" + 0.000*"energy" + 0.000*"standing" + 0.000*"fly" + 0.000*"computer"'),
 (1,
  '0.000*"horrible" + 0.000*"career" + 0.000*"exact" + 0.000*"designed" + 0.000*"asking" + 0.000*"gigantic" + 0.000*"written" + 0.000*"ran" + 0.000*"healthy" + 0.000*"television"'),
 (2,
  '0.000*"rare" + 0.000*"look look" + 0.000*"comedy" + 0.000*"did did" + 0.000*"drunk" + 0.000*"beat" + 0.000*"time time" + 0.000*"expensive" + 0.000*"blow" + 0.000*"goal"')]

### LSA

In [58]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
doc_word.shape

(100, 618792)

In [60]:
# We have to convert `.toarray()` because the vectorizer returns a sparse matrix.
# For a big corpus, we would skip the dataframe and keep the output sparse.
pd.DataFrame(doc_word.toarray(), index=ps_ep_names, columns=ps_ep_list_tokenizer.get_feature_names()).head(10)

Unnamed: 0,aa,aa charge,aa couple,aa cross,aa denver,aa judah,aa look,aa meeting,aa palace,aa steve,...,zygote different,zyklon,zyklon gas,zyklon remove,zz,zz coffee,zz coming,zz decided,zz funk,zz riff
#1383 - Malcolm Gladwell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1382 - RZA & Donnell Rawlings,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1378 - Greg Fitzsimmons,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1375 - Edward Norton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1373 - Kyle Kulinski,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1371 - Andrew Santino,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1370 - Brian Grazer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1369 - Christopher Ryan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1368 - Edward Snowden,0.006962,0.010651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1366 - Richard Dawkins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(num_topics)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.00072207, 0.01119541, 0.01044549, 0.01041946, 0.01036292])

In [66]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa charge,aa couple,aa cross,aa denver,aa judah,aa look,aa meeting,aa palace,aa steve,...,zygote different,zyklon,zyklon gas,zyklon remove,zz,zz coffee,zz coming,zz decided,zz funk,zz riff
component_1,0.004,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,...,0.001,0.001,0.001,0.001,0.002,0.001,0.001,0.001,0.001,0.001
component_2,0.001,0.003,0.0,0.001,-0.001,0.001,0.0,-0.002,-0.001,0.0,...,0.002,0.003,0.003,0.003,-0.001,-0.001,0.0,-0.001,0.0,-0.001
component_3,-0.006,-0.001,-0.001,-0.001,-0.004,-0.001,-0.001,-0.0,-0.001,0.0,...,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.001,-0.0
component_4,0.002,-0.0,0.002,0.0,0.0,0.0,0.001,0.0,0.002,-0.001,...,0.003,0.001,0.001,0.001,-0.003,-0.0,0.0,-0.003,-0.001,-0.003
component_5,-0.004,-0.0,-0.001,-0.001,-0.002,-0.002,0.0,-0.0,0.0,-0.001,...,-0.0,-0.002,-0.002,-0.002,0.0,-0.001,0.004,-0.0,-0.002,-0.0


In [67]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [68]:
display_topics(lsa, ps_ep_list_tokenizer.get_feature_names(), 5)


Topic  0
comedy, fat, apparently, club, growing

Topic  1
economic, revolution, discussing, elected, justification

Topic  2
year way, volunteer, chief, welcoming, happens die

Topic  3
com enter, jre, seven thousand, month free, postage

Topic  4
strain, thought sort, pretty heavy, stunt, lenny bruce


In [69]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
Vt

Unnamed: 0,component_1,component_2,component_3,component_4,component_5
#1383 - Malcolm Gladwell,0.18865,0.11700,-0.14375,-0.01392,0.10970
#1382 - RZA & Donnell Rawlings,0.17352,0.01368,-0.07292,0.00202,-0.01293
#1378 - Greg Fitzsimmons,0.19821,0.02892,-0.09059,-0.07588,-0.17435
#1375 - Edward Norton,0.14573,0.01722,0.01402,0.08765,0.24600
#1373 - Kyle Kulinski,0.18779,0.15481,-0.06647,-0.03001,-0.05193
...,...,...,...,...,...
#1280 - Michael Yo,0.21437,-0.13083,-0.09648,-0.05392,-0.05380
#1279 - Jessimae Peluso,0.18865,0.05627,-0.07005,0.01533,-0.16422
#1278 - Kevin Hart,0.16575,-0.03266,0.01238,0.13365,-0.03518
#1277 - Gabrielle Reece,0.17820,-0.10176,0.03490,-0.15709,-0.09084


### NMF

In [70]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
pd.DataFrame(doc_word.toarray(), index=ps_ep_names, columns=ps_ep_list_tokenizer.get_feature_names()).head(10)

Unnamed: 0,aa,aa charge,aa couple,aa cross,aa denver,aa judah,aa look,aa meeting,aa palace,aa steve,...,zygote different,zyklon,zyklon gas,zyklon remove,zz,zz coffee,zz coming,zz decided,zz funk,zz riff
#1383 - Malcolm Gladwell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1382 - RZA & Donnell Rawlings,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1378 - Greg Fitzsimmons,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1375 - Edward Norton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1373 - Kyle Kulinski,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1371 - Andrew Santino,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1370 - Brian Grazer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1369 - Christopher Ryan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1368 - Edward Snowden,0.006962,0.010651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1366 - Richard Dawkins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
nmf_model = NMF(num_topics)
doc_topic = nmf_model.fit_transform(doc_word)

In [74]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa charge,aa couple,aa cross,aa denver,aa judah,aa look,aa meeting,aa palace,aa steve,...,zygote different,zyklon,zyklon gas,zyklon remove,zz,zz coffee,zz coming,zz decided,zz funk,zz riff
component_1,0.005,0.0,0.0,0.0,0.002,0.002,0.0,0.003,0.0,0.0,...,0.0,0.0,0.0,0.0,0.004,0.002,0.002,0.001,0.0,0.001
component_2,0.003,0.002,0.0,0.002,0.0,0.0,0.001,0.0,0.0,0.0,...,0.002,0.002,0.002,0.002,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,...,0.0,0.0,0.0,0.0,0.004,0.0,0.0,0.001,0.004,0.001
component_5,0.002,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
display_topics(nmf_model, ps_ep_list_tokenizer.get_feature_names(), 10)


Topic  0
oh good, bro, jesus christ, weekend, bar, oh oh, comic, dance, song, laughing

Topic  1
economic, revolution, discussing, discussion, complex, result, political, deeply, bias, entity

Topic  2
fun learn, long run, help body, trying catch, soy, hone ranch, jet lag, shot hit, ounce, remember doing

Topic  3
fan love, deliver, make don, mini, movie said, prep, able buy, accident, love death, best music

Topic  4
green athletic, maintaining zero, proven vitamin, food sourced, sourced ingredient, dairy gluten, compromise approach, approach formulation, jerry, mineral alkaline


In [76]:
H = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
H

Unnamed: 0,component_1,component_2,component_3,component_4,component_5
#1383 - Malcolm Gladwell,0.00000,0.00780,0.00000,0.00000,0.37409
#1382 - RZA & Donnell Rawlings,0.12964,0.00000,0.00000,0.00077,0.00000
#1378 - Greg Fitzsimmons,0.14801,0.00000,0.00000,0.00000,0.00000
#1375 - Edward Norton,0.00000,0.00000,0.00000,0.34823,0.00000
#1373 - Kyle Kulinski,0.00000,0.28154,0.00000,0.00000,0.01389
...,...,...,...,...,...
#1280 - Michael Yo,0.16026,0.00000,0.00000,0.00000,0.00000
#1279 - Jessimae Peluso,0.14076,0.00000,0.00000,0.00000,0.00000
#1278 - Kevin Hart,0.11884,0.00006,0.01037,0.00000,0.00000
#1277 - Gabrielle Reece,0.12883,0.00420,0.00000,0.00000,0.00000
