In [36]:
import sys
print("Python Version:", sys.version)

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [37]:
#######################
# standard code block #
#######################

%config InlineBackend.figure_format = 'svg'

# auto reload imports that change
%load_ext autoreload
# only set to auto reload for marked imports
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import pandas as pd

### Connect to Database

In [39]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


config = {
  'host': cred.mongo_host,
  'username': cred.mongo_user,
  'password': cred.mongo_pass,
  'authSource': cred.mongo_auth_db
}

# get a mongo client
client = MongoClient(**config)

# use the raw database
jre_raw = client.jre_raw
podcasts_raw = jre_raw.podcasts

# use the clean database
jre_clean = client.jre_clean
podcasts_clean = jre_clean.podcasts

### Get Podcasts from Podscribe

In [40]:
# query
ps_ep_mongo = list(
    podcasts_raw
    .find({'source':'podscribe'},{'_id':0, 'text':1, 'name':1})
    .limit(100)
)

In [41]:
# pull out text and names
ps_ep_list = [x['text'] for x in ps_ep_mongo]
ps_ep_names = [x['name'] for x in ps_ep_mongo]

### Clean

In [42]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [43]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [44]:
ps_ep_list = [x.lower() for x in ps_ep_list]

### Stop Words

In [45]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity
profanity = (
    open('stop_words/profanity.txt', newline='\n')
    .read()
    .splitlines()
)

new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'right', 'think', 'know',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say'
]
new_stop_words += common_jre_words

# append to english stop words
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [46]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for i, ep in enumerate(ps_ep_list):
    ps_ep_list[i] = ' '.join([lemmatizer.lemmatize(x) for x in ep.split(' ')])
    
jre_stop_words = [lemmatizer.lemmatize(x) for x in jre_stop_words]

### TF-IDF

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
ps_ep_list_tfidf = TfidfVectorizer(
    ngram_range=(1,2), binary=True, stop_words=jre_stop_words,
    token_pattern="\\b[a-z][a-z]+\\b", max_df=0.5, min_df=0.02
)

# fit
ps_ep_list_tfidf.fit(ps_ep_list)

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=0.02, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['assfukka', 'toward', 'very', 'fill', 'them',
                            'hoer', 'ejaculated', 'all', 'bastard', 'wanker',
                            'please', 'dogging', 'have', 'fanny', 'meanwhile',
                            'latterly', 'during', 'either', 'shitfull',
                            'twatty', 'moreover', 'everywhere', 'dink',
                            'breast', 'of', 'pisser', 'per', 'fuckwhit', 'been',
                            'fuckhead', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

### Count Vectorize

In [111]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer for parsing/counting words
ps_ep_list_cvec = CountVectorizer(
    ngram_range=(1, 3), stop_words=jre_stop_words,
    token_pattern="\\b[a-z][a-z]+\\b",  max_df=0.5, min_df=0.02
)

# fit
ps_ep_list_cvec.fit(ps_ep_list)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=0.02,
                ngram_range=(1, 3), preprocessor=None,
                stop_words=['assfukka', 'toward', 'very', 'fill', 'them',
                            'hoer', 'ejaculated', 'all', 'bastard', 'wanker',
                            'please', 'dogging', 'have', 'fanny', 'meanwhile',
                            'latterly', 'during', 'either', 'shitfull',
                            'twatty', 'moreover', 'everywhere', 'dink',
                            'breast', 'of', 'pisser', 'per', 'fuckwhit', 'been',
                            'fuckhead', ...],
                strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
                tokenizer=None, vocabulary=None)

### Choose Tokenizer

In [112]:
ps_ep_list_tokenizer = ps_ep_list_tfidf

### Model Setup

In [132]:
# number of topics
num_topics = 5

# column names
col_names = ['component_'+str(x) for x in range(1,num_topics+1)]

# run LDA?
run_lda = True

### LDA

In [133]:
# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [134]:
# Create the term-document matrix
# Transpose it so the terms are the rows
if run_lda:
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).transpose()

In [135]:
# Convert sparse matrix of counts to a gensim corpus
if run_lda:
    corpus = matutils.Sparse2Corpus(doc_word)

In [136]:
if run_lda:    
    id2word = dict((v, k) for k, v in ps_ep_list_tokenizer.vocabulary_.items())

In [137]:
# Create lda model (equivalent to "fit" in sklearn)
if run_lda:    
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=50)

2020-02-19 20:03:20,751 : INFO : using symmetric alpha at 0.2
2020-02-19 20:03:20,753 : INFO : using symmetric eta at 0.2
2020-02-19 20:03:20,803 : INFO : using serial LDA version on this node
2020-02-19 20:03:20,880 : INFO : running online (multi-pass) LDA training, 5 topics, 50 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence threshold of 0.001000
2020-02-19 20:03:24,843 : INFO : -67.602 per-word bound, 224038040386537127936.0 perplexity estimate based on a held-out corpus of 100 documents with 6647 words
2020-02-19 20:03:24,845 : INFO : PROGRESS: pass 0, at document #100/100
2020-02-19 20:03:25,679 : INFO : topic #0 (0.200): 0.000*"bowl" + 0.000*"truly" + 0.000*"stress" + 0.000*"communicate" + 0.000*"did guy" + 0.000*"paying attention" + 0.000*"medical" + 0.000*"don hear" + 0.000*"metal" + 0.000*"used used"
2020-02-19 20:03:25,683 : INFO : topic #1 (0.200): 0.000*"fo

2020-02-19 20:03:45,225 : INFO : topic #1 (0.200): 0.000*"application" + 0.000*"truly" + 0.000*"legitimate" + 0.000*"train day" + 0.000*"alien" + 0.000*"station" + 0.000*"result" + 0.000*"approach" + 0.000*"function" + 0.000*"invest"
2020-02-19 20:03:45,229 : INFO : topic #2 (0.200): 0.000*"everybody thank" + 0.000*"flow" + 0.000*"specie" + 0.000*"prison" + 0.000*"bye everybody" + 0.000*"stuff ve" + 0.000*"guard" + 0.000*"pleasure" + 0.000*"electric" + 0.000*"november"
2020-02-19 20:03:45,232 : INFO : topic #3 (0.200): 0.000*"critical" + 0.000*"app second" + 0.000*"investable" + 0.000*"experience train" + 0.000*"doe make" + 0.000*"nutrient" + 0.000*"bank transfer" + 0.000*"don try" + 0.000*"dry" + 0.000*"come home"
2020-02-19 20:03:45,236 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"justin ren" + 0.000*"explained" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"tie" + 0.000*"lot different" + 0.000*"surprised"
2020-02-19 20:03:45,238 : INFO 

2020-02-19 20:04:02,872 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"justin ren" + 0.000*"explained" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"tie" + 0.000*"lot different" + 0.000*"surprised"
2020-02-19 20:04:02,874 : INFO : topic diff=0.001761, rho=0.288675
2020-02-19 20:04:05,916 : INFO : -14.599 per-word bound, 24816.3 perplexity estimate based on a held-out corpus of 100 documents with 6647 words
2020-02-19 20:04:05,917 : INFO : PROGRESS: pass 11, at document #100/100
2020-02-19 20:04:06,370 : INFO : topic #0 (0.200): 0.000*"bowl" + 0.000*"stress" + 0.000*"used used" + 0.000*"communicate" + 0.000*"truly" + 0.000*"method" + 0.000*"free shipping" + 0.000*"paying attention" + 0.000*"chart" + 0.000*"metal"
2020-02-19 20:04:06,373 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"s

2020-02-19 20:04:23,915 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:04:23,919 : INFO : topic #2 (0.200): 0.000*"everybody thank" + 0.000*"flow" + 0.000*"specie" + 0.000*"prison" + 0.000*"bye everybody" + 0.000*"stuff ve" + 0.000*"november" + 0.000*"guard" + 0.000*"pleasure" + 0.000*"electric"
2020-02-19 20:04:23,922 : INFO : topic #3 (0.200): 0.000*"critical" + 0.000*"app second" + 0.000*"investable" + 0.000*"experience train" + 0.000*"doe make" + 0.000*"nutrient" + 0.000*"bank transfer" + 0.000*"don try" + 0.000*"dry" + 0.000*"come home"
2020-02-19 20:04:23,926 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"justin ren" + 0.000*"explained" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"tie" + 0.000*"lot different" + 0.000*"surprised"
2020-02

2020-02-19 20:04:41,551 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"justin ren" + 0.000*"explained" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"tie" + 0.000*"lot different" + 0.000*"surprised"
2020-02-19 20:04:41,553 : INFO : topic diff=0.000075, rho=0.208514
2020-02-19 20:04:44,604 : INFO : -14.597 per-word bound, 24782.3 perplexity estimate based on a held-out corpus of 100 documents with 6647 words
2020-02-19 20:04:44,604 : INFO : PROGRESS: pass 22, at document #100/100
2020-02-19 20:04:45,058 : INFO : topic #0 (0.200): 0.000*"bowl" + 0.000*"stress" + 0.000*"used used" + 0.000*"communicate" + 0.000*"truly" + 0.000*"method" + 0.000*"free shipping" + 0.000*"chart" + 0.000*"paying attention" + 0.000*"metal"
2020-02-19 20:04:45,060 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"s

2020-02-19 20:05:02,678 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:05:02,682 : INFO : topic #2 (0.200): 0.000*"everybody thank" + 0.000*"flow" + 0.000*"specie" + 0.000*"prison" + 0.000*"november" + 0.000*"stuff ve" + 0.000*"bye everybody" + 0.000*"electric" + 0.000*"pleasure" + 0.000*"guard"
2020-02-19 20:05:02,687 : INFO : topic #3 (0.200): 0.000*"critical" + 0.000*"app second" + 0.000*"investable" + 0.000*"experience train" + 0.000*"nutrient" + 0.000*"doe make" + 0.000*"bank transfer" + 0.000*"don try" + 0.000*"dry" + 0.000*"come home"
2020-02-19 20:05:02,690 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"justin ren" + 0.000*"explained" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"tie" + 0.000*"lot different" + 0.000*"aunt"
2020-02-19 2

2020-02-19 20:05:20,420 : INFO : topic #4 (0.200): 0.000*"thank thank" + 0.000*"yes don" + 0.000*"counter" + 0.000*"explained" + 0.000*"justin ren" + 0.000*"pasta" + 0.000*"teaching" + 0.000*"day bank" + 0.000*"nate" + 0.000*"versa"
2020-02-19 20:05:20,422 : INFO : topic diff=0.000007, rho=0.171499
2020-02-19 20:05:23,482 : INFO : -14.597 per-word bound, 24780.1 perplexity estimate based on a held-out corpus of 100 documents with 6647 words
2020-02-19 20:05:23,483 : INFO : PROGRESS: pass 33, at document #100/100
2020-02-19 20:05:23,935 : INFO : topic #0 (0.200): 0.000*"bowl" + 0.000*"used used" + 0.000*"stress" + 0.000*"communicate" + 0.000*"truly" + 0.000*"chart" + 0.000*"providing" + 0.000*"free shipping" + 0.000*"method" + 0.000*"liberal"
2020-02-19 20:05:23,938 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"

2020-02-19 20:05:41,537 : INFO : topic #0 (0.200): 0.000*"liberal progressive" + 0.000*"york state" + 0.000*"comparison rest" + 0.000*"wrong true" + 0.000*"oh christ" + 0.000*"went dark" + 0.000*"movement thank" + 0.000*"thinking little" + 0.000*"hmm called" + 0.000*"sleep slept"
2020-02-19 20:05:41,539 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:05:41,543 : INFO : topic #2 (0.200): 0.000*"ve launched" + 0.000*"com gan" + 0.000*"did played" + 0.000*"fact kind" + 0.000*"yes longer" + 0.000*"jaw hit" + 0.000*"told wow" + 0.000*"started feeling" + 0.000*"okay end" + 0.000*"hit floor"
2020-02-19 20:05:41,546 : INFO : topic #3 (0.200): 0.000*"critical" + 0.000*"app second" + 0.000*"nutrient" + 0.000*"experience train" + 0.000*"investable" + 0.000*"come home" + 0.000*"doe make" + 0.000*"let 

2020-02-19 20:05:59,156 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:05:59,159 : INFO : topic #2 (0.200): 0.000*"com gan" + 0.000*"liberal progressive" + 0.000*"ve launched" + 0.000*"preface" + 0.000*"joni" + 0.000*"yes amazing" + 0.000*"belief sort" + 0.000*"ridiculous group" + 0.000*"qualified candidate" + 0.000*"history sort"
2020-02-19 20:05:59,162 : INFO : topic #3 (0.200): 0.000*"critical" + 0.000*"woman saying" + 0.000*"revolutionary" + 0.000*"effective way" + 0.000*"knew mean" + 0.000*"stretched" + 0.000*"feel depressed" + 0.000*"understand body" + 0.000*"book come" + 0.000*"generalizing"
2020-02-19 20:05:59,165 : INFO : topic #4 (0.200): 0.000*"thinking eventually" + 0.000*"experience make" + 0.000*"civilization culture" + 0.000*"try yes" + 0.000*"rotate" + 0.000*"course yes" +

2020-02-19 20:06:16,713 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:06:16,716 : INFO : topic #2 (0.200): 0.000*"preface" + 0.000*"liberal progressive" + 0.000*"ridiculous group" + 0.000*"angry work" + 0.000*"com gan" + 0.000*"sort horrible" + 0.000*"joni" + 0.000*"history sort" + 0.000*"thinking eventually" + 0.000*"sip recruiter"
2020-02-19 20:06:16,719 : INFO : topic #3 (0.200): 0.000*"understand body" + 0.000*"preface" + 0.000*"feel depressed" + 0.000*"angry work" + 0.000*"editorial" + 0.000*"sort feeling" + 0.000*"ridiculous group" + 0.000*"liberal progressive" + 0.000*"wallet cash" + 0.000*"previous book"
2020-02-19 20:06:16,724 : INFO : topic #4 (0.200): 0.000*"thinking eventually" + 0.000*"experience make" + 0.000*"civilization culture" + 0.000*"ridiculous group" + 0.000*"libera

In [138]:
if run_lda:
    lda.print_topics()

2020-02-19 20:06:20,331 : INFO : topic #0 (0.200): 0.000*"liberal progressive" + 0.000*"angry work" + 0.000*"ridiculous group" + 0.000*"preface" + 0.000*"sip recruiter" + 0.000*"person belief" + 0.000*"act wasn" + 0.000*"sort horrible" + 0.000*"country especially" + 0.000*"incredible number"
2020-02-19 20:06:20,334 : INFO : topic #1 (0.200): 0.000*"train day" + 0.000*"application" + 0.000*"approach" + 0.000*"truly" + 0.000*"invest" + 0.000*"experience train" + 0.000*"building pygmy" + 0.000*"charity building" + 0.000*"summer" + 0.000*"way send"
2020-02-19 20:06:20,337 : INFO : topic #2 (0.200): 0.000*"ridiculous group" + 0.000*"angry work" + 0.000*"preface" + 0.000*"liberal progressive" + 0.000*"com gan" + 0.000*"thinking eventually" + 0.000*"joni" + 0.000*"history sort" + 0.000*"sip recruiter" + 0.000*"person belief"
2020-02-19 20:06:20,340 : INFO : topic #3 (0.200): 0.000*"preface" + 0.000*"understand body" + 0.000*"ridiculous group" + 0.000*"angry work" + 0.000*"liberal progressive"

### LSA

In [120]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [121]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
doc_word.shape

(100, 108673)

In [122]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(num_topics)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.00150386, 0.01553777, 0.0119959 , 0.01179166, 0.01135285])

In [123]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaron,aav,aavs,ab,abalone,abandon,...,zoom zoom,zoomed,zooming,zu,zubi,zucchini,zuckerberg,zuckerberg facebook,zumba,zz
component_1,0.005,0.001,0.001,0.002,0.005,0.002,0.002,0.007,0.001,0.006,...,0.003,0.002,0.001,0.001,0.002,0.001,0.005,0.001,0.001,0.002
component_2,0.002,-0.002,-0.001,-0.0,-0.002,0.003,0.003,-0.002,0.001,0.006,...,0.001,-0.002,0.002,0.001,-0.001,-0.002,0.009,0.005,-0.003,-0.004
component_3,0.001,0.001,-0.002,-0.001,-0.004,0.01,0.012,0.004,-0.001,-0.003,...,-0.004,0.002,0.002,-0.005,-0.002,0.001,-0.006,-0.003,0.002,0.002
component_4,0.005,0.001,-0.003,-0.001,0.005,-0.008,-0.008,0.004,-0.003,0.0,...,-0.0,-0.004,0.005,-0.002,-0.0,0.002,-0.003,0.001,-0.002,-0.0
component_5,-0.001,-0.0,0.005,0.001,0.005,-0.001,-0.001,-0.006,0.0,-0.002,...,-0.002,0.002,-0.004,-0.003,0.001,0.001,-0.004,-0.001,0.0,0.003


In [124]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [125]:
display_topics(lsa, ps_ep_list_tokenizer.get_feature_names(), 5)


Topic  0
summer, way send, support good, everybody thank, invest

Topic  1
economic, revolution, radically, entity, transparent

Topic  2
brought expressvpn, expressvpn, expressvpn com, learn brought, computer phone

Topic  3
animal product, highest quality, harmful chemical, psychological, formulation

Topic  4
sleep make, use code, loop strap, wearing whoop, card fee


In [126]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
Vt

Unnamed: 0,component_1,component_2,component_3,component_4,component_5
#1383 - Malcolm Gladwell,0.26906,0.03806,-0.02832,0.05508,-0.08561
#1382 - RZA & Donnell Rawlings,0.25006,-0.06743,-0.01837,-0.10845,-0.13158
#1378 - Greg Fitzsimmons,0.27514,-0.05180,-0.03012,-0.09914,0.04494
#1375 - Edward Norton,0.19524,-0.02456,0.05802,0.02960,-0.02234
#1373 - Kyle Kulinski,0.27395,0.23410,-0.10398,-0.02188,0.11881
...,...,...,...,...,...
#1280 - Michael Yo,0.29219,-0.08155,-0.03396,0.01267,-0.09156
#1279 - Jessimae Peluso,0.27107,-0.05227,-0.03853,-0.04787,-0.10910
#1278 - Kevin Hart,0.21349,-0.03098,-0.03489,0.07325,-0.03564
#1277 - Gabrielle Reece,0.24610,-0.02092,-0.12910,0.10405,-0.15772


### NMF

In [127]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)

In [128]:
nmf_model = NMF(num_topics)
doc_topic = nmf_model.fit_transform(doc_word)

In [129]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaron,aav,aavs,ab,abalone,abandon,...,zoom zoom,zoomed,zooming,zu,zubi,zucchini,zuckerberg,zuckerberg facebook,zumba,zz
component_1,0.008,0.004,0.004,0.003,0.008,0.0,0.001,0.012,0.001,0.006,...,0.005,0.003,0.001,0.001,0.004,0.004,0.003,0.0,0.003,0.006
component_2,0.006,0.0,0.0,0.002,0.002,0.0,0.0,0.002,0.002,0.009,...,0.004,0.0,0.003,0.003,0.001,0.0,0.014,0.006,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.006,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.001,0.01,0.0,0.0,0.0,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,0.0
component_5,0.001,0.0,0.0,0.001,0.0,0.018,0.017,0.006,0.001,0.003,...,0.0,0.008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
display_topics(nmf_model, ps_ep_list_tokenizer.get_feature_names(), 10)


Topic  0
ate, workout, vega, jitsu, joey, oh sure, friday, yoga, girlfriend, jacked

Topic  1
transparent, economic, discussion, decision making, bias, citizen, democracy, disagree, donald trump, democrat

Topic  2
offer boost, packed premium, reward come, come packed, instant reward, premium feature, card offer, feature credit, place check, required instant

Topic  3
recruiter need, cooter scan, technology zipper, job web, leading job, site day, candidate miss, analyzes spotlight, thousand resume, candidate site

Topic  4
mattock company, kind went, kind said, mane chaga, aircraft, government doesn, mushroom elixir, refresh, mane mushroom, fo


In [131]:
H = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
H

Unnamed: 0,component_1,component_2,component_3,component_4,component_5
#1383 - Malcolm Gladwell,0.07879,0.17290,0.01023,0.00000,0.00000
#1382 - RZA & Donnell Rawlings,0.14471,0.00905,0.00000,0.02022,0.00000
#1378 - Greg Fitzsimmons,0.14364,0.05576,0.00000,0.00082,0.00000
#1375 - Edward Norton,0.10763,0.00978,0.01659,0.00215,0.00569
#1373 - Kyle Kulinski,0.00000,0.39448,0.00000,0.00000,0.00000
...,...,...,...,...,...
#1280 - Michael Yo,0.16886,0.02438,0.00000,0.00000,0.00000
#1279 - Jessimae Peluso,0.14096,0.00000,0.00000,0.00573,0.09631
#1278 - Kevin Hart,0.11866,0.02490,0.00000,0.00321,0.00000
#1277 - Gabrielle Reece,0.12894,0.03367,0.01463,0.00000,0.00238
