In [1]:
import sys
# Record the interpreter version that produced the outputs in this notebook.
print(f"Python Version: {sys.version}")

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [2]:
#######################
# standard code block #
#######################

# render inline figures as SVG
%config InlineBackend.figure_format = 'svg'

# auto reload imports that change
%load_ext autoreload
# only set to auto reload for marked imports
# (modules are marked with %aimport, e.g. credentials.cred further down)
%autoreload 1

In [3]:
import pandas as pd

## Pull Data

### Connect to Database

In [4]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


# connection settings for MongoClient, taken from the local credentials module
config = dict(
    host=cred.mongo_host,
    username=cred.mongo_user,
    password=cred.mongo_pass,
    authSource=cred.mongo_auth_db,
)

# open the client with those settings
client = MongoClient(**config)

# handles to the clean database and its podcasts collection
jre_clean = client['jre_clean']
podcasts_clean = jre_clean['podcasts']

### Get Podcasts from Podscribe

In [5]:
# query: podscribe-sourced episodes, restricted to the fields used below,
# capped at 200 documents
podscribe_filter = {'source':'podscribe'}
wanted_fields = {'_id':0, 'number':1, 'name':1, 'desc':1, 'text':1}

ps_ep_cursor = podcasts_clean.find(podscribe_filter, wanted_fields).limit(200)
ps_ep_mongo = list(ps_ep_cursor)

In [6]:
# split the episode documents into three parallel lists:
# transcript text, display label ("#<number> <name>") and description
ps_ep_list = [episode['text'] for episode in ps_ep_mongo]
ps_ep_names = [f"#{episode['number']} " + episode['name'] for episode in ps_ep_mongo]
ps_ep_descs = [episode['desc'] for episode in ps_ep_mongo]

In [7]:
# one row per episode label, with its description as the only column
ps_ep_df = pd.DataFrame({'Description': ps_ep_descs}, index=ps_ep_names)

In [8]:
# display the episode / description table built in the previous cell
ps_ep_df

Unnamed: 0,Description
#1383 - Malcolm Gladwell,"Malcolm Gladwell is a journalist, author, and ..."
#1382 - RZA & Donnell Rawlings,"RZA is a rapper, record producer, musician, ac..."
#1378 - Greg Fitzsimmons,Greg Fitzsimmons is a writer and stand-up come...
#1375 - Edward Norton,"Edward Norton is an actor, writer, producer, d..."
#1373 - Kyle Kulinski,"Kyle Kulinski is a political activist, progres..."
...,...
#1182 - Nick Kroll,"Nick Kroll is an actor, comedian, writer, and ..."
JRE MMA Show #44 with John Kavanagh & George Lockhart,Joe is joined by MMA coach & Brazilian jiu-jit...
#1181 - John Dudley,John Dudley is a pro archer and host of “Nock ...
#1180 - Everlast,Everlast is a Grammy Award-winning American ra...


## Clean Text

In [41]:
# setup clean
# clean_lemm gates both the simple_preprocess pass and the WordNet lemmatization below
clean_lemm = False
# spacy_ gates the optional SpaCy cells
spacy_ = False

### Clean

In [42]:
from gensim.utils import simple_preprocess

In [43]:
# tokenise every transcript with gensim's simple_preprocess and
# re-join the tokens with single spaces
if clean_lemm:
    ps_ep_list = [
        ' '.join(simple_preprocess(transcript))
        for transcript in ps_ep_list
    ]

### Stop Words

In [44]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity
# Read the word list inside a context manager so the file handle is closed
# as soon as the read finishes instead of being left open for the kernel's lifetime.
with open('stop_words/profanity.txt', newline='\n') as profanity_file:
    profanity = profanity_file.read().splitlines()

new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words
# (each word is listed once; the final stop-word collection is a set anyway)
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'think',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say'
]
new_stop_words += common_jre_words

# append to english stop words
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [45]:
from nltk.stem import WordNetLemmatizer

if clean_lemm:
    lemmatizer = WordNetLemmatizer()

    # lemmatize every transcript word by word (split on single spaces),
    # replacing the contents of ps_ep_list in place
    ps_ep_list[:] = [
        ' '.join(lemmatizer.lemmatize(word) for word in ep.split(' '))
        for ep in ps_ep_list
    ]

    # lemmatize the stop words too; the result is a plain list
    jre_stop_words = [lemmatizer.lemmatize(word) for word in jre_stop_words]

### SpaCy

In [46]:
if spacy_:
    import spacy
    from spacy import displacy
    from collections import Counter
    import en_core_web_sm

In [47]:
from IPython.display import clear_output

if spacy_:
    # load the small English SpaCy pipeline
    ps_spacy = en_core_web_sm.load()

    ps_spacy_list = []
    tot = len(ps_ep_list)

    # parse every transcript, showing the percentage of episodes started so far
    for i, doc in enumerate(ps_ep_list):
        # wait=True postpones the clear until the next output is available, so
        # the progress line is replaced in place instead of flickering
        clear_output(wait=True)
        print((i/tot)*100)
        ps_spacy_list.append(ps_spacy(doc))

In [48]:
# list the named entities (text and label) found in the first parsed episode
if spacy_:
    first_doc = ps_spacy_list[0]
    for ent in first_doc.ents:
        print(ent.text, ent.label_)

In [49]:
# NOTE(review): this cell repeats the entity listing of the cell directly above
if spacy_:
    first_episode_entities = ps_spacy_list[0].ents
    for ent in first_episode_entities:
        print(ent.text, ent.label_)

In [50]:
if spacy_:
    # keep only the content tokens of the first parsed episode:
    # anything flagged as stop word, punctuation or whitespace is dropped
    selected_tokens = [
        tok for tok in ps_spacy_list[0]
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]

## Tokenization

In [51]:
# Setup Tokenization
# True  -> the TF-IDF cell fits a TfidfVectorizer and the models use it
# False -> the Count Vectorize cell fits a CountVectorizer and the models use it
tf_idf = False

### TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

if tf_idf:
    # Create TF-IDF vectorizer
    # The same configuration is used whether or not the SpaCy cells are
    # enabled, so no branch on spacy_ is needed here.
    ps_ep_list_tfidf = TfidfVectorizer(
        ngram_range=(1,3), binary=True, stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b", max_df=0.4, min_df=0.01
    )

    # fit
    _ = ps_ep_list_tfidf.fit(ps_ep_list)

### Count Vectorize

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

if not tf_idf:
    # CountVectorizer settings: 1- to 3-grams over tokens of two or more
    # lowercase letters, custom stop words, with max_df / min_df document
    # frequency cut-offs of 0.4 and 0.01
    cvec_options = dict(
        ngram_range=(1, 3),
        stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b",
        max_df=0.4,
        min_df=0.01,
    )
    ps_ep_list_cvec = CountVectorizer(**cvec_options)

    # learn the vocabulary from the episode transcripts
    _ = ps_ep_list_cvec.fit(ps_ep_list)

## Models

### Model Setup

In [54]:
# Tokenizer: use whichever vectorizer the tf_idf flag selected above
ps_ep_list_tokenizer = ps_ep_list_tfidf if tf_idf else ps_ep_list_cvec

# how many topics / components every model below extracts
num_topics = 12

# labels component_1 ... component_<num_topics> for the result tables
col_names = [f'component_{n}' for n in range(1, num_topics + 1)]

# switch for the (slow) gensim LDA cells
run_lda = False

### LDA

In [55]:
# gensim
from gensim import corpora, models, similarities, matutils

# send log records (including gensim's) to the notebook at INFO level
import logging

log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)

In [56]:
# Build the term-document matrix for gensim:
# vectorize the episodes, then flip the result so the terms are the rows
if run_lda:
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).T

In [57]:
# Convert sparse matrix of counts to a gensim corpus
# (doc_word was transposed in the previous cell, so its rows are terms and
# its columns are episodes)
if run_lda:
    corpus = matutils.Sparse2Corpus(doc_word)

In [58]:
# invert the vectorizer vocabulary (term -> column id) into id -> term for gensim
if run_lda:
    id2word = {
        term_id: term
        for term, term_id in ps_ep_list_tokenizer.vocabulary_.items()
    }

In [59]:
# Create lda model (equivalent to "fit" in sklearn)
# random_state is fixed so the topics are reproducible across kernel restarts;
# without it every run can produce a different topic assignment.
if run_lda:
    lda = models.LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
        passes=50,
        random_state=42,
    )

In [60]:
# NOTE(review): the return value is discarded because the call sits inside an
# `if` block; presumably the topics are meant to be read from the INFO log
# configured above — confirm
if run_lda:
    lda.print_topics()

### LSA

In [61]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
# vectorize the episode transcripts with the selected tokenizer
# (this rebinds doc_word, which the LDA cells also assign when run_lda is True)
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
# (rows, columns) of the resulting matrix
doc_word.shape

(200, 253619)

In [63]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the components can change from run to run.
lsa = TruncatedSVD(n_components=num_topics, random_state=42)
doc_topic = lsa.fit_transform(doc_word)
# share of variance captured by each component
lsa.explained_variance_ratio_

array([0.00679653, 0.02379325, 0.02253895, 0.02102129, 0.01984403,
       0.01818871, 0.01797202, 0.01410057, 0.01230169, 0.01156521,
       0.01107264, 0.01045077])

In [64]:
# LSA term loadings: one row per component, one column per vocabulary term
lsa_terms = ps_ep_list_tokenizer.get_feature_names()
topic_word = pd.DataFrame(
    lsa.components_.round(3), index=col_names, columns=lsa_terms
)
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,...,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg ha,zuckerberg probably,zumba,zurich,zz
component_1,0.002,0.0,0.001,0.0,0.0,0.003,0.0,0.001,0.001,0.006,...,0.001,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.001
component_2,-0.001,-0.0,-0.0,-0.0,-0.0,-0.001,-0.0,-0.0,0.0,-0.001,...,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,-0.0,-0.0,-0.0,-0.0,-0.0,-0.002,-0.0,0.0,0.0,0.0,...,-0.0,-0.0,0.001,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_4,-0.0,-0.0,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.001,-0.004,...,-0.0,-0.0,0.002,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.001
component_5,-0.001,0.0,0.0,0.0,-0.0,0.002,0.0,-0.0,-0.0,0.003,...,-0.0,-0.0,-0.005,0.0,-0.001,-0.0,0.0,0.0,0.0,0.001
component_6,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.001,0.0,...,-0.0,-0.0,-0.002,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0
component_7,-0.0,-0.0,0.001,-0.0,-0.0,-0.0,0.0,0.001,0.001,-0.002,...,-0.0,-0.0,-0.002,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_8,-0.0,-0.0,-0.0,0.0,-0.0,0.001,0.0,0.001,0.001,-0.001,...,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
component_9,0.003,0.0,-0.001,0.0,-0.0,-0.002,-0.001,0.003,0.005,-0.002,...,-0.0,0.0,-0.005,-0.0,0.0,-0.0,-0.001,0.0,-0.0,0.0
component_10,0.001,0.0,0.002,0.0,0.0,-0.001,-0.001,-0.002,-0.003,-0.0,...,0.0,0.0,-0.008,-0.0,-0.001,0.0,-0.001,0.0,0.001,0.001


In [65]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the strongest terms of every component of a fitted model.

    For each row of ``model.components_`` a header line is printed, followed by
    the ``no_top_words`` terms with the largest absolute weight, strongest
    first. Terms whose weight is negative are prefixed with ``'NOT '``.

    Args:
        model: object exposing ``components_`` as a 2-D numeric array with one
            row per component (e.g. a fitted TruncatedSVD or NMF).
        feature_names: sequence mapping a column index of ``components_`` to
            its term.
        no_top_words: number of terms to print per component.
        topic_names: optional names indexed by component position. A missing or
            empty entry (including a sequence shorter than the number of
            components) falls back to the generic ``Component  <n>`` header.

    Returns:
        None. The result is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # Resolve the header name defensively: a short list or a dict without
        # this position must not abort the whole listing.
        try:
            name = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            name = None

        if not name:
            print("\nComponent ", ix+1)
        else:
            print("\nTopic: '", name, "'")

        # indices of the largest absolute weights, strongest first
        top_indices = abs(topic).argsort()[:-no_top_words - 1:-1]
        print(", ".join(
            feature_names[i] if topic[i] >= 0 else 'NOT ' + feature_names[i]
            for i in top_indices
        ))

#### Results

In [66]:
# ten strongest terms of every LSA component
display_topics(
    lsa,
    ps_ep_list_tokenizer.get_feature_names(),
    no_top_words=10,
)


Component  1
robot, policy, marijuana, deer, calorie, nazi, cannabis, squarespace, conspiracy, sober

Component  2
calorie, obesity, carbohydrate, insulin, hypothesis, gary, intake, fat cell, experiment, quantum

Component  3
quantum, mechanic, quantum mechanic, wave function, electron, physic, NOT calorie, simulation, probability, NOT obesity

Component  4
cannabis, marijuana, vaccine, medicine, cbd, schizophrenia, NOT quantum, psychosis, policy, mexico

Component  5
cannabis, NOT simulation, quantum, marijuana, mechanic, cbd, quantum mechanic, NOT artificial, NOT policy, NOT tweet

Component  6
vaccine, autism, poverty, measles, simulation, tropical, lyme, NOT policy, lyme disease, malaria

Component  7
simulation, NOT vaccine, cannabis, artificial, robot, NOT quantum, artificial intelligence, NOT policy, simulated, NOT mechanic

Component  8
cartel, mexico, border, mexican, wildlife, NOT cannabis, warden, NOT tweet, game warden, county

Component  9
nazi, NOT simulation, cia, germa

In [67]:
# LSA scores per episode: one row per episode label, one column per component
lsa_scores = doc_topic.round(5)
Vt = pd.DataFrame(lsa_scores, index=ps_ep_names, columns=col_names)

# episodes ranked by their component_2 score, highest first
Vt.sort_values(by='component_2', ascending=False)

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,component_11,component_12
#1267 - Gary Taubes & Stephan Guyenet,53.64907,257.44494,-71.88376,-9.33171,-3.48011,-21.03841,-21.84562,9.23877,2.93398,14.03102,-7.27986,-0.63042
#1352 - Sean Carroll,39.98117,53.01387,224.30656,-49.18978,69.81394,-4.83730,-52.20960,0.79514,-1.81572,22.45916,-14.19576,0.12193
#1350 - Nick Bostrom,47.28174,31.26428,57.70310,-10.58006,-84.03478,30.36238,176.85569,12.78818,-47.43659,64.13417,2.32829,-26.58286
#1201 - William von Hippel,45.82273,19.99387,-1.45803,-1.11021,-1.32438,11.90541,9.00349,-4.41667,-3.94784,-31.69640,12.97590,1.51193
#1234 - David Sinclair,37.73120,18.47354,-1.27767,2.24855,8.87452,11.84792,3.58531,-10.47699,0.96605,-32.98673,11.92394,-8.58427
...,...,...,...,...,...,...,...,...,...,...,...,...
#1355 - Mark Normand,55.60992,-9.97461,-10.99366,-14.35146,5.29112,-0.93181,-4.82074,-10.47367,-3.13818,13.48526,-1.03390,17.93723
#1319 - Joey Diaz,53.47813,-10.31434,-4.13080,-15.05351,16.25878,1.41199,3.04233,19.65961,8.23602,20.11550,-6.22719,7.90684
#1333 - Tom Papa,56.93588,-11.47647,-9.41836,-16.66622,11.28269,6.79029,-2.52369,10.85089,2.83498,7.98204,6.69464,-22.85937
#1329 - Brian Moses,55.07029,-11.58646,-11.88802,-11.54841,6.27471,2.25979,-3.18902,-2.02803,14.56906,18.79790,0.89533,-10.19168


### NMF

In [68]:
# re-vectorize the episodes for NMF (same call as in the LSA section)
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)

In [69]:
# Fix the seed so the factorization is reproducible across kernel restarts.
# NOTE: doc_topic is rebound here; from this cell on it holds the NMF
# episode weights, not the LSA projection computed earlier.
nmf_model = NMF(n_components=num_topics, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)

In [70]:
# NMF term weights: one row per component, one column per vocabulary term
nmf_terms = ps_ep_list_tokenizer.get_feature_names()
topic_word = pd.DataFrame(
    nmf_model.components_.round(3), index=col_names, columns=nmf_terms
)
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,...,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg ha,zuckerberg probably,zumba,zurich,zz
component_1,0.022,0.012,0.029,0.017,0.017,0.069,0.0,0.0,0.0,0.176,...,0.03,0.017,0.0,0.0,0.0,0.01,0.0,0.009,0.021,0.033
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.051,...,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.001,0.003,0.006,0.0,0.0,0.0,0.0,0.029,...,0.01,0.0,0.065,0.007,0.0,0.009,0.0,0.0,0.002,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,...,0.0,0.0,0.006,0.002,0.0,0.002,0.0,0.0,0.0,0.0
component_7,0.002,0.0,0.017,0.0,0.0,0.0,0.001,0.0,0.0,0.0,...,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_8,0.0,0.0,0.0,0.004,0.0,0.0,0.0,0.003,0.004,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_9,0.057,0.0,0.0,0.001,0.0,0.0,0.0,0.017,0.023,0.007,...,0.001,0.0,0.0,0.003,0.0,0.0,0.0,0.002,0.0,0.001
component_10,0.004,0.0,0.0,0.001,0.0,0.0,0.003,0.026,0.039,0.034,...,0.005,0.003,0.062,0.0,0.016,0.006,0.0,0.0,0.0,0.0


#### Results

In [71]:
# ten strongest terms of every NMF component
display_topics(
    nmf_model,
    ps_ep_list_tokenizer.get_feature_names(),
    no_top_words=10,
)


Component  1
sober, coke, cigarette, hip, squarespace, album, whoop, yoga, bert, tyson

Component  2
calorie, obesity, carbohydrate, insulin, hypothesis, gary, intake, fat cell, experiment, body fat

Component  3
quantum, mechanic, quantum mechanic, wave function, electron, physic, equation, probability, particle, spinning

Component  4
cannabis, marijuana, cbd, medicine, schizophrenia, thc, psychosis, depression, addiction, suicide

Component  5
tweet, conservative, banned, policy, context, journalist, trans, ideology, learn code, harassment

Component  6
vaccine, autism, poverty, measles, tropical, lyme, lyme disease, malaria, anti vaccine, tropical disease

Component  7
simulation, artificial, probability, simulated, biological, artificial intelligence, robot, technological, hypothesis, innovation

Component  8
mexico, cartel, border, mexican, marijuana, officer, warden, tijuana, chapo, federal

Component  9
nazi, german, cia, conspiracy, alex, moon, hitler, jones, yoga, sandy

Com

In [74]:
# NMF weights per episode: one row per episode label, one column per component
H = pd.DataFrame(doc_topic.round(5), index=ps_ep_names, columns=col_names)

# the 20 episodes weighted highest on component_7, shown with component_10 alongside
top_component_7 = H.sort_values(by='component_7', ascending=False).head(20)
top_component_7[['component_7', 'component_10']]

Unnamed: 0,component_7,component_10
#1350 - Nick Bostrom,15.15966,0.0
#1354 - The Black Keys,2.1394,0.0
#1188 - Lex Fridman,1.22981,6.55157
#1345 - Steve Aoki,1.08323,0.92041
#1211 - Dr. Ben Goertzel,1.02862,5.03861
#1366 - Richard Dawkins,0.49342,0.46968
#1233 - Brian Cox,0.43762,2.556
#1325 - Dr. Cornel West,0.36342,0.26935
#1308 - Eddie Bravo,0.35218,0.0
#1351 - Dan Aykroyd,0.34723,0.87772


In [79]:
# Look the episode up by matching its label text instead of an exact key:
# the labels are built as "#<number> <name>", so a key hard-coded with a
# leading space only matches stale kernel state and raises KeyError on a
# fresh Restart & Run All.
episode_label = '#1366 - Richard Dawkins'
episode_rows = H.loc[H.index.str.contains(episode_label, regex=False)]
# a single matching row is shown as a Series (components as the index);
# zero or several matches are shown as a DataFrame
episode_rows.squeeze(axis=0)

component_1     0.39721
component_2     0.20666
component_3     0.07823
component_4     0.02941
component_5     0.03744
component_6     0.04523
component_7     0.49342
component_8     0.00000
component_9     0.18868
component_10    0.46968
component_11    0.40609
component_12    0.04092
Name:  #1366 - Richard Dawkins, dtype: float64

In [77]:
# inspect the exact row labels (useful for spotting stray whitespace before label lookups)
H.index

Index([' #1383 - Malcolm Gladwell', ' #1382 - RZA & Donnell Rawlings',
       ' #1378 - Greg Fitzsimmons', ' #1375 - Edward Norton',
       ' #1373 - Kyle Kulinski', ' #1371 - Andrew Santino',
       ' #1370 - Brian Grazer', ' #1369 - Christopher Ryan',
       ' #1368 - Edward Snowden', ' #1366 - Richard Dawkins',
       ...
       ' #1186 - Marques Brownlee', ' #1185 - Kelly Slater',
       ' JRE MMA Show #46 with Ari Shaffir', ' #1184 - Roseanne Barr',
       ' #1183 - Andrew Santino', ' #1182 - Nick Kroll',
       ' JRE MMA Show #44 with John Kavanagh & George Lockhart',
       ' #1181 - John Dudley', ' #1180 - Everlast', ' #1179 - Nikki Glaser'],
      dtype='object', length=200)