In [1]:
import sys
# record the interpreter version this notebook was run with
print(f"Python Version: {sys.version}")

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [2]:
#######################
# standard code block #
#######################

# render inline figures as SVG
%config InlineBackend.figure_format = 'svg'

# load the autoreload extension so edited modules can be re-imported automatically
%load_ext autoreload
# mode 1: only modules registered with %aimport are reloaded
%autoreload 1

In [3]:
import pandas as pd

## Pull Data

### Connect to Database

In [4]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


# connection settings, all taken from the local credentials module
config = dict(
    host=cred.mongo_host,
    username=cred.mongo_user,
    password=cred.mongo_pass,
    authSource=cred.mongo_auth_db,
)

# open a mongo client with those settings
client = MongoClient(**config)

# handles to the clean database and its podcasts collection
jre_clean = client['jre_clean']
podcasts_clean = jre_clean['podcasts']

### Get Podcasts from Podscribe

In [5]:
# query: podscribe episodes only, keeping just the fields used below,
# capped at 200 documents
ps_filter = {'source': 'podscribe'}
ps_projection = {'_id': 0, 'number': 1, 'name': 1, 'desc': 1, 'text': 1}
ps_ep_mongo = list(podcasts_clean.find(ps_filter, ps_projection).limit(200))

In [6]:
# per-episode list of transcript segments
ps_ep_text_list = [episode['text'] for episode in ps_ep_mongo]
# one space-joined transcript string per episode
ps_ep_list = [
    ' '.join(segment['text'] for segment in segments)
    for segments in ps_ep_text_list
]
# display labels of the form "#<number> <name>"
ps_ep_names = [
    f"#{episode['number']} " + episode['name']
    for episode in ps_ep_mongo
]
# episode descriptions, in the same order as the names
ps_ep_descs = [episode['desc'] for episode in ps_ep_mongo]

In [7]:
# one-column table of descriptions, indexed by the episode label
ps_ep_df = pd.DataFrame({'Description': ps_ep_descs}, index=ps_ep_names)

In [8]:
# preview the episode / description table
ps_ep_df

Unnamed: 0,Description
#1383 Malcolm Gladwell,"Malcolm Gladwell is a journalist, author, and ..."
#1382 RZA & Donnell Rawlings,"RZA is a rapper, record producer, musician, ac..."
#1378 Greg Fitzsimmons,Greg Fitzsimmons is a writer and stand-up come...
#1375 Edward Norton,"Edward Norton is an actor, writer, producer, d..."
#1373 Kyle Kulinski,"Kyle Kulinski is a political activist, progres..."
...,...
#1182 Nick Kroll,"Nick Kroll is an actor, comedian, writer, and ..."
#44 John Kavanagh & George Lockhart,Joe is joined by MMA coach & Brazilian jiu-jit...
#1181 John Dudley,John Dudley is a pro archer and host of “Nock ...
#1180 Everlast,Everlast is a Grammy Award-winning American ra...


## Clean Text

In [9]:
# setup clean
# clean_lemm: when True, the simple_preprocess and lemmatization cells below run
clean_lemm = False
# spacy_: when True, the spaCy cells below run
spacy_ = False

### Clean

In [10]:
from gensim.utils import simple_preprocess

In [11]:
if clean_lemm:
    # tokenize each transcript with gensim's simple_preprocess and re-join the tokens
    ps_ep_list = [
        ' '.join(simple_preprocess(transcript))
        for transcript in ps_ep_list
    ]

### Stop Words

In [12]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity: one entry per line in stop_words/profanity.txt.
# Read inside a context manager so the file handle is always closed.
with open('stop_words/profanity.txt', newline='\n') as profanity_file:
    profanity = profanity_file.read().splitlines()

# lower-cased so the entries match lower-case tokens
new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'right', 'think', 'know',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say'
]
new_stop_words += common_jre_words

# append to english stop words (the union also removes any duplicates)
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [13]:
from nltk.stem import WordNetLemmatizer

if clean_lemm:
    lemmatizer = WordNetLemmatizer()

    # lemmatize every space-separated token of every transcript, updating the list in place
    ps_ep_list[:] = [
        ' '.join(map(lemmatizer.lemmatize, transcript.split(' ')))
        for transcript in ps_ep_list
    ]

    # run the stop words through the same lemmatizer (result is a list)
    jre_stop_words = list(map(lemmatizer.lemmatize, jre_stop_words))

### SpaCy

In [14]:
if spacy_:
    import spacy
    from spacy import displacy
    from collections import Counter
    import en_core_web_sm

In [15]:
from IPython.display import clear_output

if spacy_:
    # load the small English spaCy pipeline
    ps_spacy = en_core_web_sm.load()

    ps_spacy_list = []
    tot = len(ps_ep_list)

    # run every transcript through the pipeline, printing percent progress
    for i, doc in enumerate(ps_ep_list):
        # clear the previous progress line before printing the new one
        clear_output()
        print((i/tot)*100)
        ps_spacy_list.append(ps_spacy(doc))

In [16]:
if spacy_:
    # list the named entities found in the first processed episode
    first_doc = ps_spacy_list[0]
    for entity in first_doc.ents:
        print(entity.text, entity.label_)

In [17]:
# NOTE(review): this cell repeats the entity listing of the cell directly above
if spacy_:
    # print text and label of each entity in the first processed episode
    for ent in ps_spacy_list[0].ents:
        print(ent.text, ent.label_)

In [18]:
if spacy_:
    # keep tokens of the first episode that are not stop words, punctuation or whitespace
    selected_tokens = [
        tok
        for tok in ps_spacy_list[0]
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]

## Tokenization

In [19]:
# Setup Tokenization
# tf_idf: True -> fit a TfidfVectorizer, False -> fit a CountVectorizer
tf_idf = False

### TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

if tf_idf:
    # Create TF-IDF vectorizer. The same settings apply whether or not the
    # spaCy step is enabled, so there is a single construction:
    #   - 1- to 3-grams of lower-case alphabetic tokens of length >= 2
    #   - binary term presence instead of raw counts
    #   - custom stop words, plus document-frequency cut-offs (max 40%, min 1%)
    ps_ep_list_tfidf = TfidfVectorizer(
        ngram_range=(1,3), binary=True, stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b", max_df=0.4, min_df=0.01
    )

    # fit
    _ = ps_ep_list_tfidf.fit(ps_ep_list)

### Count Vectorize

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

if not tf_idf:
    # settings for the word/n-gram counter
    cvec_options = dict(
        ngram_range=(1, 3),
        stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b",
        max_df=0.4,
        min_df=0.01,
    )
    ps_ep_list_cvec = CountVectorizer(**cvec_options)

    # learn the vocabulary from the episode transcripts
    _ = ps_ep_list_cvec.fit(ps_ep_list)

## Models

### Model Setup

In [22]:
# Tokenizer: the vectorizer fitted above, selected by the tf_idf flag
ps_ep_list_tokenizer = ps_ep_list_tfidf if tf_idf else ps_ep_list_cvec

# how many topics / components every model is asked for
num_topics = 12

# labels "component_1" ... "component_<num_topics>" for the result tables
col_names = [f'component_{k}' for k in range(1, num_topics + 1)]

# switch for the (slow) LDA cells
run_lda = False

### LDA

In [23]:
# gensim
from gensim import corpora, models, similarities, matutils

# send INFO-level log records (gensim reports training progress there) to the root logger
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [24]:
# Build the term-document matrix: vectorize the transcripts,
# then flip the result so that the terms are on the rows
if run_lda:
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).T

In [25]:
# Convert sparse matrix of counts to a gensim corpus
# (only when LDA is enabled; uses the transposed doc_word built in the previous cell)
if run_lda:
    corpus = matutils.Sparse2Corpus(doc_word)

In [26]:
if run_lda:
    # invert the vectorizer vocabulary (term -> column id) into id -> term
    id2word = {
        idx: term
        for term, idx in ps_ep_list_tokenizer.vocabulary_.items()
    }

In [27]:
# Create lda model (equivalent to "fit" in sklearn)
# random_state is fixed so repeated runs produce the same topics
if run_lda:
    lda = models.LdaModel(
        corpus=corpus, num_topics=num_topics, id2word=id2word, passes=50,
        random_state=42
    )

In [28]:
# show the fitted LDA topics (only when LDA is enabled)
if run_lda:
    lda.print_topics()

### LSA

In [29]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# document-term matrix from the fitted vectorizer (documents on the rows)
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
# (n_documents, n_terms)
doc_word.shape

(200, 238457)

In [31]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# random_state is fixed so the randomized solver gives the same result on every run
lsa = TruncatedSVD(n_components=num_topics, random_state=42)
# document-by-component scores
doc_topic = lsa.fit_transform(doc_word)
# share of variance explained by each component
lsa.explained_variance_ratio_

array([0.00696505, 0.02389283, 0.02194264, 0.02111012, 0.01956895,
       0.01632177, 0.01595566, 0.01317134, 0.01241334, 0.01168444,
       0.01088926, 0.01030814])

In [32]:
# component-by-term weights of the LSA model, rounded to 3 decimals
topic_word = pd.DataFrame(
    data=lsa.components_.round(3),
    index=col_names,
    columns=ps_ep_list_tokenizer.get_feature_names(),
)
topic_word

Unnamed: 0,aa,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,ab safety,...,zu,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg probably,zumba,zurich,zz
component_1,0.002,0.001,0.0,0.0,0.003,0.0,0.001,0.001,0.002,0.0,...,0.001,0.001,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.001
component_2,-0.001,-0.0,-0.0,-0.0,-0.002,-0.0,0.0,0.0,-0.001,-0.0,...,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0
component_3,-0.001,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.0,-0.001,0.0,...,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.001
component_4,-0.0,0.0,-0.0,0.0,0.001,0.0,0.0,0.0,0.0,-0.0,...,-0.0,0.0,0.0,-0.001,0.0,-0.0,0.0,0.0,0.0,0.0
component_5,0.001,-0.0,-0.0,-0.0,-0.002,-0.0,0.0,-0.0,-0.001,0.0,...,0.001,-0.0,0.0,0.006,-0.0,0.001,-0.0,-0.0,-0.0,-0.001
component_6,-0.0,0.0,-0.0,-0.0,-0.001,0.0,0.001,0.002,-0.0,-0.0,...,-0.001,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,0.0,-0.0,-0.0
component_7,-0.0,-0.001,-0.0,0.0,0.001,-0.0,-0.001,-0.001,-0.0,0.0,...,0.001,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0
component_8,-0.0,-0.001,-0.0,-0.0,0.003,0.002,0.001,0.001,0.001,0.0,...,-0.001,-0.001,-0.001,0.005,0.001,0.0,0.001,0.0,-0.001,-0.001
component_9,-0.003,-0.001,-0.0,0.0,0.002,0.001,-0.002,-0.002,-0.001,-0.0,...,0.0,0.0,0.0,0.004,0.001,-0.0,0.0,0.0,0.0,-0.001
component_10,0.002,-0.002,-0.0,-0.0,0.001,0.001,0.002,0.003,0.001,-0.001,...,0.0,0.0,-0.001,0.001,0.001,-0.0,0.0,0.0,-0.001,-0.001


In [33]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-magnitude terms of every component of a fitted model.

    For each row of ``model.components_`` the ``no_top_words`` entries with
    the largest absolute weight are printed, strongest first.  A term whose
    weight is negative is prefixed with ``NOT ``.

    Parameters
    ----------
    model : object
        Anything exposing a ``components_`` array with one row per component.
    feature_names : sequence of str
        Term labels, indexed like the columns of ``components_``.
    no_top_words : int
        Number of terms to print per component.
    topic_names : sequence of str, optional
        Custom heading per component.  When it is missing, too short, or the
        entry for a component is empty, the numbered ``Component  N`` heading
        is printed instead.
    """
    for ix, topic in enumerate(model.components_):
        # Look up the custom label defensively: a label list shorter than the
        # number of components must not abort the listing.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nComponent ", ix+1)
        else:
            print("\nTopic: '", label, "'")

        # indices ordered by descending absolute weight, limited to the top N
        top_indices = abs(topic).argsort()[::-1][:no_top_words]
        print(", ".join(
            feature_names[i] if topic[i] >= 0 else 'NOT ' + feature_names[i]
            for i in top_indices
        ))

#### Results

In [34]:
# top 10 terms (by absolute weight) of each LSA component
display_topics(lsa, ps_ep_list_tokenizer.get_feature_names(), 10)


Component  1
marijuana, bears, deer, cannabis, policy, squarespace, sober, artificial, jones, alex

Component  2
quantum, mechanics, quantum mechanics, function, wave function, physics, electron, probability, worlds, simulation

Component  3
cannabis, obesity, calories, marijuana, insulin, increase, cells, NOT quantum, cbd, carbohydrate

Component  4
NOT cannabis, calories, obesity, NOT marijuana, insulin, carbohydrate, cells, gary, intake, NOT cbd

Component  5
NOT cannabis, simulation, accounts, policy, artificial, NOT quantum, congress, banned, campaign, context

Component  6
simulation, vaccine, diseases, autism, vaccines, artificial, poverty, measles, artificial intelligence, ai

Component  7
vaccine, NOT simulation, diseases, autism, vaccines, poverty, NOT artificial, measles, NOT artificial intelligence, NOT cannabis

Component  8
wildlife, mexico, deer, cartel, elk, hunter, bears, hunters, cartels, border

Component  9
NOT mexico, NOT cartel, NOT aliens, NOT cia, NOT cartels, 

In [35]:
# episode-by-component scores from LSA, rounded to 5 decimals
Vt = pd.DataFrame(data=doc_topic.round(5), index=ps_ep_names, columns=col_names)
# episodes ranked by their score on component_2, highest first
Vt.sort_values(by='component_2', ascending=False)

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,component_11,component_12
#1352 Sean Carroll,40.43823,244.01273,-53.72981,-11.25202,-43.97437,-21.49760,22.23382,-10.71187,-11.98043,-9.27094,5.05078,9.40114
#1203 Eric Weinstein,57.20591,46.15782,-12.05368,4.33116,-3.65467,-2.74064,3.20257,13.81682,20.43247,22.28013,0.81881,-11.16553
#1216 Sir Roger Penrose,22.44430,41.07613,-6.93734,1.54052,-1.53782,2.83352,-2.15275,-1.11673,2.72914,1.52184,-3.10218,-8.71281
#1267 Gary Taubes & Stephan Guyenet,52.48443,35.05871,159.57709,182.74659,-7.48910,-35.93539,0.20834,-2.80856,-17.32391,-6.89832,9.22315,10.32357
#1350 Nick Bostrom,45.69800,32.19160,2.64378,11.60459,55.50492,105.69031,-112.81875,8.53874,15.50061,-45.36727,66.90370,51.52581
...,...,...,...,...,...,...,...,...,...,...,...,...
#1355 Mark Normand,53.96824,-12.31879,-13.92354,3.99236,-9.93127,-4.98798,1.90641,-19.02014,5.63446,-10.81441,0.61735,-2.73523
#1329 Brian Moses,53.53588,-12.91915,-11.57577,1.56410,-9.26498,-0.52925,5.37510,-9.48295,-14.04114,20.85323,7.64136,2.95859
#1358 Sober October 3,46.28193,-13.06410,-16.48772,8.43336,-29.60208,-7.88659,2.86051,-25.98378,-5.18998,-39.05145,2.49390,-2.07148
#1304 Brendan Schaub,42.38405,-13.14461,-13.70654,7.45505,-18.60621,-1.50729,1.33867,-18.23025,-4.40345,5.45436,4.05829,3.44395


### NMF

In [36]:
# document-term matrix for NMF (the same transform call as in the LSA section)
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)

In [37]:
# Non-negative matrix factorization with one component per topic;
# random_state is fixed so the factorization is repeatable across runs
nmf_model = NMF(n_components=num_topics, random_state=42)
# document-by-component weights
doc_topic = nmf_model.fit_transform(doc_word)

In [38]:
# component-by-term weights of the NMF model, rounded to 3 decimals
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=col_names,
    columns=ps_ep_list_tokenizer.get_feature_names(),
)
topic_word

Unnamed: 0,aa,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,ab safety,...,zu,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg probably,zumba,zurich,zz
component_1,0.025,0.029,0.015,0.016,0.071,0.0,0.0,0.0,0.038,0.009,...,0.018,0.033,0.019,0.007,0.0,0.0,0.003,0.007,0.02,0.032
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.015,0.0,0.003,0.006,0.013,0.0,0.0,0.0,0.01,0.009,...,0.029,0.012,0.006,0.171,0.005,0.022,0.0,0.0,0.003,0.0
component_6,0.0,0.012,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.046,0.0,0.013,0.0,0.0,0.0,0.0
component_7,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008,0.001,0.0,0.0,0.0,0.0,0.0
component_8,0.005,0.0,0.0,0.0,0.093,0.032,0.0,0.0,0.022,0.0,...,0.0,0.003,0.0,0.118,0.016,0.0,0.019,0.013,0.0,0.0
component_9,0.004,0.0,0.007,0.0,0.0,0.0,0.114,0.167,0.045,0.0,...,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0
component_10,0.063,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.005,0.0,...,0.004,0.001,0.0,0.0,0.003,0.0,0.0,0.001,0.001,0.001


#### Results

In [39]:
# top 10 terms (by absolute weight) of each NMF component
display_topics(nmf_model, ps_ep_list_tokenizer.get_feature_names(), 10)


Component  1
sober, bus, coke, ha, squarespace, workout, tesla, hip, wine, songs

Component  2
quantum, mechanics, quantum mechanics, function, wave function, physics, electron, worlds, probability, spinning

Component  3
cannabis, marijuana, cbd, medicine, schizophrenia, increase, thc, psychosis, depression, addiction

Component  4
calories, obesity, insulin, cells, carbohydrate, gary, intake, hypothesis, calorie, body fat

Component  5
policy, accounts, banned, context, congress, campaign, trans, conservative, bernie, ideology

Component  6
simulation, artificial, artificial intelligence, ai, simulations, biological, robot, simulated, technological, innovation

Component  7
vaccine, diseases, autism, vaccines, poverty, measles, tropical, lyme, lyme disease, malaria

Component  8
deer, bears, wildlife, elk, hunters, hunter, bow, lion, grizzly, wolves

Component  9
gravity, bob, craft, ufo, radar, object, climate, propulsion, mushroom, airplane

Component  10
aliens, nazis, cia, alex,

In [40]:
# episode-by-component weights from NMF, rounded to 5 decimals
H = pd.DataFrame(data=doc_topic.round(5), index=ps_ep_names, columns=col_names)
# the 20 episodes weighted most heavily on component_7, shown next to component_10
compared_components = ['component_7', 'component_10']
H.sort_values(by='component_7', ascending=False).head(20)[compared_components]

Unnamed: 0,component_7,component_10
#1261 Peter Hotez,14.71153,0.0
#1234 David Sinclair,1.02292,0.0
#1349 David Sinclair,0.97329,0.0
#1201 William von Hippel,0.78755,0.0
#1294 Jamie Metzl,0.72499,0.24066
#1324 Ian Edwards,0.65191,0.0
#1259 David Wallace-Wells,0.5682,0.0
#1282 Adam Conover,0.52467,0.0
#1300 Michael Malice,0.46188,1.08525
#1274 Nicholas Christakis,0.45532,0.28423


In [42]:
# all component weights for a single episode
H.loc['#1366 Richard Dawkins']

component_1     0.36857
component_2     0.06378
component_3     0.01531
component_4     0.24675
component_5     0.34751
component_6     0.50503
component_7     0.03113
component_8     0.06563
component_9     0.26529
component_10    0.20135
component_11    0.00000
component_12    0.00070
Name: #1366 Richard Dawkins, dtype: float64

In [43]:
# episode labels available for lookup in H
H.index

Index(['#1383 Malcolm Gladwell', '#1382 RZA & Donnell Rawlings',
       '#1378 Greg Fitzsimmons', '#1375 Edward Norton', '#1373 Kyle Kulinski',
       '#1371 Andrew Santino', '#1370 Brian Grazer', '#1369 Christopher Ryan',
       '#1368 Edward Snowden', '#1366 Richard Dawkins',
       ...
       '#1186 Marques Brownlee', '#1185 Kelly Slater', '#46 Ari Shaffir',
       '#1184 Roseanne Barr', '#1183 Andrew Santino', '#1182 Nick Kroll',
       '#44 John Kavanagh & George Lockhart', '#1181 John Dudley',
       '#1180 Everlast', '#1179 Nikki Glaser'],
      dtype='object', length=200)