In [1]:
import sys
print("Python Version:", sys.version)

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [2]:
#######################
# standard code block #
#######################

%config InlineBackend.figure_format = 'svg'

# auto reload imports that change
%load_ext autoreload
# only set to auto reload for marked imports
%autoreload 1

In [3]:
import pandas as pd

## Pull Data

### Connect to Database

In [4]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


config = {
  'host': cred.mongo_host,
  'username': cred.mongo_user,
  'password': cred.mongo_pass,
  'authSource': cred.mongo_auth_db
}

# get a mongo client
client = MongoClient(**config)

# use the raw database
jre_raw = client.jre_raw
podcasts_raw = jre_raw.podcasts

# use the clean database
jre_clean = client.jre_clean
podcasts_clean = jre_clean.podcasts

### Get Podcasts from Podscribe

In [5]:
# query
ps_ep_mongo = list(
    podcasts_raw
    .find({'source':'podscribe'},{'_id':0, 'text':1, 'name':1, 'desc':1})
    .limit(200)
)

In [6]:
# pull out text and names
ps_ep_list = [x['text'] for x in ps_ep_mongo]
ps_ep_names = [x['name'] for x in ps_ep_mongo]
ps_ep_descs = [x['desc'] for x in ps_ep_mongo]

In [7]:
ps_ep_df = pd.DataFrame(ps_ep_descs, index=ps_ep_names, columns = ['Description'])

In [8]:
ps_ep_df

Unnamed: 0,Description
#1383 - Malcolm Gladwell,"Malcolm Gladwell is a journalist, author, and ..."
#1382 - RZA & Donnell Rawlings,"RZA is a rapper, record producer, musician, ac..."
#1378 - Greg Fitzsimmons,Greg Fitzsimmons is a writer and stand-up come...
#1375 - Edward Norton,"Edward Norton is an actor, writer, producer, d..."
#1373 - Kyle Kulinski,"Kyle Kulinski is a political activist, progres..."
...,...
#1182 - Nick Kroll,"Nick Kroll is an actor, comedian, writer, and ..."
JRE MMA Show #44 with John Kavanagh & George Lockhart,Joe is joined by MMA coach & Brazilian jiu-jit...
#1181 - John Dudley,John Dudley is a pro archer and host of “Nock ...
#1180 - Everlast,Everlast is a Grammy Award-winning American ra...


## Clean Text

In [9]:
# setup clean
lemm = False
clean = False

### Clean

In [10]:
from gensim.utils import simple_preprocess

In [11]:
if clean:
    ps_ep_list = [' '.join(simple_preprocess(x)) for x in ps_ep_list]

### Stop Words

In [12]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity
profanity = (
    open('stop_words/profanity.txt', newline='\n')
    .read()
    .splitlines()
)

new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'right', 'think', 'know',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say'
]
new_stop_words += common_jre_words

# append to english stop words
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [13]:
from nltk.stem import WordNetLemmatizer

if lemm:
    lemmatizer = WordNetLemmatizer()

    for i, ep in enumerate(ps_ep_list):
        ps_ep_list[i] = ' '.join([lemmatizer.lemmatize(x) for x in ep.split(' ')])

    jre_stop_words = [lemmatizer.lemmatize(x) for x in jre_stop_words]

### SpaCy

In [14]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [15]:
from IPython.display import clear_output

ps_spacy = en_core_web_sm.load()

ps_spacy_list = []
tot = len(ps_ep_list)

for i, doc in enumerate(ps_ep_list):
    clear_output()
    print((i/tot)*100)
    ps_spacy_list.append(ps_spacy(doc))

In [None]:
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)    
tfidf.fit_transform(tokenized_list_of_sentences)

In [58]:
for ent in ps_spacy_list[0].ents:
    print(ent.text, ent.label_)

as little as one CARDINAL
four or five day DATE
today DATE
app ORG
Joe PERSON
Rogan PERSON
one CARDINAL
ten dollars MONEY
app ORG
ten dollars MONEY
Justin Ren's PERSON
today DATE
one CARDINAL
CBD ORG
CBD MD PRODUCT
CBD p.m. TIME
500 CARDINAL
USA GPE
valerian NORP
CBD p.m. WORK_OF_ART
CBD MDS PRODUCT
CBO ORG
CBD PRODUCT
Jerry listeners 20 percent PERCENT
Rogan Ro Gan PERSON
CBD MD PRODUCT
Rogan PERSON
20% PERCENT
greens NORP
greens NORP
one CARDINAL
75 CARDINAL
Whole Food ORG
zero CARDINAL
Whole Foods ORG
Leti PERSON
20 CARDINAL
79 CARDINAL
greens.com ORG
20 CARDINAL
79 MONEY
first ORDINAL
UK GPE
Europe LOC
today DATE
one CARDINAL
Malcolm Gladwell PERSON
Joe Rogan PERSON
Joe Rogan PERSON
Malcolm PERSON
Joe PERSON
first ORDINAL
second ORDINAL
two CARDINAL
Amanda Knox PERSON
Bernie Madoff PERSON
Larry Nasser PERSON
Michigan State ORG
Jerry Sandusky PERSON
Penn State ORG
Sandra Bland PERSON
Texas GPE
today DATE
Awkward NORP
one CARDINAL
half CARDINAL
Wind Sprints PERSON
mmm PERSON
Sandra B

Jamie PERSON
Jamie PERSON
Chris PERSON
one CARDINAL
Larry David PERSON
Oddball PERSON
Prius FAC
500 million dollars MONEY
New York GPE
Jerry PERSON
Comics ORG
Comics ORG
a hundred percent PERCENT
Staple Center of people ORG
Law PERSON
Law PERSON
47 DATE
Savage NORP
Malcolm PERSON
Dick Wolf PERSON
Law PERSON
Eastern NORP
Eastern NORP
Eastern ORG
four CARDINAL
Eastern ORG
Law PERSON
Eastern ORG
Serpico ORG
Eastern NORP
Hollywood GPE
Law PERSON
Law PERSON
Law & Order ORG
Justice ORG
all day DATE
John Grisham PERSON
Southern NORP
John Grisham PERSON
John Anderson PERSON
Law PERSON
one CARDINAL
four CARDINAL
Brits NORP
British NORP
Sherlock Holmes PERSON
Charles PERSON
Westerns PERSON
Jack Reacher PERSON
Lee Child PERSON
one CARDINAL
Reacher PERSON
Army ORG
Montana GPE
a 21st century DATE
50 CARDINAL
the Hudson News FAC
more than one CARDINAL
Lee Childs PERSON
years DATE
Breakneck PERSON
one CARDINAL
Stephen Hunter PERSON
Bob Lee Swagger PERSON
Delights PERSON
one CARDINAL
Mark Wahlberg PER

In [22]:
ps_spacy_list[0].

spacy.tokens.doc.Doc

In [None]:
for ent in ps_spacy_list[0].ents:
    print(ent.text, ent.label_)

In [55]:
selected_tokens = [
    y for y in ps_spacy_list[0] if
    (
        not y.is_stop and
        not y.is_punct and
        not y.is_space
    )
]

In [57]:
selected_tokens

[Hello,
 friends,
 episode,
 podcast,
 brought,
 motherfucking,
 cash,
 app,
 cash,
 app,
 simplest,
 way,
 send,
 save,
 money,
 simplest,
 way,
 try,
 grow,
 money,
 introducing,
 cash,
 app,
 investing,
 cash,
 app,
 investing,
 unlike,
 bullshit,
 ass,
 Investments,
 tools,
 let,
 buy,
 entire,
 shares,
 stock,
 cash,
 app,
 lets,
 instantly,
 invest,
 little,
 want,
 way,
 favorite,
 company,
 stock,
 little,
 expensive,
 piece,
 little,
 dollar,
 whoo,
 cash,
 app,
 directly,
 connected,
 bank,
 account,
 day,
 waiting,
 periods,
 inbound,
 transfers,
 start,
 investing,
 today,
 brokerage,
 services,
 provided,
 cash,
 app,
 investing,
 subsidiary,
 square,
 member,
 s,
 IPC,
 course,
 download,
 cash,
 app,
 enter,
 referral,
 code,
 Joe,
 Rogan,
 word,
 receive,
 dollars,
 cash,
 app,
 send,
 dollars,
 Justin,
 Ren,
 fight,
 Forgotten,
 charity,
 download,
 cash,
 app,
 app,
 store,
 Google,
 Play,
 store,
 today,
 brought,
 CB,
 D,
 MD,
 CBD,
 big,
 life,
 stuff,
 time,
 thin

## Tokenization

In [225]:
# Setup Tokenization
tf_idf = False

### TF-IDF

In [226]:
from sklearn.feature_extraction.text import TfidfVectorizer

if tf_idf:
    # Create TF-IDF vectorizer
    ps_ep_list_tfidf = TfidfVectorizer(
        ngram_range=(1,3), binary=True, stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b", max_df=0.4, min_df=0.01
    )

    # fit
    _ = ps_ep_list_tfidf.fit(ps_ep_list)

### Count Vectorize

In [227]:
from sklearn.feature_extraction.text import CountVectorizer

if not tf_idf:
    # Create a CountVectorizer for parsing/counting words
    ps_ep_list_cvec = CountVectorizer(
        ngram_range=(1, 3), stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b",  max_df=0.4, min_df=0.01
    )

    # fit
    _ = ps_ep_list_cvec.fit(ps_ep_list)

## Models

### Model Setup

In [247]:
# Tokenizer
if tf_idf:
    ps_ep_list_tokenizer = ps_ep_list_tfidf
else:
    ps_ep_list_tokenizer = ps_ep_list_cvec

# number of topics
num_topics = 10

# column names
col_names = ['component_'+str(x) for x in range(1,num_topics+1)]

# run LDA?
run_lda = False

### LDA

In [248]:
# gensim
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [249]:
# Create the term-document matrix
# Transpose it so the terms are the rows
if run_lda:
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).transpose()

In [250]:
# Convert sparse matrix of counts to a gensim corpus
if run_lda:
    corpus = matutils.Sparse2Corpus(doc_word)

In [251]:
if run_lda:    
    id2word = dict((v, k) for k, v in ps_ep_list_tokenizer.vocabulary_.items())

In [252]:
# Create lda model (equivalent to "fit" in sklearn)
if run_lda:    
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=50)

In [253]:
if run_lda:
    lda.print_topics()

### LSA

In [254]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [255]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
doc_word.shape

(200, 242772)

In [256]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(num_topics)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.00694156, 0.02429202, 0.02302283, 0.02129645, 0.02012436,
       0.01865214, 0.01839021, 0.01437416, 0.01249472, 0.01159063])

In [257]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,...,zu,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg probably,zumba,zurich,zz
component_1,0.002,0.0,0.001,0.0,0.0,0.003,0.0,0.001,0.001,0.006,...,0.001,0.001,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.001
component_2,-0.001,-0.0,-0.0,-0.0,-0.0,-0.001,-0.0,-0.0,0.0,-0.001,...,-0.0,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,-0.0,-0.0,-0.0,-0.0,-0.0,-0.002,-0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,0.001,-0.0,0.0,-0.0,-0.0,-0.0,-0.0
component_4,-0.0,-0.0,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.001,-0.004,...,0.0,-0.001,-0.0,0.001,-0.0,0.0,-0.0,-0.0,-0.0,-0.001
component_5,-0.001,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0,0.004,...,-0.001,0.0,-0.0,-0.005,0.0,-0.001,0.0,0.0,0.0,0.001
component_6,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.001,0.0,...,-0.0,-0.0,-0.0,-0.002,0.0,-0.0,0.0,0.0,-0.0,0.0
component_7,-0.0,0.0,0.001,-0.0,-0.0,-0.0,0.0,0.001,0.001,-0.002,...,-0.001,-0.0,-0.0,-0.002,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
component_8,-0.0,-0.0,-0.0,0.0,-0.0,0.001,0.0,0.001,0.001,-0.001,...,-0.001,-0.001,-0.0,-0.001,-0.0,-0.0,0.0,0.0,-0.0,-0.0
component_9,0.003,0.0,-0.001,0.0,-0.0,-0.002,-0.0,0.003,0.004,-0.002,...,-0.0,-0.0,0.0,-0.003,-0.0,0.0,-0.0,-0.0,-0.0,0.0
component_10,0.001,0.0,0.002,0.001,0.0,-0.002,-0.001,-0.0,-0.001,-0.001,...,0.001,0.0,-0.0,-0.008,-0.0,-0.001,-0.001,0.0,0.001,0.001


In [258]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

#### Results

In [259]:
display_topics(lsa, ps_ep_list_tokenizer.get_feature_names(), 5)


Topic  0
robot, policy, marijuana, deer, calorie

Topic  1
calorie, obesity, carbohydrate, insulin, hypothesis

Topic  2
quantum, mechanic, quantum mechanic, wave function, electron

Topic  3
cannabis, marijuana, vaccine, medicine, cbd

Topic  4
cannabis, quantum, mechanic, marijuana, quantum mechanic

Topic  5
vaccine, autism, poverty, measles, tropical

Topic  6
simulation, cannabis, artificial, robot, artificial intelligence

Topic  7
cartel, mexico, border, mexican, wildlife

Topic  8
nazi, cia, german, ufo, conspiracy

Topic  9
simulation, nazi, mexico, alex, tweet


In [None]:
ps_ep

In [267]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
Vt.sort_values('component_2', ascending=False)

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10
#1267 - Gary Taubes & Stephan Guyenet,0.00000,16.54237,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
JRE MMA Show #44 with John Kavanagh & George Lockhart,0.84098,1.23212,0.00000,0.00000,0.03683,0.00000,0.00402,0.00000,0.00000,0.05369
#1201 - William von Hippel,0.58391,1.02074,0.00000,0.14098,0.00000,0.41814,0.00000,0.04284,0.00000,3.63482
#1234 - David Sinclair,0.68544,0.96811,0.17761,0.50642,0.00000,0.59684,0.00000,0.00000,0.00000,2.11543
#1235 - Ben Greenfield,1.10226,0.89182,0.00000,1.07886,0.00000,0.22480,0.00000,0.06527,0.00000,1.22359
...,...,...,...,...,...,...,...,...,...,...
#1256 - David Lee Roth,1.30209,0.00000,0.00000,0.00000,0.13232,0.00000,0.00000,0.02334,0.27815,0.31132
"#1258 - Jack Dorsey, Vijaya Gadde & Tim Pool",0.00000,0.00000,0.00000,0.00000,8.43639,0.00000,0.00000,0.00000,0.00000,0.00000
#1259 - David Wallace-Wells,0.00000,0.00000,0.04891,0.00000,0.87703,0.57661,0.00000,0.19594,0.00000,2.92137
Swapcast - Podcast On A Plane with John Dudley,0.85828,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.07054,0.22258,0.34653


### NMF

In [261]:
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)

In [262]:
nmf_model = NMF(num_topics)
doc_topic = nmf_model.fit_transform(doc_word)

In [263]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = col_names,
             columns = ps_ep_list_tokenizer.get_feature_names())
topic_word

Unnamed: 0,aa,aa meeting,aaa,aah,aaliyah,aaron,aaron lewis,aav,aavs,ab,...,zu,zubi,zucchini,zuckerberg,zuckerberg did,zuckerberg facebook,zuckerberg probably,zumba,zurich,zz
component_1,0.027,0.01,0.022,0.015,0.014,0.107,0.007,0.0,0.0,0.191,...,0.013,0.028,0.013,0.054,0.009,0.0,0.012,0.013,0.016,0.024
component_2,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.016,0.0,0.0,0.003,0.005,0.012,0.0,0.0,0.0,0.031,...,0.027,0.01,0.005,0.136,0.005,0.016,0.0,0.0,0.003,0.0
component_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,...,0.001,0.0,0.0,0.006,0.002,0.0,0.0,0.0,0.0,0.0
component_7,0.002,0.0,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.004,0.0,0.0,0.0,0.0,0.0,0.0
component_8,0.0,0.0,0.0,0.003,0.0,0.007,0.004,0.002,0.002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.0,0.0
component_9,0.056,0.0,0.0,0.002,0.0,0.0,0.0,0.026,0.036,0.01,...,0.006,0.001,0.0,0.0,0.002,0.0,0.0,0.001,0.0,0.002
component_10,0.005,0.0,0.0,0.0,0.0,0.0,0.006,0.018,0.029,0.022,...,0.0,0.004,0.004,0.072,0.0,0.017,0.0,0.0,0.0,0.0


#### Results

In [264]:
display_topics(nmf_model, ps_ep_list_tokenizer.get_feature_names(), 10)


Topic  0
sober, deer, shark, elk, coke, pig, florida, hip, squarespace, cigarette

Topic  1
calorie, obesity, carbohydrate, insulin, hypothesis, gary, intake, fat cell, experiment, body fat

Topic  2
quantum, mechanic, quantum mechanic, wave function, electron, physic, equation, probability, particle, atom

Topic  3
cannabis, marijuana, cbd, medicine, schizophrenia, thc, psychosis, depression, addiction, suicide

Topic  4
policy, tweet, conservative, campaign, election, republican, journalist, democrat, banned, ideology

Topic  5
vaccine, autism, poverty, measles, tropical, lyme, lyme disease, malaria, anti vaccine, tropical disease

Topic  6
simulation, artificial, probability, simulated, biological, artificial intelligence, robot, technological, hypothesis, innovation

Topic  7
cartel, mexico, border, mexican, wildlife, warden, marijuana, game warden, officer, county

Topic  8
nazi, german, cia, conspiracy, alex, moon, hitler, jones, ufo, yoga

Topic  9
robot, artificial, ai, tesla,

In [265]:
H = pd.DataFrame(doc_topic.round(5),
             index = ps_ep_names,
             columns = col_names)
H

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10
#1383 - Malcolm Gladwell,1.00173,0.03428,0.00000,0.39621,0.78084,0.00000,0.11830,0.41910,0.21901,0.45969
#1382 - RZA & Donnell Rawlings,2.11829,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
#1378 - Greg Fitzsimmons,1.47922,0.01162,0.13995,0.02224,1.27717,0.17705,0.07273,0.00000,0.00000,0.00000
#1375 - Edward Norton,0.65468,0.05641,0.00000,0.00000,0.01527,0.00000,0.00000,0.00000,0.16497,0.32266
#1373 - Kyle Kulinski,0.06317,0.00000,0.05268,0.00000,4.66551,0.04394,0.04762,1.16025,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...
#1182 - Nick Kroll,0.92447,0.00000,0.00000,0.17319,0.05584,0.00000,0.00000,0.00000,0.11332,0.34506
JRE MMA Show #44 with John Kavanagh & George Lockhart,0.84098,1.23212,0.00000,0.00000,0.03683,0.00000,0.00402,0.00000,0.00000,0.05369
#1181 - John Dudley,1.00596,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.19417,0.00000,0.09556
#1180 - Everlast,1.22275,0.01222,0.00000,0.00000,0.06688,0.01030,0.14291,0.00000,0.10829,0.11704
