In [1]:
import sys
print("Python Version:", sys.version)

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [2]:
#######################
# standard code block #
#######################

# render inline figures as SVG
%config InlineBackend.figure_format = 'svg'

# load the autoreload extension so edited modules are picked up without a kernel restart
%load_ext autoreload
# mode 1: only modules registered with %aimport are reloaded
%autoreload 1

In [3]:
import pandas as pd

## Pull Data

### Connect to Database

In [4]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


# connection settings, all taken from the local credentials module
config = dict(
    host=cred.mongo_host,
    username=cred.mongo_user,
    password=cred.mongo_pass,
    authSource=cred.mongo_auth_db,
)

# open the MongoDB connection
client = MongoClient(**config)

# the cleaned database and its podcast collection
jre_clean = client['jre_clean']
podcasts_clean = jre_clean['podcasts']

### Get Podcasts from Podscribe

In [5]:
# fetch every episode, projecting only the fields used below (Mongo's _id is excluded)
ps_ep_mongo = [
    episode
    for episode in podcasts_clean.find(
        filter={},
        projection={
            '_id': 0,
            'number': 1,
            'name': 1,
            'desc': 1,
            'text': 1,
            'length': 1,
            'date': 1,
            'source': 1,
        },
    )
]

In [6]:
from copy import deepcopy

# remove duplicates
# sort by name/source (descending) first: drop_duplicates keeps the first row
# of every (name, number) pair, so the sort order decides which copy survives
mongo_df = pd.DataFrame(ps_ep_mongo)
mongo_df = mongo_df.sort_values(by=['name', 'source'], ascending=False)
mongo_df = mongo_df.drop_duplicates(subset=['name', 'number'])

# back to a list of plain dicts, then release the temporary frame
ps_ep_mongo = deepcopy(mongo_df.to_dict(orient='records'))
del mongo_df

In [7]:
# split the episode records into parallel lists (all in ps_ep_mongo order)
ps_ep_text_list = [episode['text'] for episode in ps_ep_mongo]

# join the 'text' of every transcript segment into one string per episode
ps_ep_list = [
    ' '.join(segment['text'] for segment in segments)
    for segments in ps_ep_text_list
]

# display label: "#<number> <name>"
ps_ep_names = ["#{} ".format(episode['number']) + episode['name'] for episode in ps_ep_mongo]

ps_ep_descs = [episode['desc'] for episode in ps_ep_mongo]
ps_ep_dates = [episode['date'] for episode in ps_ep_mongo]
ps_ep_lengths = [episode['length'] for episode in ps_ep_mongo]
ps_ep_sources = [episode['source'] for episode in ps_ep_mongo]

## Clean Text

In [8]:
# setup clean
# clean_lemm: when True, the simple_preprocess and WordNet lemmatization cells below run
clean_lemm = False
# spacy_: when True, the SpaCy cells below run
spacy_ = False

### Clean

In [9]:
from gensim.utils import simple_preprocess

In [10]:
if clean_lemm:
    # tokenize each episode with gensim's simple_preprocess and re-join the tokens
    ps_ep_list = list(
        map(lambda episode: ' '.join(simple_preprocess(episode)), ps_ep_list)
    )

### Stop Words

In [11]:
from sklearn.feature_extraction import text
new_stop_words = []

# profanity
# read through a context manager so the file handle is closed as soon as the
# word list has been loaded (the bare open() left it open until garbage collection)
with open('stop_words/profanity.txt', newline='\n') as profanity_file:
    profanity = profanity_file.read().splitlines()

new_stop_words += [x.lower() for x in profanity]

# common Joe Rogan words (filler words, sponsors, etc.)
# 'right' and 'know' were listed twice; the duplicates had no effect on the
# final set and have been removed
common_jre_words = [
    'like', 'yeah', 'know', 'just', 'right', 'think',
    'people', 'going', 'really', 'got', 'thing', 'want', 'actually',
    'say', 'squarespace', 'sober', 'legalzoom', 'stamps', 'stamps.com',
    'hmm', 'mmm', 'ha', 'com', 'whoop', 'october', 'um', 'uh', 'cash', 'app'
]

new_stop_words += common_jre_words

# append to english stop words
jre_stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

### Lemmatize

In [12]:
from nltk.stem import WordNetLemmatizer

if clean_lemm:
    lemmatizer = WordNetLemmatizer()

    # lemmatize every space-separated token, updating the episode list in place
    ps_ep_list[:] = [
        ' '.join(lemmatizer.lemmatize(token) for token in episode.split(' '))
        for episode in ps_ep_list
    ]

    # apply the same lemmatization to the stop words (the result is a list)
    jre_stop_words = [lemmatizer.lemmatize(word) for word in jre_stop_words]

### SpaCy

In [13]:
if spacy_:
    import spacy
    from spacy import displacy
    from collections import Counter
    import en_core_web_sm

In [14]:
from IPython.display import clear_output

if spacy_:
    # load the small English SpaCy pipeline
    ps_spacy = en_core_web_sm.load()

    ps_spacy_list = []
    tot = len(ps_ep_list)

    # run every episode through the pipeline, showing progress as a percentage
    for i, doc in enumerate(ps_ep_list):
        # wipe the previous progress line before printing the new one
        clear_output()
        print((i/tot)*100)
        ps_spacy_list.append(ps_spacy(doc))

In [15]:
if spacy_:
    # print each named entity found in the first episode with its label
    for ent in ps_spacy_list[0].ents:
        print(ent.text, ent.label_)

In [16]:
if spacy_:
    # NOTE(review): this cell is identical to the previous one (prints the named
    # entities of the first episode again) — one of the two can likely be removed
    for ent in ps_spacy_list[0].ents:
        print(ent.text, ent.label_)

In [17]:
if spacy_:
    # tokens of the first episode that are not stop words, punctuation or whitespace
    selected_tokens = [
        token
        for token in ps_spacy_list[0]
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

## Tokenization

In [18]:
# Setup Tokenization
# tf_idf: True -> the TfidfVectorizer cell is fitted; False -> the CountVectorizer cell is fitted
tf_idf = False

### TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

if tf_idf:
    # TF-IDF vectorizer over 1- to 3-grams of lowercase alphabetic tokens
    # (two or more letters); terms outside the min_df/max_df band are dropped
    ps_ep_list_tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        binary=True,
        stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b",
        max_df=0.35,
        min_df=0.01,
    )

    # learn vocabulary and idf weights; the returned estimator is parked in `_`
    _ = ps_ep_list_tfidf.fit(ps_ep_list)

### Count Vectorize

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

if not tf_idf:
    # count vectorizer over 1- to 3-grams of lowercase alphabetic tokens
    # (two or more letters); terms outside the min_df/max_df band are dropped
    ps_ep_list_cvec = CountVectorizer(
        ngram_range=(1, 3),
        stop_words=jre_stop_words,
        token_pattern="\\b[a-z][a-z]+\\b",
        max_df=0.3,
        min_df=0.01,
    )

    # learn the vocabulary and build the episode-by-term count matrix in one step
    cvec_doc_word = ps_ep_list_cvec.fit_transform(ps_ep_list)

In [21]:
if not tf_idf:
    # term to look up; only referenced by the disabled inspection snippet below
    word = 'bc'
    # dense episode-by-term count table: rows labelled by episode name, columns by term
    cvec_df = pd.DataFrame(cvec_doc_word.toarray(), index=ps_ep_names, columns=ps_ep_list_cvec.get_feature_names())
# disabled: list the episodes containing `word`, most frequent first
#     display(
#         cvec_df[cvec_df[word] >=1].loc[:,word].sort_values(ascending=False)
#     )

## Models

### Model Setup

In [71]:
# Tokenizer: whichever vectorizer the tf_idf flag selected above
ps_ep_list_tokenizer = ps_ep_list_tfidf if tf_idf else ps_ep_list_cvec

# number of topics
num_topics = 13

# column names: component_1 ... component_<num_topics>
col_names = [f'component_{n}' for n in range(1, num_topics + 1)]

# which optional models to run (LDA / LSA)
run_lda = False
run_lsa = False

### LDA

In [72]:
# gensim
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [73]:
# Create the term-document matrix
# Transpose it so the terms are the rows
if run_lda:
    # transform() output has one row per episode; after the transpose each
    # column is one episode
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list).transpose()

In [74]:
# Convert sparse matrix of counts to a gensim corpus
if run_lda:
    # NOTE(review): Sparse2Corpus is called with its defaults, which presumably
    # treat each column as one document (hence the transpose above) — confirm
    corpus = matutils.Sparse2Corpus(doc_word)

In [75]:
if run_lda:
    # invert the vectorizer's term -> column-index vocabulary into index -> term
    id2word = {
        index: term
        for term, index in ps_ep_list_tokenizer.vocabulary_.items()
    }

In [76]:
# Fit the LDA model (the gensim counterpart of sklearn's "fit")
if run_lda:
    lda = models.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=50,
    )

In [77]:
if run_lda:
    # NOTE(review): the value returned by print_topics() is discarded, because a
    # bare expression inside an `if` block is not echoed by the notebook.
    # Presumably the topics are read from gensim's INFO log output — confirm
    lda.print_topics()

### LSA

In [78]:
import pandas as pd 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [79]:
if run_lsa:
    # episode-by-term matrix from the selected vectorizer
    doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)
    # a bare expression inside an `if` block is never echoed by the notebook,
    # so the shape has to be displayed explicitly
    display(doc_word.shape)

In [80]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
if run_lsa:
    lsa = TruncatedSVD(num_topics)
    # episode-by-component matrix
    doc_topic = lsa.fit_transform(doc_word)
    # a bare expression inside an `if` block is never echoed by the notebook,
    # so the variance ratios have to be displayed explicitly
    display(lsa.explained_variance_ratio_)

In [81]:
if run_lsa:
    # component-by-term loadings, rounded to 3 decimals
    topic_word = pd.DataFrame(lsa.components_.round(3),
                 index = col_names,
                 columns = ps_ep_list_tokenizer.get_feature_names())
    # a bare expression inside an `if` block is never echoed by the notebook,
    # so the table has to be displayed explicitly
    display(topic_word)

In [82]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the strongest terms of every component of a fitted model.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of term weights per component)
    feature_names : sequence of term labels, indexable by column position
    no_top_words : how many terms to print per component
    topic_names : optional sequence of labels; a falsy entry (or no sequence at
        all) falls back to the generic "Component <n>" heading

    Terms are ranked by absolute weight; a term with a negative weight is
    printed with a ``NOT `` prefix. Nothing is returned.
    """
    for position, weights in enumerate(model.components_):
        label = topic_names[position] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nComponent ", position + 1)

        # column indices ordered by |weight|, strongest first
        strongest = abs(weights).argsort()[::-1][:no_top_words]

        terms = []
        for term_idx in strongest:
            term = feature_names[term_idx]
            terms.append(term if weights[term_idx] >= 0 else 'NOT ' + term)
        print(", ".join(terms))

#### Results

In [83]:
if run_lsa:
    # ten strongest terms of every LSA component
    display_topics(
        lsa,
        ps_ep_list_tokenizer.get_feature_names(),
        no_top_words=10,
    )

In [84]:
if run_lsa:
    # episode-by-component scores, rounded to 5 decimals
    Vt = pd.DataFrame(doc_topic.round(5),
                 index = ps_ep_names,
                 columns = col_names)
    # a bare expression inside an `if` block is never echoed by the notebook,
    # so the sorted table has to be displayed explicitly
    display(Vt.sort_values('component_2', ascending=False))

### NMF

In [85]:
# episode-by-term matrix for NMF; this rebinds doc_word, which the LDA and LSA
# cells above also assign when they are enabled
doc_word = ps_ep_list_tokenizer.transform(ps_ep_list)

In [86]:
# factorize into num_topics components; the seed is fixed so reruns give the same result
nmf_model = NMF(n_components=num_topics, random_state=42)

# episode-by-component weights
doc_topic = nmf_model.fit_transform(doc_word)

In [87]:
# component-by-term weights, rounded to 3 decimals
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=col_names,
    columns=ps_ep_list_tokenizer.get_feature_names(),
)
topic_word

Unnamed: 0,aa,aaa,aaron,ab,abandon,abandoned,abandoning,abbott,abbreviated,abby,...,zoom share device,zoom video,zoom video communications,zoom video conferencing,zoom zoom,zoom zoom delivers,zoomed,zooms,zoos,zuckerberg
component_1,0.043,0.042,0.131,0.036,0.098,0.162,0.021,0.012,0.021,0.022,...,0.019,0.086,0.057,0.028,0.04,0.014,0.003,0.028,0.013,0.016
component_2,0.0,0.0,0.0,0.0,0.051,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.051,0.004,0.0,0.0
component_3,0.0,0.0,0.027,0.016,0.013,0.061,0.026,0.034,0.0,0.024,...,0.0,0.002,0.004,0.0,0.0,0.002,0.0,0.098,0.12,0.073
component_4,0.0,0.046,0.053,0.022,0.059,0.057,0.011,0.001,0.0,0.012,...,0.001,0.001,0.0,0.002,0.005,0.002,0.002,0.0,0.0,0.0
component_5,0.0,0.002,0.0,0.0,0.111,0.039,0.002,0.004,0.015,0.086,...,0.0,0.001,0.001,0.0,0.0,0.001,0.001,0.003,0.019,0.188
component_6,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.047,0.0,0.055,...,0.005,0.026,0.008,0.017,0.006,0.001,0.0,0.005,0.0,0.0
component_7,0.002,0.004,0.0,0.001,0.01,0.02,0.002,0.02,0.001,0.0,...,0.004,0.008,0.003,0.005,0.001,0.005,0.0,0.0,0.0,0.0
component_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.006,0.0,0.001,0.0,0.0,0.0
component_9,0.009,0.0,0.031,0.007,0.0,0.031,0.0,0.0,0.0,0.0,...,0.037,0.123,0.049,0.074,0.112,0.026,0.0,0.005,0.046,0.0
component_10,0.002,0.011,0.008,0.0,0.0,0.0,0.003,0.0,0.0,0.036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039


#### Results

In [90]:
# ten strongest terms of every NMF component
display_topics(
    nmf_model,
    ps_ep_list_tokenizer.get_feature_names(),
    no_top_words=10,
)


Component  1
comics, goddamn, netflix, coke, code joe, joey, comedians, bus, cop, yoga

Component  2
saturated, saturated fat, cholesterol, increase, randomized, protein, observational, heart disease, dietary, epidemiology

Component  3
deer, hunt, wildlife, bears, elk, hunters, wolves, cwd, hunter, bow

Component  4
ancient, civilization, climate, thousand years, sphinx, egypt, modern, ice age, flood, pyramid

Component  5
platform, speech, gender, trans, violence, policy, racist, wing, accounts, content

Component  6
calories, obesity, insulin, carbohydrate, carb, ketogenic, calorie, ketogenic diet, metabolic, body fat

Component  7
cannabis, marijuana, alcohol, heroin, medicine, cbd, addiction, schizophrenia, cocaine, thc

Component  8
quantum, mechanics, quantum mechanics, wave, function, physics, electron, worlds, particles, probability

Component  9
cells, diseases, stem, aging, plants, bacteria, vaccine, immune, vitamin, shown

Component  10
puerto, tax, rico, economy, puerto r

In [91]:
from copy import deepcopy

# component to inspect
nmf_comp = 'component_11'

# episode-by-component weights from the NMF fit, rounded to 5 decimals
H = pd.DataFrame(data=doc_topic.round(5), index=ps_ep_names, columns=col_names)

# column order with the selected component moved to the front
col_list = H.columns.copy(deep=True)
new_index = pd.Index([nmf_comp]).append(col_list.drop(nmf_comp))

# top five episodes for the selected component, shown with that column first
H.sort_values(by=nmf_comp, ascending=False).head(5).loc[:, new_index.values]

Unnamed: 0,component_11,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,component_12,component_13
#1350 Nick Bostrom,10.67823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#1188 Lex Fridman,6.55706,0.07844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.34034,0.0
#1211 Dr. Ben Goertzel,5.65799,0.0,0.0,0.0,0.0,0.28057,0.0,0.0,0.24052,0.03552,0.14995,0.0,0.0
#1292 Lex Fridman,3.83396,0.47927,0.00155,0.0,0.0,0.32416,0.0,0.0,0.0,0.0,0.0,1.55908,0.0
#1294 Jamie Metzl,3.48451,0.0,0.0,0.11731,0.22861,0.45389,0.0,0.03576,0.0,1.77953,0.21593,0.0,0.24345


In [92]:
# human-readable label for each NMF component, listed in component order
# (component_1 first)
rename_dict = {
    f'component_{position}': label
    for position, label in enumerate(
        [
            'Writers/Entertainers',
            'Diet1',
            'Hunting',
            'Ancient History',
            'Politics',
            'Diet2',
            'Marijuana/Drugs',
            'Physics/Math',
            'Biology',
            'Economics',
            'AI/Tech',
            'Fighting',
            'CIA/Aliens/Conspiracy',
        ],
        start=1,
    )
}

In [93]:
# relabel the component columns with their topic names
rec_df = H.rename(columns=rename_dict)

In [94]:
# attach episode date and length; these lists and the frame's rows are both in
# ps_ep_mongo order, so they line up positionally
rec_df['Date'] = ps_ep_dates
rec_df['Length'] = ps_ep_lengths

In [95]:
import datetime

In [96]:
def len_to_sec(time):
    """Convert a colon-separated duration string to whole seconds.

    Accepts "H:M:S", and additionally "M:S" and "S" (missing leading fields
    count as zero), so episodes shorter than an hour no longer raise
    IndexError. For inputs with three or more fields, only the first three
    are used, exactly as before.

    Parameters
    ----------
    time : str
        Duration such as "2:45:10", "45:10" or "10".

    Returns
    -------
    int
        Total number of seconds.

    Raises
    ------
    ValueError
        If a field is not an integer (including an empty string).
    """
    fields = time.split(':')
    if len(fields) < 3:
        # left-pad with zeros so "M:S" and "S" line up with hours:minutes:seconds
        fields = ['0'] * (3 - len(fields)) + fields
    hours, minutes, seconds = (int(field) for field in fields[:3])
    return hours*3600 + minutes*60 + seconds

In [97]:
# convert each length string to whole seconds via len_to_sec
rec_df['Length'] = rec_df['Length'].apply(len_to_sec)

In [98]:
# keep only the topic columns: Length and Date are removed again before scaling
rec_df = rec_df.drop(columns=['Length', 'Date'])

In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import linear_kernel

In [100]:
# standardize every topic column (zero mean, unit variance)
count_scaled = StandardScaler().fit_transform(rec_df.to_numpy())

# linear_kernel is a plain dot product and only equals cosine similarity when the
# rows are L2-normalized, which StandardScaler output is not; use the real cosine
# similarity (imported earlier in the notebook) so the values match the name
cosine_sim = cosine_similarity(count_scaled)

In [101]:
class recommender():
    """Rank the episodes that share a dominant topic with a given episode.

    Parameters
    ----------
    cosine_sim : square matrix indexable by row position (e.g. a numpy array)
        Row ``i`` holds the similarity of episode ``i`` to every episode, in the
        same row order as the topic table.
    topic_df : pandas.DataFrame, optional
        Episode-by-topic weights indexed by episode title. When omitted, the
        notebook-level ``rec_df`` is looked up each time ``recommend`` is
        called, so ``recommender(cosine_sim)`` keeps working as before.
    """
    def __init__(self, cosine_sim, topic_df=None):
        self.cosine_sim = cosine_sim
        self.topic_df = topic_df

    def recommend(self, title):
        """Return similarity scores of the episodes most related to ``title``.

        Only episodes whose strongest topic matches that of ``title`` are kept,
        and ``title`` itself is excluded. The result is a Series indexed by
        episode title, sorted from most to least similar.

        Raises
        ------
        KeyError
            If ``title`` is not in the topic table's index.
        """
        topics = rec_df if self.topic_df is None else self.topic_df

        # row position of the requested episode (first match if titles repeat)
        positions = [pos for pos, name in enumerate(topics.index) if name == title]
        if not positions:
            raise KeyError(title)
        title_pos = positions[0]

        # strongest topic (column label) of every episode
        dominant_topic = topics.idxmax(axis='columns')

        # keep episodes with the same strongest topic, minus the episode itself
        keep = (dominant_topic == dominant_topic.iloc[title_pos]).to_numpy(copy=True)
        keep[title_pos] = False

        # similarity of every episode to the requested one, labelled by title
        scores = pd.Series(self.cosine_sim[title_pos], index=topics.index)
        score_series = scores[keep].sort_values(ascending = False)
        return score_series

In [102]:
# ad-hoc check: similarity of every episode to the episode at row position 6,
# highest first (the resulting index is the row position, not the episode name)
score_series = pd.Series(cosine_sim[6]).sort_values(ascending = False)