In [4]:
import json, re, nltk

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.wordnet import WordNetLemmatizer

stopwords = set(nltk.corpus.stopwords.words('english'))

In [5]:
raw_dat = pd.read_csv('../script_output/episode_transcript_data_w_metadata.csv')
raw_dat.head()

Unnamed: 0.1,index,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,Unnamed: 0,episode,transcript
0,0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,34866,000A9sRBYdVh66csG2qEdj,Hello. Hello. Hello everyone. This is Katie an...
1,1,spotify:show:15iWCbU7QoO23EndPEO6aN,Morning Cup Of Murder,Ever wonder what murder took place on today in...,Morning Cup Of Murder,['en'],https://anchor.fm/s/b07181c/podcast/rss,spotify:episode:000HP8n3hNIfglT2wSI2cA,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma...",6.019383,show_15iWCbU7QoO23EndPEO6aN,000HP8n3hNIfglT2wSI2cA,14162,000HP8n3hNIfglT2wSI2cA,There were two more murders 15 miles away arri...
2,2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,['en'],https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,93168,001UfOruzkA3Bn1SPjcdfa,Welcome to inside the 18. Today's episode is t...
3,3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,['en-US'],https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.1892,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,69703,001i89SvIQgDuuyC53hfBm,Hey cheese fans before we get started. I wante...
4,4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,['en'],https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.78205,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,104381,0025RWNwe2lnp6HcnfzwzG,"Sorry to interrupt the show, but I do have to ..."


In [25]:
raw_dat = raw_dat.dropna()

In [26]:
def tokenize_titles(title):
    tokens = nltk.word_tokenize(title)
    lmtzr = WordNetLemmatizer()
    filtered_tokens = []
    
    for token in tokens:
        token = token.replace("'s", " ").replace("n’t", " not").replace("’ve", " have")
        token = re.sub(r'[^a-zA-Z0-9 ]', '', token)
        if token not in stopwords:
            filtered_tokens.append(token.lower())
    
    lemmas = [lmtzr.lemmatize(t,'v') for t in filtered_tokens]

    return lemmas

In [28]:
raw_dat['show_description_tokens'] = raw_dat.show_description.apply(tokenize_titles)
raw_dat['show_description_cleaned'] = raw_dat.show_description_tokens.str.join(' ')

In [34]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                #tokenizer=tokenize_titles,
                                max_features=500,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.8, 
                                min_df = 20,
                                ngram_range=(2,4))
dtm_tf = tf_vectorizer.fit_transform(raw_dat.show_description_cleaned)
print(dtm_tf.shape)

(105153, 500)


In [44]:
lda_tf = LatentDirichletAllocation(n_components=5, 
                                   max_iter=100,
                                   learning_method='online', 
                                   random_state = 0, 
                                   n_jobs=-2)
lda_tf.fit(dtm_tf)

LatentDirichletAllocation(learning_method='online', n_components=5, n_jobs=-2,
                          random_state=0)

In [51]:
n_top_words = 30
tf_feature_names = tf_vectorizer.get_feature_names()

topics = dict()
for topic_idx, topic in enumerate(lda_tf.components_):
    topics[topic_idx] = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic #%d:" % topic_idx)
    print(" | ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print("\n")

Topic #0:
new episodes | true crime | mental health | social media | parcast network | cutler media | talk things | audio experience | youtube channel | gary vaynerchuk | make sure | podcast talk | latest news | share stories | serial killers | help people | real life | media production | current events | new podcast | network cutler media production | network cutler media | parcast network cutler | parcast network cutler media | network cutler | cutler media production | podcast discuss | follow instagram | new york | episodes release


Topic #1:
support podcast | support podcast https | podcast https | personal development | audioblog blogcast | daily audioblog | think optimal | daily audioblog blogcast | podcast support | podcast feature | podcast support podcast | podcast support podcast https | best blog | health fitness | single day | tip trick | blogcast support podcast | blogcast support podcast https | audioblog blogcast support | audioblog blogcast support podcast | daily aud

In [46]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
