# Topic Modeling Podcasts

The plan is to discover the main topics that "This American Life" podcast is covering through their entire archive.


In [1]:
# import podcast vec -- vector representation of each podcast in wordEmbed space
import pickle


# Uploads the the podcasts as a vector
with open('Flask/podcast_vec.pkl', 'rb') as handle:
    podcast_vec = pickle.load(handle)
    
# Opens the podcast title to ID reference
with open('Flask/title_ids.pkl', 'rb') as handle:
    PODCAST_IDS = pickle.load(handle)

    
# Opens the pretrained Word Embedding
with open('Flask/w2v.pkl', 'rb') as handle:
    w2v = pickle.load(handle)

## Cleaning and Preprocessing the Corpus

In [111]:
import re
import nltk
SPECIAL_CHARS = '[^A-Za-z0-9 ]+'

def preprocess(text):
    tokenized = [word for sent in [re.sub(SPECIAL_CHARS, '', element).split(' ') for
                                  element in nltk.sent_tokenize(text)] for word in sent]
    
    lowered = [word.lower() for word in tokenized]
    return ' '.join(lowered)

In [113]:
import pandas as pd

PODCASTS = 'uncleaned_podcasts.csv'
df_podcasts = pd.read_csv(PODCASTS)
df_podcasts.head()

df_podcasts.dropna(inplace=True)

podcast_texts = [preprocess(text) for text in df_podcasts['Body'].tolist()]

In [117]:
processed_df = pd.DataFrame.from_dict( {
                                            'Title': df_podcasts['Title'],
                                            'Body': podcast_texts
                                        })
processed_df.head()

Unnamed: 0,Title,Body
0,New Beginnings,joe franklin im ready its ira glass here oh yo...
1,Small Scale Sin,ok three boys aged 13 15 and 16 all three chos...
2,Poultry Slam 1995,in danielles house ever since she was a girl w...
3,Vacations,the thing about hawaii is that before you go a...
4,Anger and Forgiveness,hi its ira glass recording this in 2006 and t...


## Using TF-IDF and NMF to Topic Model

In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=0.2)
doc_spars = vectorizer.fit_transform(podcast_texts)
terms = vectorizer.get_feature_names()

In [178]:
doc_term = pd.DataFrame(doc_spars.toarray(), columns=terms)
doc_term.head()

Unnamed: 0,100,1000,11,12,13,14,15,16,17,18,...,written,wrote,yard,yelling,yellow,yesterday,york,youd,youll,younger
0,0.0,0.078835,0.0,0.0,0.0,0.0,0.0,0.017998,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025126,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.014265,0.0,0.019628,0.013264,0.0,0.026383,...,0.021947,0.017517,0.0,0.0,0.0,0.016935,0.037036,0.009667,0.0,0.0
2,0.0,0.066487,0.035005,0.009552,0.013059,0.012682,0.017968,0.048573,0.0,0.024153,...,0.040184,0.0,0.01632,0.0,0.0,0.015503,0.016953,0.0177,0.018271,0.038641
3,0.0,0.0,0.0,0.011312,0.0,0.0,0.01064,0.0,0.0,0.014302,...,0.0,0.009496,0.0,0.0,0.0,0.0,0.010039,0.010481,0.0,0.0
4,0.029256,0.02008,0.0,0.014424,0.078884,0.0,0.013567,0.018337,0.0,0.018236,...,0.0,0.024216,0.0,0.0,0.0,0.0,0.0256,0.026729,0.0,0.0


In [182]:
from sklearn.decomposition import NMF
comps = 9
nmf = NMF(n_components=comps)
doc_topic_arr = nmf.fit_transform(doc_spars)

index_ls = [f"component_{x+1}" for x in range(comps)]
topic_word = pd.DataFrame(nmf.components_.round(3),
             index = index_ls,
             columns = terms)
topic_word

Unnamed: 0,100,1000,11,12,13,14,15,16,17,18,...,written,wrote,yard,yelling,yellow,yesterday,york,youd,youll,younger
component_1,0.075,0.054,0.044,0.068,0.061,0.047,0.098,0.051,0.023,0.047,...,0.077,0.105,0.062,0.051,0.053,0.029,0.236,0.111,0.115,0.041
component_2,0.095,0.059,0.036,0.049,0.025,0.024,0.059,0.024,0.018,0.037,...,0.055,0.074,0.004,0.013,0.004,0.008,0.107,0.046,0.046,0.0
component_3,0.005,0.009,0.016,0.011,0.014,0.01,0.032,0.008,0.0,0.014,...,0.0,0.006,0.0,0.002,0.004,0.019,0.021,0.0,0.014,0.02
component_4,0.029,0.0,0.043,0.063,0.033,0.031,0.029,0.038,0.035,0.035,...,0.057,0.1,0.027,0.025,0.045,0.029,0.03,0.049,0.036,0.081
component_5,0.021,0.0,0.014,0.022,0.016,0.022,0.024,0.032,0.037,0.02,...,0.013,0.05,0.0,0.018,0.0,0.005,0.091,0.0,0.017,0.027
component_6,0.014,0.02,0.025,0.024,0.018,0.033,0.031,0.008,0.036,0.021,...,0.005,0.033,0.042,0.027,0.009,0.025,0.012,0.017,0.015,0.008
component_7,0.023,0.012,0.028,0.024,0.011,0.008,0.017,0.014,0.028,0.008,...,0.035,0.055,0.01,0.01,0.004,0.016,0.016,0.016,0.015,0.0
component_8,0.012,0.02,0.031,0.014,0.0,0.008,0.02,0.016,0.0,0.008,...,0.031,0.012,0.04,0.0,0.029,0.002,0.018,0.017,0.025,0.023
component_9,0.0,0.003,0.01,0.005,0.008,0.006,0.0,0.011,0.0,0.0,...,0.014,0.0,0.005,0.002,0.002,0.009,0.0,0.019,0.02,0.005


In [183]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print(f"\nTopic {ix+1}: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [191]:
display_topics(nmf, terms, 20)


Topic  0
music, play, girl, song, game, david, street, dog, singing, york, book, women, playing, water, movie, eyes, sex, city, laughter, stage

Topic  1
president, government, company, vote, campaign, state, bank, party, political, office, business, court, pay, mike, law, federal, million, politics, jobs, states

Topic  2
speaking, spanish, camp, government, ice, dad, david, united, families, workers, mom, states, english, village, son, lots, hello, language, women, video

Topic  3
dad, mom, father, mother, parents, children, brother, child, fathers, sister, baby, kid, son, dads, mothers, hospital, died, daughter, letter, marriage

Topic  4
students, schools, black, white, teachers, teacher, class, church, college, student, parents, education, race, south, state, kid, district, group, children, neighborhood

Topic  5
police, crime, cops, officers, officer, prison, court, gun, jail, law, department, trial, evidence, drug, shot, attorney, guns, cop, arrested, city

Topic  6
war, milita

Here we're just adding labels

In [192]:
topics = ["Music and Media", "Politics", "Immigration", "Family", "Schooling",
          "Crime&Policing", "War", "Food", "Religion"]


doc_topic = pd.DataFrame(doc_topic_arr.round(5))

## KMeans Cluster to check for significance 

In [195]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

n_groups = 11

podcast_titles = df_podcasts['Title']


#Scale
scaled_doc_top = MinMaxScaler().fit_transform(doc_topic)


# KMeans km
km = KMeans(n_clusters=n_groups, random_state=100)

km.fit(scaled_doc_top)


#clustering actual track names using km.labels_
cluster = defaultdict(list)

for idx, group in enumerate(km.labels_):
    try:
        cluster[group].append(podcast_titles[idx])
    except KeyError:
        pass


#Description of each Cluster using km.cluster_centers_
clust_description = {}

for idx, spread in enumerate(km.cluster_centers_):
    total = sum(spread)
    
    description = ''
    for i, part in enumerate(spread):
        if part:
            description += f"'{topics[i]}' {(100*(part/total)).round(2)}% - "

    clust_description[idx] = description
    
    
# Printing the tracks under each description
for idx in clust_description:
    print(clust_description[idx])
    print(cluster[idx])
    print('\n')

'Music and Media' 3.77% - 'Politics' 1.29% - 'Immigration' 0.8% - 'Family' 1.47% - 'Schooling' 0.8% - 'Crime&Policing' 6.35% - 'War' 0.61% - 'Food' 81.0% - 'Religion' 3.9% - 
['Poultry Slam 1995', 'Poultry Slam 1996', 'Poultry Slam 1997', 'Poultry Slam 1998', 'Poultry Slam 1999', 'Poultry Slam 2003', 'How to Rest in Peace', 'Who Do You Think You Are?', 'Back to Penn State']


'Music and Media' 41.04% - 'Politics' 8.26% - 'Immigration' 2.32% - 'Family' 30.79% - 'Schooling' 5.23% - 'Crime&Policing' 4.88% - 'War' 3.18% - 'Food' 1.86% - 'Religion' 2.44% - 
['Vacations', 'Dawn', 'Detectives', 'Sissies', 'Business of Death', 'Defying Sickness', 'One of Us', 'Telephone', 'Escape the Box', 'Monogamy', 'I Enjoy Being A Girl, Sort Of', 'Scenes from A Transplant', 'Truth and Lies at Age Ten', 'Last Words', 'You Gonna Eat That?', 'Lockup', 'Valentine’s Day ’99', 'High Cost of Living', 'The Book That Changed Your Life', 'Family Business', 'Barbara', 'Bedside Diplomacy', 'Kids As Adults', 'Crush', '

Conclusion

- The clusters are distinct for the most part. Most clusters are a large majority, 1 topic.
- However, this is not something I will include in my presentation as it has no place in a recommendation app.

## Setting up a CSV file for a word cloud

In [251]:
corp_term = doc_term.sum(axis=0).reset_index()
total = corp_term[0].sum()

corp_term.sort_values(by=0, ascending=False, inplace=True, ignore_index=True)
corp_term.rename(columns={'index':'key', 0:'freq'}, inplace=True)
corp_term['freq'] = round(corp_term['freq']) - 12
corp_term['freq'] = corp_term['freq'].astype(int)
corp_term['word'] = corp_term['key']
corp_term.drop('key', axis=1, inplace=True)

In [255]:
word_cloud = corp_term.iloc[:100]

In [256]:
word_cloud.to_csv('word_cloud.csv', index=False, header=False)

In [257]:
!cat word_cloud.csv

20,dad
20,mom
19,mother
17,father
14,parents
14,speaking
13,police
12,black
12,white
10,city
10,war
9,book
9,government
9,music
8,state
8,david
8,girl
8,town
7,kid
7,street
7,president
7,children
7,york
6,men
6,play
6,women
6,office
6,brother
6,church
5,wife
5,john
5,group
4,party
4,change
4,spanish
4,business
4,girls
4,game
4,court
4,baby
3,care
3,tv
3,company
3,death
3,line
3,mr
3,sex
3,water
3,eyes
3,hospital
3,song
3,shed
3,america
3,son
3,walk
3,states
3,laughter
3,theyd
3,met
2,food
2,wrote
2,hey
2,died
2,word
2,law
2,looks
2,sister
2,christmas
2,class
2,questions
2,letter
2,news
2,middle
2,college
2,store
2,students
2,body
2,child
2,prison
2,boy
2,happy
2,bed
1,pay
1,bleep
1,singing
1,deal
1,playing
1,alex
1,mike
1,laughs
1,building
1,lost
1,movie
1,tells
1,past
1,dog
1,gay
1,running
1,meeting
1,apartment


## Future works

- Come back to the podcasts and scrape dates so that the data can be analyzed over time. 

In [13]:
# # input date?
# for title in list(podcast_vec.keys())
#     p_id = PODCAST_IDS[title]
#     dashed_title = title.replace(' ', '-')
#     link = f'https://www.thisamericanlife.org/{p_id}/{dashed_title}}'
#     #scrape link for "<span class=date-display-single>" .text