# Topic model

In [1]:
# Import necessary modules
import os
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from datetime import datetime
from sentence_transformers import SentenceTransformer
from umap import UMAP
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure the notebook
pd.set_option("display.max_rows", None)

## Setup

First, we load in data, define constants and set up the functions we will need.

In [3]:
# Load in podcast metadata
meta = pd.read_excel("./data/clean/polio_vaccine_podcasts_20240311_115741.xlsx")

In [4]:
# Define constants
RELEVANT_TRANSCRIPTS = "./transcripts/relevant_transcripts"  # Where relevant transcripts are stored
RELEVANT_SEGMENTS = "./transcripts/relevant_segments"  # Where relevant segments are stored
with open('openai_api_key.txt', 'r') as key_file: # API key for OpenAI
    API_KEY = key_file.readline().strip()
CORPUS_SAVE = "./output/corpus" # Where to save pre-processed corpus as passed to model
TOPIC_OUTPUT = "./output/topic_model/topic_info" # Where to save topic info
DOC_OUTPUT = "./output/topic_model/document_info" # Where to save document-level info
MODEL_OUTPUT = "./output/topic_model" # Where to save topic model

In [5]:
# Define functions
def split_into_sentences(source):
    """Splits each transcript into sentences and stores the results
    in a Pandas DataFrame."""                                                                                                                               
                                                                
    nlp = spacy.load("en_core_web_sm")                                     
    df_list = []
         
    for filename in os.listdir(source):
        file_path = os.path.join(source, filename)
        filename_split = filename.split("_")
        file_id = filename_split[0]
        with open(file_path) as f:
            contents = f.read()
            cleaned = contents.replace('\n', '')
            f.close()
            doc = nlp(cleaned)
            doc_split = [sent.text for sent in doc.sents]
            length = len(doc_split)
            print(str(file_id)+": "+"split into "+str(length)+" sentences.")
            value = file_id
            unique_id = [value] * length
            tmp_df = pd.DataFrame(list(zip(unique_id, doc_split)),
                                  columns = ['unique_id', 'sentences']) 
        df_list.append(tmp_df)

    all_dfs = pd.concat(df_list, ignore_index=True)
    
    return all_dfs

In [6]:
def check_create_new_dir(directory):
    """Creates a directory if it does not already exist.
    
    Args:
        directory: Path to the new directory.
    """

    if not os.path.exists(directory):
        os.makedirs(directory)

## Corpus construction

In terms of pre-processing, most of the heavy lifting was done by the scripts that ran earlier in the pipeline, in particular `transcript_preprocess.py`, which segmented the raw transcripts and extracted the segments relevant to our topic (i.e. polio vaccines). However, the segments are too long to be passed directly to BERTopic so we will construct our corpus by breaking them up into sentences. To reduce the amount of noise in the model, we will also drop all sentences of 5 words or fewer.

In [7]:
# Split the segments into sentences
total_files = len([f for f in os.listdir("./transcripts/relevant_segments")
                       if f.endswith('.txt') and 
                       os.path.isfile(os.path.join("./transcripts/relevant_segments", f))])
print("Dividing relevant transcripts into sentences...")
corpus = split_into_sentences(RELEVANT_SEGMENTS)
print(str(total_files)+" documents divided into "+str(len(corpus))+" sentences.")

Dividing relevant transcripts into sentences...
449: split into 142 sentences.
483: split into 590 sentences.
87: split into 95 sentences.
426: split into 53 sentences.
399: split into 0 sentences.
42: split into 45 sentences.
466: split into 40 sentences.
423: split into 282 sentences.
380: split into 53 sentences.
440: split into 16 sentences.
115: split into 19 sentences.
72: split into 0 sentences.
33: split into 211 sentences.
578: split into 42 sentences.
261: split into 105 sentences.
469: split into 35 sentences.
484: split into 75 sentences.
363: split into 33 sentences.
559: split into 79 sentences.
9: split into 59 sentences.
574: split into 462 sentences.
120: split into 104 sentences.
163: split into 33 sentences.
493: split into 31 sentences.
283: split into 68 sentences.
231: split into 316 sentences.
247: split into 11 sentences.
228: split into 0 sentences.
437: split into 132 sentences.
408: split into 12 sentences.
81: split into 242 sentences.
387: split into 31 sen

In [8]:
# Calculate length of each sentence and drop those with <4 words
start_length = len(corpus)
count = corpus['sentences'].str.split().str.len()
corpus = corpus[(count>3)]
end_length = len(corpus)
print("Corpus reduced from "+str(start_length)+" to "+str(end_length)+" sentences.")

Corpus reduced from 37911 to 32770 sentences.


In [9]:
# Save corpus to file
check_create_new_dir(CORPUS_SAVE)

now = datetime.now()
datetime_string = now.strftime("%Y%m%d_%H%M%S")

output = corpus.to_excel(
    "./output/corpus/corpus_sentencized_>3_"+datetime_string+".xlsx",
    index = False)

## Constructing the model

In [10]:
# Pre-calculate embeddings
sentences = corpus['sentences'].to_list()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

Batches: 100%|██████████| 1025/1025 [02:08<00:00,  7.98it/s]


In [11]:
# Configure UMAP to prevent stochastic behaviour
umap_model = UMAP(random_state=42)

In [12]:
# Set domain-specific seed words and weight these higher. This increases
# the likelihood that these words will appear in topic representations.
ctfidf_model = ClassTfidfTransformer(
    seed_words=["polio", "poliovirus", "poliomyelitis", "paralytic",
                "paralysis", "polio vaccine", "polio vaccines",
                "polio vaccination", "polio vaccinations", "unvaccinated",
                "ipv", "opv", "nopv", "nopv2", "ipol",
                "pentacel", "pediarix", "kinrix", "vaxelis", "quadracel",
                "outbreak", "emergency", "wastewater", "sewage",
                "rockland", "sabin", "salk", "shed", "shedding",
                "mutate", "mutation", "mutations", "vaccine-derived",
                "detoxification"], 
    seed_multiplier=2,
    reduce_frequent_words=True
)

In [13]:
# Configure vectorizer model to allow bigrams in topic representations
# and remove stop words 
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [14]:
# Configure representation models

# KeyBert
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
client = openai.OpenAI(api_key=API_KEY)
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,
    "MMR": mmr_model,
    "POS": pos_model
}

In [15]:
# Instantiate topic model
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  ctfidf_model=ctfidf_model,
  vectorizer_model = vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  nr_topics="auto",
  verbose=True
)

In [16]:
# Train model
topics, probs = topic_model.fit_transform(sentences, embeddings)

2024-09-25 14:52:36,150 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-09-25 14:53:35,563 - BERTopic - Dimensionality - Completed ✓
2024-09-25 14:53:35,565 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [17]:
# Inspect output
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,14045,-1_emergency_paralysis_unvaccinated_rockland,"[emergency, paralysis, unvaccinated, rockland,...","[covid 19, long covid, outbreak, patients, inf...",[Poliovirus Outbreak Analysis],"[emergency, paralysis, unvaccinated, rockland,...","[emergency, paralysis, unvaccinated, outbreak,...",[There was just a study released from the Univ...
1,0,1898,0_big big_want just_big_wrong,"[big big, want just, big, wrong, want don, poi...","[conversation, said talk, think talk, communic...",[Heidi Klum's Influence and Morality],"[big big, want just, big, wrong, want don, poi...","[big, wrong, morality, framing, big woodwork, ...",[but they did they used to though like i said ...
2,1,1750,1_unvaccinated_vaccines_covid vaccine_covid 19,"[unvaccinated, vaccines, covid vaccine, covid ...","[covid vaccine, vaccines covid, new vaccine, c...",[Rapid Aging Disease Vaccination],"[unvaccinated, vaccines, covid vaccine, covid ...","[unvaccinated, vaccines, covid vaccines, vacci...",[Because by adding COVID-19 shots to the sched...
3,2,1484,2_polio vaccine_polio_paralytic_poliovirus,"[polio vaccine, polio, paralytic, poliovirus, ...","[polio vaccine, vaccine polio, polio vaccines,...",[Polio Vaccine and Paralytic Outbreak],"[polio vaccine, polio, paralytic, poliovirus, ...","[polio, paralytic, poliovirus, paralysis, outb...","[And then there's oral polio vaccine., The ons..."
4,3,1222,3_podcast_podcasts_dace_halloween,"[podcast, podcasts, dace, halloween, hawks, so...","[week, wives, season, wants retire, mariota, c...",[Halloween-themed podcast analysis],"[podcast, podcasts, dace, halloween, hawks, so...","[podcast, podcasts, dace, halloween, socks, mo...",[and you get back to him and go where's the co...
5,4,523,4_fear_safe_panic_scared,"[fear, safe, panic, scared, school, schools, t...","[neglect fear, fear, think fear, fear fear, li...",[Fear and Panic Dynamics],"[fear, safe, panic, scared, school, schools, t...","[fear, safe, panic, scared, school, schools, t...","[And that's a funny, that's another funny inte..."
6,5,411,5_alice_caleb_alice cooper_mary,"[alice, caleb, alice cooper, mary, caitlin, ch...","[shakira, lisa booth, kim, jennifer, bias jour...",[Media Bias and Celebrity Drama],"[alice, caleb, alice cooper, mary, caitlin, ch...","[caleb, mary, divorce, mom, big actor, estrang...",[It's all in the hips with Shakira By the way ...
7,6,327,6_cancer_medicine_dose_patients,"[cancer, medicine, dose, patients, doses, chem...","[cancer, chemo, recognize cancer, know cancer,...",[Cancer Treatment Innovations],"[cancer, medicine, dose, patients, doses, chem...","[cancer, medicine, dose, patients, doses, chem...","[And just to clarify these studies, this study..."
8,7,276,7_climate_fossil_fuels_fossil fuels,"[climate, fossil, fuels, fossil fuels, climate...","[climate crisis, global warming, climate chang...",[Impact of Fossil Fuels on Climate],"[climate, fossil, fuels, fossil fuels, climate...","[climate, fossil, fuels, fossil fuels, heat, c...","[For more on this, we're pleased to be joined ..."
9,8,271,8_election_democrats_republicans_votes,"[election, democrats, republicans, votes, repu...","[like republicans, lot republicans, democrats ...",[Media Bias and Political Dynamics],"[election, democrats, republicans, votes, repu...","[election, democrats, votes, ballots, voting, ...",[That happened because there was a guy named T...


In [18]:
# Inspect document-level info
doc_info = topic_model.get_document_info(sentences)
doc_info.head()

Unnamed: 0,Document,Topic,Name,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs,Top_n_words,Probability,Representative_document
0,The spirit of the American West is alive and ...,61,61_technocracy_brzezinski_technotronic era_tec...,"[technocracy, brzezinski, technotronic era, te...","[fascism government, leaders technocracy, bure...",[Corporate power and economic policies],"[technocracy, brzezinski, technotronic era, te...","[technocracy, immoral conduct, technotronic, t...","[Yes, many of those regulations do need to be ...",technocracy - brzezinski - technotronic era - ...,0.521341,False
1,The award-winning quarterly devoted to the iss...,-1,-1_emergency_paralysis_unvaccinated_rockland,"[emergency, paralysis, unvaccinated, rockland,...","[covid 19, long covid, outbreak, patients, inf...",[Poliovirus Outbreak Analysis],"[emergency, paralysis, unvaccinated, rockland,...","[emergency, paralysis, unvaccinated, outbreak,...",[There was just a study released from the Univ...,emergency - paralysis - unvaccinated - rocklan...,0.0,False
2,"Each issue contains informative articles, brea...",-1,-1_emergency_paralysis_unvaccinated_rockland,"[emergency, paralysis, unvaccinated, rockland,...","[covid 19, long covid, outbreak, patients, inf...",[Poliovirus Outbreak Analysis],"[emergency, paralysis, unvaccinated, rockland,...","[emergency, paralysis, unvaccinated, outbreak,...",[There was just a study released from the Univ...,emergency - paralysis - unvaccinated - rocklan...,0.0,False
3,Order online from RangeMagazine.com.,-1,-1_emergency_paralysis_unvaccinated_rockland,"[emergency, paralysis, unvaccinated, rockland,...","[covid 19, long covid, outbreak, patients, inf...",[Poliovirus Outbreak Analysis],"[emergency, paralysis, unvaccinated, rockland,...","[emergency, paralysis, unvaccinated, outbreak,...",[There was just a study released from the Univ...,emergency - paralysis - unvaccinated - rocklan...,0.0,False
4,Loving Liberty Network salutes the spirit of t...,61,61_technocracy_brzezinski_technotronic era_tec...,"[technocracy, brzezinski, technotronic era, te...","[fascism government, leaders technocracy, bure...",[Corporate power and economic policies],"[technocracy, brzezinski, technotronic era, te...","[technocracy, immoral conduct, technotronic, t...","[Yes, many of those regulations do need to be ...",technocracy - brzezinski - technotronic era - ...,0.582428,False


In [19]:
# Reduce outliers using c-TF-IDF method
new_topics = topic_model.reduce_outliers(sentences, topics,
                                         strategy="c-tf-idf")

# Update topic representations
topic_model.update_topics(sentences, topics=new_topics,
                          vectorizer_model=vectorizer_model,
                          representation_model=representation_model)

100%|██████████| 263/263 [02:33<00:00,  1.72it/s]


In [20]:
# View updated topic info
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs
0,-1,176,-1_ding_ding ding_rotate_accent,"[ding, ding ding, rotate, accent, iota, rotate...","[rotate, rotate rotate, achieved rotate, iota ...",[Rotate and Rattle Scholarship Award],"[ding, ding ding, rotate, accent, iota, rotate...","[ding, iota, accent, shakes, , , , , , ]",[There was just a study released from the Univ...
1,0,1957,0_big big_big_don_just,"[big big, big, don, just, yeah, think, mean, l...","[just don, don, don want, big big, big, say, h...",[Avoiding Big Mistakes],"[big big, big, don, just, yeah, think, mean, l...","[big, wrong, time, things, lot, shit, people, ...",[but they did they used to though like i said ...
2,1,1874,1_vaccines_covid_vaccine_vaccinated,"[vaccines, covid, vaccine, vaccinated, vaccina...","[covid vaccine, covid vaccines, covid 19, covi...",[COVID-19 Vaccination Effectiveness],"[vaccines, covid, vaccine, vaccinated, vaccina...","[vaccines, covid, vaccine, vaccination, kids, ...",[Because by adding COVID-19 shots to the sched...
3,2,1550,2_polio_polio vaccine_vaccine_polio virus,"[polio, polio vaccine, vaccine, polio virus, o...","[vaccine polio, vaccinated polio, polio, polio...",[Polio Virus and Vaccines],"[polio, polio vaccine, vaccine, polio virus, o...","[polio, vaccine, oral, oral polio, virus, case...","[And then there's oral polio vaccine., The ons..."
4,3,1310,3_podcast_podcasts_like_guy,"[podcast, podcasts, like, guy, listen, dace, d...","[podcasts, podcast, apple podcasts, episodes, ...",[Podcast Bonus Content Subscription],"[podcast, podcasts, like, guy, listen, dace, d...","[podcast, podcasts, guy, dace, movie, fucking,...",[and you get back to him and go where's the co...
5,4,589,4_school_kids_safe_fear,"[school, kids, safe, fear, parents, schools, c...","[kids school, schools, school, know kids, kids...",[School Safety Concerns],"[school, kids, safe, fear, parents, schools, c...","[school, kids, safe, fear, parents, schools, c...","[And that's a funny, that's another funny inte..."
6,5,462,5_mary_like_um_alice,"[mary, like, um, alice, morning, caleb, chicag...","[chicago morning, dan amy, chicago, amy, morni...",[Chicago Morning Answer show],"[mary, like, um, alice, morning, caleb, chicag...","[mary, morning, caleb, mom, stage, agent, spee...",[It's all in the hips with Shakira By the way ...
7,6,379,6_medicine_cancer_dose_patients,"[medicine, cancer, dose, patients, doses, medi...","[second dose, fourth dose, doses, medicine, mi...",[Pediatric Vaccine Dosage Schedule],"[medicine, cancer, dose, patients, doses, medi...","[medicine, cancer, dose, patients, doses, medi...","[And just to clarify these studies, this study..."
8,7,310,7_climate_fossil_fossil fuels_fuels,"[climate, fossil, fossil fuels, fuels, climate...","[climate change, fossil fuels, global warming,...",[Controversy Surrounding Fossil Fuels and Clim...,"[climate, fossil, fossil fuels, fuels, climate...","[climate, fossil, fuels, fossil fuels, heat, c...","[For more on this, we're pleased to be joined ..."
9,8,344,8_election_trump_democrats_republicans,"[election, trump, democrats, republicans, pres...","[election happens, throw election, election, p...",[Election Politics Analysis],"[election, trump, democrats, republicans, pres...","[election, trump, democrats, president, democr...",[That happened because there was a guy named T...


In [21]:
# View updated document-level info
doc_info = topic_model.get_document_info(sentences)
doc_info.head()

Unnamed: 0,Document,Topic,Name,Representation,KeyBERT,OpenAI,MMR,POS,Representative_Docs,Top_n_words,Probability,Representative_document
0,The spirit of the American West is alive and ...,61,61_technocracy_economic_access_singular,"[technocracy, economic, access, singular, sing...","[economic order, world commerce, commerce sing...",[New International Economic Order],"[technocracy, economic, access, singular, sing...","[technocracy, economic, access, singular, sing...","[Yes, many of those regulations do need to be ...",technocracy - economic - access - singular - s...,0.521341,False
1,The award-winning quarterly devoted to the iss...,46,46_award_day_monday_sunday,"[award, day, monday, sunday, tin, disney, awar...","[dog day, disney plus, labor day, disney ll, a...",[Hollywood Awards and Disney Day],"[award, day, monday, sunday, tin, disney, awar...","[award, day, tin, disney, awards, meeting, upd...",[well i think now the difference is that they'...,award - day - monday - sunday - tin - disney -...,0.0,False
2,"Each issue contains informative articles, brea...",252,252_regret_mean ve_look like_moral,"[regret, mean ve, look like, moral, faced, bur...","[like public, public record, followers socials...",[Moral Crisis in Medical Research],"[regret, mean ve, look like, moral, faced, bur...","[regret, moral, burden, articles, deep, shame,...","[I mean, this is the type of thing that was ne...",regret - mean ve - look like - moral - faced -...,0.0,False
3,Order online from RangeMagazine.com.,61,61_technocracy_economic_access_singular,"[technocracy, economic, access, singular, sing...","[economic order, world commerce, commerce sing...",[New International Economic Order],"[technocracy, economic, access, singular, sing...","[technocracy, economic, access, singular, sing...","[Yes, many of those regulations do need to be ...",technocracy - economic - access - singular - s...,0.0,False
4,Loving Liberty Network salutes the spirit of t...,61,61_technocracy_economic_access_singular,"[technocracy, economic, access, singular, sing...","[economic order, world commerce, commerce sing...",[New International Economic Order],"[technocracy, economic, access, singular, sing...","[technocracy, economic, access, singular, sing...","[Yes, many of those regulations do need to be ...",technocracy - economic - access - singular - s...,0.582428,False


## Save outputs

In [22]:
# Save topic model
check_create_new_dir(MODEL_OUTPUT)
topic_model.save(MODEL_OUTPUT, serialization="safetensors",
                 save_ctfidf=True, save_embedding_model=embedding_model)

In [23]:
# Save topic info as Excel file
check_create_new_dir(TOPIC_OUTPUT)

now = datetime.now()
datetime_string = now.strftime("%Y%m%d_%H%M%S")

output = topic_info.to_excel(
    TOPIC_OUTPUT+"/topic_info_raw_"+datetime_string+".xlsx",
    index = False)

In [24]:
# Save document info as Excel file
check_create_new_dir(DOC_OUTPUT)

now = datetime.now()
datetime_string = now.strftime("%Y%m%d_%H%M%S")

output = doc_info.to_excel(
    DOC_OUTPUT+"/document_info_raw_"+datetime_string+".xlsx",
    index = False)

In [25]:
# Save annotated version of corpus with topics, podcast & episode titles

# Set up mappings
topics_dict = doc_info.set_index('Document')['Topic'].to_dict()
podcast_dict = meta.set_index('unique_id')['podcast_name'].to_dict()
episodes_dict = meta.set_index('unique_id')['episode_title'].to_dict()

# Apply mappings
corpus_enriched = corpus.copy()
meta.unique_id = meta.unique_id.astype(int)
corpus_enriched.unique_id = corpus_enriched.unique_id.astype(int)
corpus_enriched["Topic"] = corpus_enriched.sentences.map(topics_dict)
corpus_enriched["podcast_name"] = corpus_enriched.unique_id.map(podcast_dict)
corpus_enriched["episode_title"] = corpus_enriched.unique_id.map(episodes_dict)
corpus_enriched.head()

# Save DataFrame as Excel file
now = datetime.now()
datetime_string = now.strftime("%Y%m%d_%H%M%S")

output = corpus_enriched.to_excel(
    CORPUS_SAVE+"/corpus_annotated_raw_topics_"+datetime_string+".xlsx",
    index = False)