# Topic Map

In [159]:
import os
import pandas as pd
import re
import nltk
import openai
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic import BERTopic
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

openai.api_key=os.environ['openai_api_key'] 
transcription_dir_location_en = '../data/processed/transcription/en/'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/frasercrichton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [87]:
docs = []
timestamps = []

# the text contains spoken filler words that need removed 
stopwords = ['um', 'ah', 'mhm', 'mhm.', 'oh', 'huh?', 'hi']     

def remove_stop_words(sentence):
    text_tokens = sentence.split(" ")
    tokens_filtered = [word for word in text_tokens if not word.lower() in stopwords]
    return (" ").join(tokens_filtered)

def parse_text(file_location): 
    transcriptions_df = pd.read_json(file_location, convert_dates=['createTime'])
    transcription_text = transcriptions_df['transcript'][0]
    transcription_create_time = transcriptions_df['createTime'][0].strftime('%Y-%m-%d %H:%M:%S')
    
    # only analyse files with sentences with more then 27 words
    if len(transcription_text.split()) < 27:
        print(f'Small number of words: {file_location} {transcription_text}')
        return
    
    for sentence in sent_tokenize(transcription_text):
        timestamps.append(transcription_create_time)
        sentence = sentence.replace("Fratelli D'Italia", 'Brothers of Italy').replace("Fratelli d'Italia", 'Brothers of Italy')
        sentence = remove_stop_words(sentence)                 
        docs.append(sentence)
    
for file_name in os.listdir(transcription_dir_location_en):
    parse_text(transcription_dir_location_en + file_name)


# temp_docs = [doc for doc in docs if len(doc) > 3]
# temp_docs
# topics_over_time = model.topics_over_time(tweets, timestamps)
docs
# timestamps
# Python program to demonstrate 
# saving a text file 


with open('temp.txt', 'w') as file: 
	file.writelines("% s\n" % data for data in docs) 


Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7147206448232074502.json gentlemen, It was the day of defeat. But this is not the day to get things done. Well, that wasn't really the case. That's fine. 
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7147263913200045317.json September twenty-fifth I've said it all. 
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7135145359629634822.json 
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7144425341279341830.json I am
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7070073900100521222.json who goes M design, yo. 
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7144420910336953606.json 
Small number of words: ../data/processed/transcription/en/TranscribeTikTokAudio7143163842665663749.json 
Small number of words: ../data/processed/transcription/en/Transcribe

1) Pre-calculate Embeddings

In [197]:
from bertopic.backend import OpenAIBackend
# import spacy
# !python -m spacy download en_core_web_md
# nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 
#                                             'attribute_ruler', 'lemmatizer'])
# embedding_model = OpenAIBackend('text-embedding-ada-002"')

# import gensim.downloader as api
# ft = api.load('fasttext-wiki-news-subwords-300')

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embedding_model = ft

# embeddings = embedding_model.encode(docs, show_progress_bar=True)

Stochastic Behaviour

In [5]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

Limit Number of Topics

In [19]:
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

Improving Default Representation

In [206]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [241]:
# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# representation_model = [
#     keybert_model,
#     pos_model,
#     # openai_model
# ]

representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    # "MMR": mmr_model,
    "POS": pos_model
}

In [242]:
seed_topic_list = [['abortion', 'abort', 'aborts', 'pregnancy', 'pregnant', 'woman', 'alternative'],
                    # ['family'],
                    ['rape', 'raped'],
                   ['election', 'government', 'vote'],
                   ['bills', 'inflation'],
                   ['immigration', 'migration', 'refugee', 'traffickers'],
                   ['Italy', 'Italian', 'brothers'],
                   ['ecological', 'climate' , 'environment',  'sustainability'],
                   ['fake news', 'lie', 'conspiracy', 'journalism'],
                   ['tax', 'income'],
                   ['freedom']]
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# https://towardsdatascience.com/topic-modeling-with-llama-2-85177d01e174
topic_model = BERTopic(
  ctfidf_model=ctfidf_model,
  seed_topic_list= seed_topic_list,
  embedding_model=embedding_model,
  # umap_model=umap_model,
  # hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  # top_n_words=5,
  # diversity=0.7,
  # nr_topics='auto', < less topcs
  verbose=True,
  # min_topic_size=5
  # calculate_probabilities=False
)

topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()
# https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

2023-10-05 09:10:04,523 - BERTopic - Transformed documents to Embeddings


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-05 09:10:13,249 - BERTopic - Reduced dimensionality
2023-10-05 09:10:13,317 - BERTopic - Clustered reduced embeddings


In [249]:
topics=[2, 4, 5, 7, 9, 13, 15, 16, 20]
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,POS,Representative_Docs
0,-1,837,-1_government_our_be_say,"[government, our, be, say, was, so, not, peopl...","[government, political, support, culture, pres...",[Government efforts and public belief],"[government, people, culture, first, thing, po...",[We have started important work to make the pl...
1,0,174,0_italian_italy_italians_government,"[italian, italy, italians, government, how, gr...","[italians, italian, italy, europe, citizens, g...",[Italy's Role in Europe],"[government, great, nation, problem, left, onl...","[I can speak very badly about Italy, but I do ..."
2,1,145,1_thank_ready_happy_reality,"[thank, ready, happy, reality, here, wait, eve...","[ourselves, merits, greetings, we, advantage, ...",[Recognition of Women's Underestimated Advantage],"[ready, happy, reality, evening, goals, determ...","[It's the reason we are here today, so thank y..."
3,2,140,2_brothers_italy_italian_cross,"[brothers, italy, italian, cross, world, party...","[italians, italian, italy, francesco, election...",[Brothers of Italy elections],"[brothers, cross, world, party, elections, cen...",[Then even funnier than this is the reason why...
4,3,133,3_yes_hi_okay_too,"[yes, hi, okay, too, ve, why, heard, no, look,...","[yes, more, mainly, seems, absolutely, probabl...",[Echoes and Confusion],"[fine, echo, professionalism, bit, question, m...","[No, I mean okay, things will be fine anyway.,..."
5,4,72,4_immigration_migration_traffickers_trafficking,"[immigration, migration, traffickers, traffick...","[immigration, migrants, refugees, immigrants, ...",[Combating Illegal Immigration and Trafficking],"[immigration, traffickers, trafficking, illega...",[The route is not covered by non-governmental ...
6,5,65,5_freedom_free_rights_liberal,"[freedom, free, rights, liberal, right, love, ...","[freedom, rights, totalitarianisms, democracy,...",[Celebrating Freedom and Rights],"[freedom, free, rights, liberal, right, love, ...","[A great Italian liberal philosopher, Benedett..."
7,6,61,6_income_tax_citizen_euros,"[income, tax, citizen, euros, citizenship, tho...","[income, taxes, tax, taxation, incomes, earner...",[Tax cuts and guarantees],"[income, tax, citizen, euros, citizenship, fla...","[So today and until the end of this year, we h..."
8,7,50,7_resources_funds_supply_initiatives,"[resources, funds, supply, initiatives, fund, ...","[initiatives, organizations, projects, fund, f...",[Development initiatives and funding],"[resources, funds, supply, initiatives, fund, ...","[For this reason, the objective of our work mu..."
9,8,46,8_vote_electoral_campaign_elections,"[vote, electoral, campaign, elections, require...","[electoral, elections, ballots, election, voti...","[""Electoral Campaign and Voting""]","[electoral, campaign, elections, promise, elec...","[That is, I even realized the law because I am..."


In [254]:
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,POS,Representative_Docs
0,-1,837,-1_say_government_people_let,"[say, government, people, let, obviously, want...","[government, political, support, culture, pres...",[Government efforts and public belief],"[government, people, culture, first, thing, po...",[We have started important work to make the pl...
1,0,174,0_italy_italian_italians_government,"[italy, italian, italians, government, nation,...","[italians, italian, italy, europe, citizens, g...",[Italy's Role in Europe],"[government, great, nation, problem, left, onl...","[I can speak very badly about Italy, but I do ..."
2,1,145,1_thank_ready_happy_reality,"[thank, ready, happy, reality, good, president...","[ourselves, merits, greetings, we, advantage, ...",[Recognition of Women's Underestimated Advantage],"[ready, happy, reality, evening, goals, determ...","[It's the reason we are here today, so thank y..."
3,2,140,2_brothers_brothers italy_italy_italian,"[brothers, brothers italy, italy, italian, wor...","[italians, italian, italy, francesco, election...",[Brothers of Italy elections],"[brothers, cross, world, party, elections, cen...",[Then even funnier than this is the reason why...
4,3,133,3_yes_yes yes_okay_ve heard,"[yes, yes yes, okay, ve heard, ve, look, heard...","[yes, more, mainly, seems, absolutely, probabl...",[Echoes and Confusion],"[fine, echo, professionalism, bit, question, m...","[No, I mean okay, things will be fine anyway.,..."
5,4,72,4_immigration_people_international law_traffic...,"[immigration, people, international law, traff...","[immigration, migrants, refugees, immigrants, ...",[Combating Illegal Immigration and Trafficking],"[immigration, traffickers, trafficking, illega...",[The route is not covered by non-governmental ...
6,5,65,5_freedom_free_right_rights,"[freedom, free, right, rights, liberal, love, ...","[freedom, rights, totalitarianisms, democracy,...",[Celebrating Freedom and Rights],"[freedom, free, rights, liberal, right, love, ...","[A great Italian liberal philosopher, Benedett..."
7,6,61,6_income_tax_thousand euros_euros,"[income, tax, thousand euros, euros, citizen i...","[income, taxes, tax, taxation, incomes, earner...",[Tax cuts and guarantees],"[income, tax, citizen, euros, citizenship, fla...","[So today and until the end of this year, we h..."
8,7,50,7_resources_supply_initiatives_fund,"[resources, supply, initiatives, fund, used, h...","[initiatives, organizations, projects, fund, f...",[Development initiatives and funding],"[resources, funds, supply, initiatives, fund, ...","[For this reason, the objective of our work mu..."
9,8,46,8_electoral_vote_electoral campaign_campaign,"[electoral, vote, electoral campaign, campaign...","[electoral, elections, ballots, election, voti...","[""Electoral Campaign and Voting""]","[electoral, campaign, elections, promise, elec...","[That is, I even realized the law because I am..."


In [250]:
topic_model.visualize_barchart(width=280, height=330, top_n_topics=60, n_words=20)

In [212]:
# new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings", embeddings=embeddings)
# new_topics = topic_model.reduce_outliers(docs, topics)
# topic_model.update_topics(docs, topics=new_topics)
# topic_model.get_topic_info()

## Topic Labelling

Rename and format topic labels.

In [222]:
topic_lables = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=15, separator=' - ')
topic_model.set_topic_labels(topic_lables)

# llama2_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Llama2"].values()]
# topic_model.set_topic_labels(llama2_labels)
print(topic_model.get_topic_info()['KeyBERT'][0])
# topic_model.set_topic_labels({1: "Abortion"})
# topic_model.set_topic_labels({2: "Immigration"})
# topic_model.set_topic_labels({3: "Fake News"})
# topic_model.set_topic_labels({4: "Freedom"})
# topic_model.set_topic_labels({14: "Taxes"})
# topic_model.set_topic_labels({14: "Cost of Living/Inflation"})
# topic_model.set_topic_labels({14: "Election Campaign"})
# topic_model.set_topic_labels({14: "Rape"})
# topic_model.set_topic_labels({14: "Climate Change"})
topic_model.get_topic_info().head(20)

['reform', 'citizens', 'government', 'today', 'political', 'families', 'work', 'we', 'when', 'point']


Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,799,-1_government_our_people_will,government - our - people,"[government, our, people, will, one, say, was,...","[reform, citizens, government, today, politica...","[government, our, people, will, one, say, was,...","[government, people, more, families, political...",[It is exactly from this philosophy that we mo...
1,0,193,0_thank_ready_happy_reality,thank - ready - happy,"[thank, ready, happy, reality, here, wait, rea...","[intimidated, emotion, speaking, very, tones, ...","[thank, ready, happy, reality, here, wait, rea...","[ready, happy, reality, more, permission, mome...",[I apologize for the emotion of approaching th...
2,1,146,1_yes_hi_enough_okay,yes - hi - enough,"[yes, hi, enough, okay, surreal, too, ve, why,...","[heard, hear, absolutely, yes, mainly, about, ...","[yes, hi, enough, okay, surreal, too, ve, why,...","[enough, surreal, bit, fine, echo, finish, pro...","[No, I mean okay, things will be fine anyway.,..."
3,2,142,2_brothers_italy_italian_cross,brothers - italy - italian,"[brothers, italy, italian, cross, world, party...","[italians, italian, italy, francesco, election...","[brothers, italy, italian, cross, world, party...","[brothers, cross, world, party, elections, cen...",[Then even funnier than this is the reason why...
4,3,103,3_italy_italian_italians_knows,italy - italian - italians,"[italy, italian, italians, knows, government, ...","[italians, italian, italy, government, europe,...","[italy, italian, italians, knows, government, ...","[government, great, nation, collaboration, res...","[I can speak very badly about Italy, but I do ..."
5,4,70,4_immigration_migration_traffickers_trafficking,immigration - migration - traffickers,"[immigration, migration, traffickers, traffick...","[migrants, immigration, refugees, trafficking,...","[immigration, migration, traffickers, traffick...","[immigration, traffickers, trafficking, illega...",[The route is not covered by non-governmental ...
6,5,70,5_freedom_free_rights_liberal,freedom - free - rights,"[freedom, free, rights, liberal, right, love, ...","[freedom, secularity, rights, totalitarianisms...","[freedom, free, rights, liberal, right, love, ...","[freedom, free, rights, liberal, right, love, ...","[A great Italian liberal philosopher, Benedett..."
7,6,63,6_ministers_council_sacrifices_minister,ministers - council - sacrifices,"[ministers, council, sacrifices, minister, pro...","[honor, celebration, celebrate, salute, proud,...","[ministers, council, sacrifices, minister, pro...","[ministers, council, sacrifices, minister, pro...",[Demonstrate the pride that we demonstrate bec...
8,7,63,7_income_tax_citizen_euros,income - tax - citizen,"[income, tax, citizen, euros, citizenship, tho...","[taxes, tax, taxation, income, incomes, vat, r...","[income, tax, citizen, euros, citizenship, tho...","[income, tax, citizen, euros, citizenship, fla...","[So today and until the end of this year, we h..."
9,8,52,8_resources_funds_initiatives_fund,resources - funds - initiatives,"[resources, funds, initiatives, fund, supply, ...","[initiatives, economic, organizations, project...","[resources, funds, initiatives, fund, supply, ...","[resources, funds, initiatives, fund, supply, ...","[For this reason, the objective of our work mu..."


In [68]:
# merge topics
topic_model.merge_topics(docs, topics_to_merge=[-1, 1])
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,999,-1_the_to_that_of,"[the, to, that, of, and, in, we, is, you, are]",[ I can announce to you now that the budget la...
1,0,456,0_the_of_italy_to,"[the, of, italy, to, and, in, is, that, we, it]","[ Easy it doesn't take much you have to study,..."
2,1,56,1_we_work_ready_we are,"[we, work, ready, we are, here, you, are, to, ...",[ And obviously we are already announcing that...
3,2,56,2_the_campaign_electoral_that,"[the, campaign, electoral, that, right, is, th...","[ I am potentially the majority now, will I de..."
4,3,53,3_we_you_responsibility_is,"[we, you, responsibility, is, to, this, and, t...",[ So the higher we climb and the more we bring...


In [None]:

# %%time
# model = BERTopic(
#     n_gram_range=(1, 2),
#     vectorizer_model=vectorizer_model,
#     nr_topics='auto',
#     min_topic_size=10,
#     diversity=0.7,
#     seed_topic_list=[
#         ["experience", "bad", "good", "nice"],
#         ["place", "atmosphere", "toilet", "clean"],
#         ["staff", "waitress", "service"],
#         ["wait", "time", "long"],
#         ["food", "taste"]
#     ],
#     calculate_probabilities=True).fit(docs, corpus_embeddings)
# remove outliers
#  Comment out this line below if you decided to use the "propbabilities" strategy
# new_topics = model.reduce_outliers(docs, topics, strategy="c-tf-idf")


# Reduce outliers using the `probabilities` strategy (Uncomment to use this)
# #new_topics = model.reduce_outliers(docs, topics, probabilities=probabilities, strategy="probabilities")
# This line is to update the model with the latest topic assignment 
# model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)

# df_topic_freq = model.get_topic_freq()
# topics_count = len(df_topic_freq) - 1
# df_topic_freq

In [230]:
topic_model.get_topic(6, full=True)

{'Main': [('resources', 0.4210617587670018),
  ('supply', 0.3074709349220746),
  ('funds', 0.3074709349220746),
  ('chains', 0.2844834341485178),
  ('fund', 0.27799883639406936),
  ('initiatives', 0.2686058216276156),
  ('farmers', 0.24794738314868262),
  ('cases', 0.24463657439521205),
  ('interventions', 0.23816596207920385),
  ('support', 0.23395728687359035)]}

In [255]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,KeyBERT,OpenAI,POS,Representative_Docs,Top_n_words,Probability,Representative_document
0,Another important chapter of our work in recen...,7,7_resources_supply_initiatives_fund,"[resources, supply, initiatives, fund, used, h...","[initiatives, organizations, projects, fund, f...",[Development initiatives and funding],"[resources, funds, supply, initiatives, fund, ...","[For this reason, the objective of our work mu...",resources - supply - initiatives - fund - used...,0.626189,False
1,The government has allocated over one billion ...,7,7_resources_supply_initiatives_fund,"[resources, supply, initiatives, fund, used, h...","[initiatives, organizations, projects, fund, f...",[Development initiatives and funding],"[resources, funds, supply, initiatives, fund, ...","[For this reason, the objective of our work mu...",resources - supply - initiatives - fund - used...,0.925143,False
2,"Because the scenes of degradation, abandonment...",11,11_safety_emergency_defend_especially,"[safety, emergency, defend, especially, lives,...","[safety, protection, emergencies, risk, defend...",[Safety and Emergency Defense],"[safety, emergency, alternation, emergencies, ...",[We want to defend people who keep our beaches...,safety - emergency - defend - especially - liv...,0.805523,False
3,So we decided to increase the salaries and ove...,-1,-1_say_government_people_let,"[say, government, people, let, obviously, want...","[government, political, support, culture, pres...",[Government efforts and public belief],"[government, people, culture, first, thing, po...",[We have started important work to make the pl...,say - government - people - let - obviously - ...,0.000000,False
4,We have decided to limit the possibility that ...,-1,-1_say_government_people_let,"[say, government, people, let, obviously, want...","[government, political, support, culture, pres...",[Government efforts and public belief],"[government, people, culture, first, thing, po...",[We have started important work to make the pl...,say - government - people - let - obviously - ...,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...
2320,"Yesterday established a principle, namely that...",-1,-1_say_government_people_let,"[say, government, people, let, obviously, want...","[government, political, support, culture, pres...",[Government efforts and public belief],"[government, people, culture, first, thing, po...",[We have started important work to make the pl...,say - government - people - let - obviously - ...,0.000000,False
2321,The approach of the European Council in the co...,13,13_european_europe_european council_council,"[european, europe, european council, council, ...","[europeanism, europe, eu, european, departures...",[Europe's Response to Departures],"[witch, departures, humanity, addition, availa...",[The only way to seriously deal with this matt...,european - europe - european council - council...,1.000000,False
2322,"The approach that the European Council, put in...",13,13_european_europe_european council_council,"[european, europe, european council, council, ...","[europeanism, europe, eu, european, departures...",[Europe's Response to Departures],"[witch, departures, humanity, addition, availa...",[The only way to seriously deal with this matt...,european - europe - european council - council...,1.000000,False
2323,Immigration is a European problem and needs a ...,4,4_immigration_people_international law_traffic...,"[immigration, people, international law, traff...","[immigration, migrants, refugees, immigrants, ...",[Combating Illegal Immigration and Trafficking],"[immigration, traffickers, trafficking, illega...",[The route is not covered by non-governmental ...,immigration - people - international law - tra...,1.000000,False


In [252]:
topic_model.visualize_topics()

## Visualise Topics

In [256]:
topic_model.visualize_heatmap(n_clusters=10)

## Topics over time 

Make sure to use a limited number of unique timestamps (<100) as the c-TF-IDF representation will be calculated at each single unique timestamp. Having a large number of unique timestamps can take some time to be calculated. Moreover, there aren't many use-cased where you would like to see the difference in topic representations over more than 100 different timestamps.

In [257]:

topics_over_time = topic_model.topics_over_time(docs, timestamps)
topic_model.visualize_topics_over_time(topics_over_time, topics=topics)

166it [00:01, 121.50it/s]


In [248]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(docs)

In [239]:
doc_id = 41
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)
topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])

100%|██████████| 3/3 [00:00<00:00,  7.74it/s]


Unnamed: 0,We,must,always,do,better,but,am,satisfied,with,the,fact,that,not,even,day,has,gone,by,Saturdays,Sundays,including,holidays,without,us,trying,to,give,at,least,one,answer,always.1,on,Italy,side
1_bit_echo_professionalism_girl,0.0,0.108,0.222,0.335,0.335,0.226,0.113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2_ready_evening_happy_moment,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18_capable_advantage_lunatic_merits,0.1,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108,0.247,0.378,0.378,0.27,0.131,0.0
23_proud_celebration_sacrifice_politics,0.0,0.109,0.219,0.343,0.343,0.234,0.124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26_tones_propaganda_respect_adhesion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104,0.104,0.104,0.104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29_state_just_fair_assistance,0.125,0.125,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32_income_citizen_minor_disabled,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119,0.119,0.119,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34_structured_proposal_counter_specious,0.0,0.0,0.0,0.112,0.112,0.112,0.112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40_principle_municipality_controls_bulk,0.139,0.139,0.139,0.139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
import matplotlib
import matplotlib.pyplot as plt

# Prepare data for plotting
embeddings = topic_model._extract_embeddings(docs, method="document")
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings)
df = pd.DataFrame(umap_model.embedding_, columns=["x", "y"])
df["topic"] = topics

# Plot parameters
top_n = 10
fontsize = 12

# Slice data
to_plot = df.copy()
to_plot[df.topic >= top_n] = -1
outliers = to_plot.loc[to_plot.topic == -1]
non_outliers = to_plot.loc[to_plot.topic != -1]

# Visualize topics
cmap = matplotlib.colors.ListedColormap(['#FF5722', # Red
                                         '#03A9F4', # Blue
                                         '#4CAF50', # Green
                                         '#80CBC4', # FFEB3B
                                         '#673AB7', # Purple
                                         '#795548', # Brown
                                         '#E91E63', # Pink
                                         '#212121', # Black
                                         '#00BCD4', # Light Blue
                                         '#CDDC39', # Yellow/Red
                                         '#AED581', # Light Green
                                         '#FFE082', # Light Orange
                                         '#BCAAA4', # Light Brown
                                         '#B39DDB', # Light Purple
                                         '#F48FB1', # Light Pink
                                         ])

# Visualize outliers + inliers
fig, ax = plt.subplots(figsize=(15, 15))
scatter_outliers = ax.scatter(outliers['x'], outliers['y'], c="#E0E0E0", s=1, alpha=.3)
scatter = ax.scatter(non_outliers['x'], non_outliers['y'], c=non_outliers['topic'], s=1, alpha=.3, cmap=cmap)

# Add topic names to clusters
centroids = to_plot.groupby("topic").mean().reset_index().iloc[1:]
for row in centroids.iterrows():
    topic = int(row[1].topic)
    text = f"{topic}: " + "_".join([x[0] for x in topic_model.get_topic(topic)[:3]])
    ax.text(row[1].x, row[1].y*1.01, text, fontsize=fontsize, horizontalalignment='center')

ax.text(0.99, 0.01, f"BERTopic - Top {top_n} topics", transform=ax.transAxes, horizontalalignment="right", color="black")
plt.xticks([], [])
plt.yticks([], [])
plt.show()

ValueError: Length of values (9) does not match length of index (2325)

In [None]:
topic_model.hierarchical_topics(docs)

# semi supervised topics model
# topic_model.fit(docs, y = y)

# incremental topic modelling
# topic_model.partial_fit(docs)

# Dynamci topics topics over time
# topic_model.topics_over_time(docs, timestamps=)

# class based topic modelling
# topic_model.topics_per_class(docs, classes)
