## Initialise libraries

In [None]:
pip install bertopic

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

import nltk
# Fetch the NLTK stop-word corpus so the French list below can be read.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the imported corpus object to the French
# stop-word list; the list is what CountVectorizer receives later in the notebook.
stopwords = stopwords.words('french')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Import media dataset

In [None]:
# Load the media dataset from a zipped CSV into `df`.
df = pd.read_csv("FULL_with_titles.zip", compression="zip")

# Topic Modelling Colab – Calculate, explore, save

## Define documents, conduct and save topic modelling

In [None]:
# Cast the phrase column to str (written back to `df`) and keep each distinct
# phrase once, in order of first appearance, as the corpus for topic modelling.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string "nan", which would then become a document — verify against the data.
url_phrases = df["normalized_url_phrases"].astype(str)
df["normalized_url_phrases"] = url_phrases
documents = url_phrases.unique().tolist()

In [None]:
# Load the sentence-embedding model and encode every document once; the resulting
# vectors are handed to BERTopic as precomputed embeddings when the model is fitted.
# NOTE(review): "all-MiniLM-L6-v2" is described upstream as an English model while the
# stop words used for the topic representation are French — confirm this is intended.
model = SentenceTransformer("all-MiniLM-L6-v2")
document_vectors = model.encode(documents,show_progress_bar=True)

Batches:   0%|          | 0/7096 [00:00<?, ?it/s]

In [None]:
# Bag-of-words vectorizer for the topic representations, filtered with the
# stop-word list prepared at the top of the notebook.
vectorizer = CountVectorizer(stop_words=stopwords)

# Dimensionality reduction applied to the document embeddings before clustering.
# random_state is fixed so the projection is repeatable.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=2,
    metric='euclidean',
    cluster_selection_method='eom',
)

# Assemble the pipeline from the components above.
# NOTE(review): `language` is presumably ignored by BERTopic when an explicit
# `embedding_model` is supplied — confirm against the BERTopic documentation.
topic_model = BERTopic(
    language="multilingual",
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    verbose=True,
)

# Fit on the documents, reusing the embeddings computed earlier instead of
# encoding the corpus a second time.
topic_model = topic_model.fit(documents, embeddings=document_vectors)

2024-03-04 09:54:40,094 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-04 10:02:01,046 - BERTopic - Dimensionality - Completed ✓
2024-03-04 10:02:01,056 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-04 10:02:20,703 - BERTopic - Cluster - Completed ✓
2024-03-04 10:02:20,816 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-04 10:02:28,597 - BERTopic - Representation - Completed ✓


In [None]:
# Persist the fitted topic model to disk under the given path.
# NOTE(review): no `serialization` argument is passed, so BERTopic's default format
# applies — presumably pickle; confirm before loading such files from untrusted sources.
topic_model.save("model 1 - headlines")



## Visualize a bar chart of the first 100 topics

In [None]:
# Render BERTopic's bar-chart view for the first 100 topics.
topic_model.visualize_barchart(top_n_topics=100)

## Assign topics back to df_topics

In [None]:
# Pair every unique document with the topic id assigned to it by the fitted
# model, write the mapping to CSV, then preview the first rows.
df_topics = pd.DataFrame(
    {
        "normalized_url_phrases": documents,
        "topic": topic_model.topics_,
    }
)
df_topics.to_csv("FR_headlines_topics.csv", index=False)
df_topics.head()

Unnamed: 0,normalized_url_phrases,topic
0,rennes ils tendent un guet apens aux pompiers ...,-1
1,on devrait faire payer aux francais les appels...,-1
2,sciences po supprime son concours d entree 201...,144
3,ils lancent une petition le retour des veritab...,264
4,un label pour les commercants secouristes de l...,-1


In [None]:
# Count the distinct topic ids, leaving out -1 (the outlier bucket shown as the
# first row of the topic table). Topic ids start at 0, so the previous
# max(df_topics.topic) reported the highest id, i.e. one less than the number
# of topics.
n_topics = df_topics.loc[df_topics["topic"] != -1, "topic"].nunique()
print(f'The topic model yielded a total of {n_topics} topics derived from {len(documents)} unique documents. The topic model table looks like this:')
print("")
# Per-topic summary table; kept in `topic_info_df` because later cells read it.
topic_info_df = topic_model.get_topic_info()
topic_info_df

The topic model yielded a total of 1803 topics derived from 227066 unique documents. The topic model table looks like this:



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,62184,-1_sarkozy_video_macron_emmanuel,"[sarkozy, video, macron, emmanuel, femmes, fem...",[et apres quoi 41 des francais affirment qu i...
1,0,1640,0_4497916_bonheur_3232_boulot,"[4497916, bonheur, 3232, boulot, entreprises, ...",[vous etes grosse et moche soyez seulement moc...
2,1,1254,1_migrants_migrant_mediterranee_accueil,"[migrants, migrant, mediterranee, accueil, lib...",[libye sahara plus de migrants meurent terre q...
3,2,1110,2_politique_policiers_politiques_billet,"[politique, policiers, politiques, billet, pol...",[le billet politique du jeudi 15 novembre 2018...
4,3,1016,3_20190131_20190626_daesh_20190623,"[20190131, 20190626, daesh, 20190623, 20190315...",[2440711 20190131 drome salarie amazon licenci...
...,...,...,...,...,...
1800,1799,20,1799_burundi_nkurunziza_nshimirimana_bujumbura,"[burundi, nkurunziza, nshimirimana, bujumbura,...",[20180905 burundi commission enquete onu pierr...
1801,1800,20,1800_guerrab_jid_el_casque,"[guerrab, jid, el, casque, mjid, membre, deput...",[le depute m jid el guerrab decide de ne pas d...
1802,1801,20,1801_enfarine_destabiliser_erigeait_858710,"[enfarine, destabiliser, erigeait, 858710, 831...",[video en deplacement a romilly sur seine fran...
1803,1802,20,1802_mannequin_mannequins_negzzia_23347424,"[mannequin, mannequins, negzzia, 23347424, mai...",[negzzia mannequin en iran demandeuse d asile ...


## Explore topics to look for those related to climate and immigration – top 5

In [None]:
def find_similar_topics(word, top_n=5):
  """Print the topics most similar to a search term.

  Looks the term up with ``topic_model.find_topics`` and, for each returned
  topic, prints its number together with the ``Representative_Docs`` and
  ``Representation`` entries found for it in the notebook-level
  ``topic_info_df`` table.

  Args:
    word: Search term passed to ``topic_model.find_topics``.
    top_n: Number of topics to request; defaults to 5, the value that was
      previously hard-coded.

  Returns:
    None. The results are printed.
  """
  similar_topics, similarity = topic_model.find_topics(word, top_n=top_n)
  print(f"The top {top_n} topics relating to {word} are: ")
  print("")

  for topic in similar_topics:
    # Select the topic's row(s) once and reuse them for both printed fields.
    topic_rows = topic_info_df[topic_info_df.Topic == topic]
    print("Topic number:", topic)
    print("Representative document:", list(topic_rows["Representative_Docs"]))
    print("Representative words:", list(topic_rows["Representation"]))
    print("----------------------")

In [None]:
# Print the topics closest to the search term "immigration".
find_similar_topics("immigration")

The top 5 topics relating to immigration are: 

Topic number: 348
Representative document: [['loi asile et immigration une operation gilets de sauvetage devant le senat mardi', 'loi asile et immigration le fn et en marche main dans la main a l assemblee', 'trois deputes lrem menaces apres avoir refuse de voter la loi asile immigration']]
Representative words: [['immigration', 'asile', 'texte', 'lhemicycle', 'loi', '1654200', '23662877', 'dhapsatou', 'mexique', 'exclue']]
----------------------
Topic number: 1280
Representative document: [['immigration affaires europe une campagne presidentielle truffee d intox 5115607 4355770', 'la singularite francaise en matiere d immigration est aussi le fruit de l imperfection juridique europeenne didier leschi', '1214335 reseaux sociaux nouveaux visages comment le vlaams belang parti anti immigration a seduit la jeunesse flamande']]
Representative words: [['immigration', 'mecanisme', 'intox', '5361784', '5429523', '5435717', '5115607', '5243206', 

# Systematic quantitative topic analysis using the similarity function – top 20 topics per word

In [None]:
# Search terms used to look up topics for the immigration debate.
immigration = [
    'immigration',
    'immigre',
    'migrant',
    'immigrant',
    'migration',
    'refugie',
    'asile',
    'accueil',
    'sauvetage',
]

# Search terms used to look up topics for the climate debate.
climate = [
    # climate and environment
    "climat",
    "changement climatique",
    "climatique",
    "environment",
    "environnement",
    # sustainability
    "durable",
    "soutenable",
    "ecologique",
    "ecolo",
    # pollution and fossil fuels
    "pollution",
    "charbon",
    "petrole",
    "gaz",
]

In [None]:
# For every search term of each debate, ask the model for its 20 most similar
# topics and record one (topic_number, debate, similar_word) row per match.
# Rows are collected in a plain list and turned into a DataFrame once, instead
# of concatenating a one-row frame onto a growing frame on every iteration.
debate_keywords = {'immigration': immigration, 'climate': climate}

topic_records = []
for debate, keywords in debate_keywords.items():
    for word in keywords:
        similar_topics, similarity = topic_model.find_topics(word, top_n=20)
        topic_records.extend(
            {'topic_number': topic, 'debate': debate, 'similar_word': word}
            for topic in similar_topics
        )

# Row order matches the previous loops (immigration terms first, then climate).
# The index now runs 0..n-1 rather than repeating 0 for every row.
df_topics_characteristics = pd.DataFrame(
    topic_records, columns=['topic_number', 'debate', 'similar_word']
)


In [None]:
# Drop repeated rows, renumber the index from zero, and write the keyword/topic
# table to CSV without the index column.
df_topics_characteristics = (
    df_topics_characteristics
    .drop_duplicates()
    .reset_index(drop=True)
)
df_topics_characteristics.to_csv('FR_IMM_CLIM_topics_headlines.csv', index=False)

In [None]:
# Rebuild the document -> topic table from the fitted model and export the
# complete assignment to its own CSV file, then preview the first rows.
df_topics = pd.DataFrame(
    {
        "normalized_url_phrases": documents,
        "topic": topic_model.topics_,
    }
)
df_topics.to_csv("FR_ALL_topics_headlines.csv", index=False)
df_topics.head()

Unnamed: 0,normalized_url_phrases,topic
0,rennes ils tendent un guet apens aux pompiers ...,-1
1,on devrait faire payer aux francais les appels...,-1
2,sciences po supprime son concours d entree 201...,144
3,ils lancent une petition le retour des veritab...,264
4,un label pour les commercants secouristes de l...,-1


In [None]:
# Refresh the per-topic summary table, export it to CSV, and preview the first rows.
# NOTE(review): the `Representation` and `Representative_Docs` columns appear to hold
# lists (see the preview); CSV presumably stores them as their string form, so they
# would need parsing when the file is read back — verify.
topic_info_df = topic_model.get_topic_info()
topic_info_df.to_csv('FR_headlines_topics_INFO.csv', index = False)
topic_info_df.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,62184,-1_sarkozy_video_macron_emmanuel,"[sarkozy, video, macron, emmanuel, femmes, fem...",[et apres quoi 41 des francais affirment qu i...
1,0,1640,0_4497916_bonheur_3232_boulot,"[4497916, bonheur, 3232, boulot, entreprises, ...",[vous etes grosse et moche soyez seulement moc...
2,1,1254,1_migrants_migrant_mediterranee_accueil,"[migrants, migrant, mediterranee, accueil, lib...",[libye sahara plus de migrants meurent terre q...
3,2,1110,2_politique_policiers_politiques_billet,"[politique, policiers, politiques, billet, pol...",[le billet politique du jeudi 15 novembre 2018...
4,3,1016,3_20190131_20190626_daesh_20190623,"[20190131, 20190626, daesh, 20190623, 20190315...",[2440711 20190131 drome salarie amazon licenci...
