In [3]:
from bertopic import BERTopic
# from datasets import load_dataset
# from tools import detect_language, get_language_ratio
# from sentence_transformers import SentenceTransformer
# from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import spacy
from nltk.corpus import stopwords
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP


In [2]:
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_trf')

In [4]:
stop_words = list(set(stopwords.words('english')))
stop_words.extend(set(stopwords.words('french')))
stop_words.extend(set(stopwords.words('arabic')))

In [9]:
with open('../data/cleaned/comments.csv', 'r', encoding='utf-8') as file:
    comments_df = pd.read_csv(file, low_memory=False)
comments_df.fillna('', inplace=True)
comments = []
comments_df.body.apply(lambda x: comments.append(x))

0          None
1          None
2          None
3          None
4          None
           ... 
1203987    None
1203988    None
1203989    None
1203990    None
1203991    None
Name: body, Length: 1203992, dtype: object

In [5]:
with open('../data/cleaned/submissions.csv', 'r', encoding='utf-8') as file:
    submissions = pd.read_csv(file)    
submissions.fillna('', inplace=True)
# concat title and selftext
submissions['body'] = submissions['title'] + ' ' + submissions['selftext']
posts = []
submissions.body.apply(lambda x: posts.append(x))

0        None
1        None
2        None
3        None
4        None
         ... 
88897    None
88898    None
88899    None
88900    None
88901    None
Name: body, Length: 88902, dtype: object

In [None]:
sentences = [sent_tokenize(post) for post in posts]
sentences = [sentence for doc in sentences for sentence in doc]

In [12]:
vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1, 2))
topic_model = BERTopic(embedding_model=nlp.pipe, verbose=True, language="multilingual", vectorizer_model=vectorizer_model)
topics, _ = topic_model.fit_transform(posts)
topic_model.get_topic_info().head(25)

2024-06-05 20:11:52,672 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2779/2779 [01:31<00:00, 30.39it/s]
2024-06-05 20:13:27,530 - BERTopic - Embedding - Completed ✓
2024-06-05 20:13:27,534 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 20:14:46,572 - BERTopic - Dimensionality - Completed ✓
2024-06-05 20:14:46,576 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 20:14:53,041 - BERTopic - Cluster - Completed ✓
2024-06-05 20:14:53,075 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 20:15:02,071 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,36111,-1_like_people_would_know,"[like, people, would, know, morocco, get, time...",[Went to Morocco this weekend! Best traveling ...
1,0,1725,0_maroc_si_marocains_plus,"[maroc, si, marocains, plus, marocain, françai...","[I have a problem &amp; i need help, please. (..."
2,1,1363,1_song_music_songs_rap,"[song, music, songs, rap, com watch, youtube c...","[Music , Spotify Moroccan rap top songs , what..."
3,2,882,2_inwi_internet_sim_telecom,"[inwi, internet, sim, telecom, orange, maroc t...","[ADSL Maroc Telecom 12 Mbps inconsistency , Pr..."
4,3,697,3_chi_li_mn_3la,"[chi, li, mn, 3la, ana, wach, wla, bghit, bach...",[Achnahiya le9ta li 7lat 3inikom 3la l 7ayat ?...
5,4,682,4_tangier_tanger_friends tangier_tangier tangier,"[tangier, tanger, friends tangier, tangier tan...","[Tangier ✨💙 , Tangier Acha9ar , My Tangier 💙✨ ]"
6,5,606,5_earthquake_aid_donate_donations,"[earthquake, aid, donate, donations, volunteer...",[Who did felt the earthquake? From Casablanca ...
7,6,527,6_cat_cats_dogs_dog,"[cat, cats, dogs, dog, stray, animal, vet, kit...",[Animal rescue needs help- urgent! Hi everyone...
8,7,526,7_israel_jews_palestine_israeli,"[israel, jews, palestine, israeli, palestinian...",[Correcting Misleading Nomenclature: Jews Neve...
9,8,458,8_cup_world cup_morocco vs_vs,"[cup, world cup, morocco vs, vs, world, fifa, ...",[Belgium vs Morocco - World Cup 2022 : Lineups...


In [15]:
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1, 2))
topic_model = BERTopic(embedding_model=nlp.pipe, verbose=True, language="multilingual", vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model)
topics, _ = topic_model.fit_transform(comments)
topic_model.get_topic_info().head(25)

2024-06-05 20:17:53,747 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 37625/37625 [18:07<00:00, 34.61it/s] 
2024-06-05 20:36:40,576 - BERTopic - Embedding - Completed ✓
2024-06-05 20:36:40,581 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
  return _core.array(a, dtype, False, order)
2024-06-05 20:56:52,981 - BERTopic - Dimensionality - Completed ✓
2024-06-05 20:56:53,016 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 21:04:01,829 - BERTopic - Cluster - Completed ✓
2024-06-05 21:04:02,137 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-05 21:06:20,593 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,908756,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n..."
1,0,16504,0_الله_راه_شي_ديال,"[الله, راه, شي, ديال, هاد, او, ماشي, باش, انا,...","[وياااااا سبحان الله., واش مشفتيش الاهانات فال..."
2,1,7379,1_li_3la_chi_rah,"[li, 3la, chi, rah, ana, mn, m3a, nta, ghir, d...","[S3ib l7al, nass ma3ndhomch dik l conscience r..."
3,2,5106,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]"
4,3,4632,3_israel_palestinians_hamas_palestine,"[israel, palestinians, hamas, palestine, jews,...",[1. The international resolution is two States...
5,4,4122,4_islam_religion_quran_atheist,"[islam, religion, quran, atheist, muslims, ath...","[We're all muslims, I am an atheist, and its n..."
6,5,2753,5_darija_speak darija_dialect_arabic,"[darija, speak darija, dialect, arabic, darija...","[Darija = arabic dialect., Is it darija?, Is t..."
7,6,2693,6_cih_paypal_bank_card,"[cih, paypal, bank, card, account, bank accoun...",[Tip : create a payoneer account and link it t...
8,7,2442,7_moroccan women_moroccan men_marry_moroccan w...,"[moroccan women, moroccan men, marry, moroccan...",[Moroccan women can not legally marry in Moroc...
9,8,2002,8____,"[, , , , , , , , , ]","[🤧, 👎, 😂]"


In [20]:
topic_model.save("my_model")

[('الله', 0.007063816811956684),
 ('راه', 0.006597737825124289),
 ('شي', 0.006382510133193278),
 ('ديال', 0.0062355888208407055),
 ('هاد', 0.005881781074737054),
 ('او', 0.005756054661180965),
 ('ماشي', 0.005573457320261475),
 ('باش', 0.005570540222369815),
 ('انا', 0.005293287478436867),
 ('اللي', 0.005026080218857113)]

In [30]:
doc_df = topic_model.get_document_info(comments)[topic_model.get_document_info(comments).Topic != -1]

In [40]:
topic_model.get_document_info(comments)[topic_model.get_document_info(comments).Topic == -1]

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,I don't know if Tilapia is something I'd be pr...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
1,Modern retailers make a ton of positive things...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
2,Impressive. What's tmbo?,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
3,Do you like hiking? [Ait Bougmez](http://lexic...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
4,"Concerning Ait Bougmez, I'm no expert but I fo...",-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
...,...,...,...,...,...,...,...,...
1203987,The closest word I can think of is yujazif mea...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
1203988,I don’t think it’s idiotic to want to know whe...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
1203989,I'm in too ! how can i join ?,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False
1203990,We both know that’s not the aim of this discou...,-1,-1_morocco_like_people_country,"[morocco, like, people, country, get, know, wo...","[My dad emigrated from Morocco to FL, USA.\n\n...",morocco - like - people - country - get - know...,0.0,False


In [33]:
doc_df = doc_df[doc_df.Topic != 1]

In [34]:
doc_df.sort_values('Topic', ascending=True).head(50)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
874303,It's French that's spoken fluently in business...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
874330,Cool between those two what do most people spe...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
874338,French and Egyptian,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
874369,Because Moroccans speak mainly Moroccan Arabic...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
37,"""Do you mind if I join you?"" is clearly not ta...",2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,0.923028,False
286990,You are wrong. Phd holders make their research...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
287033,&gt;Malheureusement je vois beaucoup de gens a...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
287046,Then I'd rather speak approximate English than...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
973014,At least France doesn't actively engage in eth...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False
549372,next time someone addresses to you in french j...,2,2_language_english_french_languages,"[language, english, french, languages, arabic,...","[French 🤮, Not French at all !, What language?]",language - english - french - languages - arab...,1.0,False


In [41]:
topic_model.save("comments_model", save_embedding_model=True, save_ctfidf=True)



In [45]:
import cohere
from bertopic.representation import Cohere

In [52]:
co = cohere.Client("CfSbs1aZbZBDgv3sEfX1L2x6CeS5kor57yL1nbDt")
representation_model = Cohere(co, model="command-r-plus")

In [53]:
topic_model = BERTopic(representation_model=representation_model, verbose=True, language="multilingual", vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model)

In [9]:
vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1, 2))
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

In [10]:
from bertopic.representation import LlamaCPP

representation_model = LlamaCPP("../models/zephyr-7b-alpha.Q4_K_M.gguf")

topic_model = BERTopic(representation_model=representation_model, verbose=True, language="multilingual", vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../models/zephyr-7b-alpha.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-alpha
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attentio

In [11]:
topic_model.fit_transform(posts)

2024-06-05 22:46:31,641 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2779/2779 [01:42<00:00, 26.99it/s]
2024-06-05 22:48:20,565 - BERTopic - Embedding - Completed ✓
2024-06-05 22:48:20,567 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-05 22:48:28,869 - BERTopic - Dimensionality - Completed ✓
2024-06-05 22:48:28,872 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-05 22:48:50,820 - BERTopic - Cluster - Completed ✓
2024-06-05 22:48:50,845 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/466 [00:00<?, ?it/s]
llama_print_timings:        load time =    9438.14 ms
llama_print_timings:      sample time =       6.07 ms /    16 runs   (    0.38 ms per token,  2635.48 tokens per second)
llama_print_timings: prompt eval time =    9437.93 ms /   128 tokens (   73.73 ms per token,    13.56 tokens per second)
llama_print_timings:       