In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import LlamaCPP
from nltk.corpus import stopwords
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
from bertopic import BERTopic
import pandas as pd

In [3]:
# create custom stop words list
stop_words = list(set(stopwords.words('english')))
stop_words.extend(set(stopwords.words('french')))
stop_words.extend(set(stopwords.words('arabic')))

# add custom darija stop words
stop_words.extend(['chi','li','mn','3la','ana','wach','wla','bghit','bach','ila','rah','m3a','nta','ghir','dial','الله','راه','شي','ديال','هاد','او','ماشي','باش','انا','اللي','حاجة','ليا','عندي'])

In [3]:
# load comments
with open('../data/cleaned/comments.csv', 'r', encoding='utf-8') as file:
    comments_df = pd.read_csv(file, low_memory=False)
comments_df.fillna('', inplace=True)
comments = []
comments_df.body.apply(lambda x: comments.append(x))

0          None
1          None
2          None
3          None
4          None
           ... 
1203987    None
1203988    None
1203989    None
1203990    None
1203991    None
Name: body, Length: 1203992, dtype: object

In [23]:
# load submissions
with open('../data/cleaned/submissions.csv', 'r', encoding='utf-8') as file:
    submissions = pd.read_csv(file)    
submissions.fillna('', inplace=True)
# drop rows where AutoModerator is the author
submissions = submissions[submissions['author'] != 'AutoModerator']
# concat title and selftext
submissions['body'] = submissions['title'] + ' ' + submissions['selftext']
posts = []
submissions.body.apply(lambda x: posts.append(x))

0        None
1        None
2        None
3        None
4        None
         ... 
88897    None
88898    None
88899    None
88900    None
88901    None
Name: body, Length: 88567, dtype: object

In [24]:
submissions

Unnamed: 0,id,author,author_flair_text,title,selftext,link_flair_text,created_utc,permalink,score,num_comments,over_18,hide_score,body
0,83vri,taoufix,,Facebook is lost case [pic],,,2009-03-11 18:24:44,/r/Morocco/comments/83vri/facebook_is_lost_cas...,3,3,False,False,Facebook is lost case [pic]
1,c6u7c,,,Rabat Agdal At Night,,,2010-05-21 21:43:14,/r/Morocco/comments/c6u7c/rabat_agdal_at_night/,3,2,False,False,Rabat Agdal At Night
2,c7162,taoufix,,Beach near Sidi Ifni at sunset [pic],,,2010-05-22 15:53:13,/r/Morocco/comments/c7162/beach_near_sidi_ifni...,4,0,False,False,Beach near Sidi Ifni at sunset [pic]
3,c71ir,,,"Medina de Rabat on a hazy, lazy friday",,,2010-05-22 16:43:48,/r/Morocco/comments/c71ir/medina_de_rabat_on_a...,3,1,False,False,"Medina de Rabat on a hazy, lazy friday"
4,c727d,taoufix,,Tiznit traditional market street during lunch ...,,,2010-05-22 18:11:18,/r/Morocco/comments/c727d/tiznit_traditional_m...,3,1,False,False,Tiznit traditional market street during lunch ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88897,1d3ofx4,penelopelouiseb,:snoo_smile: Visitor,Beautiful Asilah!,Some of my photos from Asilah! It was on my Mo...,:art: Art &amp; Photography,2024-05-29 22:22:27,/r/Morocco/comments/1d3ofx4/beautiful_asilah/,1,1,False,,Beautiful Asilah! Some of my photos from Asila...
88898,1d3ohin,Time-Ad-8776,:snoo_smile: Visitor,aliexpress fake airpods,can anyone recommend me chi fakes free shippin...,:technology: Science &amp; Tech,2024-05-29 22:24:24,/r/Morocco/comments/1d3ohin/aliexpress_fake_ai...,1,1,False,,aliexpress fake airpods can anyone recommend m...
88899,1d3orn3,PotentialOrder5837,:snoo_smile: Visitor,Sending money to yourself for vacation,Hi \nI plan to send myself around 50k dirhams ...,:travel: Travel,2024-05-29 22:37:01,/r/Morocco/comments/1d3orn3/sending_money_to_y...,1,1,False,,Sending money to yourself for vacation Hi \nI...
88900,1d3pnbc,Leather_Alfalfa6519,:snoo_smile: Visitor,is it too late to leave? do I actually leave o...,I’m a 26 (turning 26 next month) y.o female wi...,:Discussion: Discussion,2024-05-29 23:17:22,/r/Morocco/comments/1d3pnbc/is_it_too_late_to_...,1,1,False,,is it too late to leave? do I actually leave o...


In [25]:
# define vectorizer_model
vectorizer_model = CountVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1, 2))

# define umap_model and hdbscan_model for GPU acceleration
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

In [26]:
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label only in clean text no special characters or multiple labels.
[/INST]
"""

In [27]:

representation_model = LlamaCPP("../models/zephyr-7b-alpha.Q4_K_M.gguf", prompt=main_prompt)

topic_model = BERTopic(representation_model=representation_model, verbose=True, language="multilingual", vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../models/zephyr-7b-alpha.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-alpha
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attentio

In [29]:
topic_model.fit_transform(posts[:5000])

2024-06-06 20:31:00,638 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2024-06-06 20:31:16,905 - BERTopic - Embedding - Completed ✓
2024-06-06 20:31:16,906 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-06 20:31:17,388 - BERTopic - Dimensionality - Completed ✓
2024-06-06 20:31:17,396 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-06 20:31:19,236 - BERTopic - Cluster - Completed ✓
2024-06-06 20:31:19,285 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/62 [00:00<?, ?it/s]


ValueError: Requested tokens (2275) exceed context window of 512

In [None]:
topic_model.save('../models/llama_modelv2')

In [None]:
topic_model.get_document_info(posts)[topic_model.get_document_info(posts).Topic != -1].sort_values('Topic', ascending=False).head(10)[['Document','Name']]

In [None]:
topic_model.save("../models/llama_model", save_embedding_model=True, save_ctfidf=True)

In [None]:
topics_list = [txt for txt in loaded_model.get_topic_info().head(25).Name]

In [None]:
topics_list

In [None]:
# clean text in topics_list

# remove digits at the beginning of the text

import re

# remove trailing underscores
topics_list = [re.sub(r'_+$', '', txt) for txt in topics_list]
# remove leading 


In [None]:
topics_list