In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import LlamaCPP, KeyBERTInspired
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
from bertopic import BERTopic
from llama_cpp import Llama
import pandas as pd

In [None]:
# Build the stop-word list handed to the vectorizer: NLTK's English, French
# and Arabic lists plus a hand-curated set of Darija words (Arabic script and
# Latin/"Arabizi" spellings).
nltk_stop_words = {
    word
    for language in ('english', 'french', 'arabic')
    for word in stopwords.words(language)
}
# custom darija stop words
darija_stop_words = ['هادشي','علاش','Machi', 'Gha', 'Dyal','chi','li','mn','3la','ana','wach','wla','bghit','bach','ila','rah','m3a','nta','ghir','dial','الله','راه','شي','ديال','هاد','او','ماشي','باش','انا','اللي','حاجة','ليا','عندي','ghadi','b7al','3liha','wakha','ba9i','3lih','3lik','3lach','liha','mazal','ليك','ال','الل','بلا','machi','dyal','kan','ra','howa','hadchi','lik','gha','walakin','daba']
# CountVectorizer lowercases documents by default, so a capitalised stop word
# can never match a token. Lowercase everything, drop duplicates, and sort so
# the list is identical from run to run (set iteration order is not stable).
stop_words = sorted({word.lower() for word in nltk_stop_words.union(darija_stop_words)})

In [None]:
# Load the cleaned comments and collect their text bodies into a plain list.
with open('../data/cleaned/comments.csv', 'r', encoding='utf-8') as file:
    comments_df = pd.read_csv(file, low_memory=False)
# Replace missing values with empty strings (rebinding instead of mutating
# in place keeps the cell safe to re-run).
comments_df = comments_df.fillna('')
# Pull the column out directly. Using apply() purely for the side effect of
# list.append built a throwaway Series of None that the cell then displayed.
comments = comments_df['body'].tolist()

In [None]:
# Load the cleaned submissions and build one text per post (title + selftext).
with open('../data/cleaned/submissions.csv', 'r', encoding='utf-8') as file:
    submissions = pd.read_csv(file)
# Replace missing values with empty strings so the string concatenation below
# never meets NaN.
submissions = submissions.fillna('')
# Drop rows where AutoModerator is the author. The explicit copy() makes the
# filtered frame independent, so adding a column to it below does not trigger
# pandas' SettingWithCopyWarning.
submissions = submissions[submissions['author'] != 'AutoModerator'].copy()
# concat title and selftext
submissions['body'] = submissions['title'] + ' ' + submissions['selftext']
# Extract the texts directly instead of appending inside apply(), which
# produced (and displayed) a Series of None as a side effect.
posts = submissions['body'].tolist()

In [None]:
# Bag-of-words model used for the topic representations: unigrams and bigrams
# that occur in at least 10 documents, with the custom stop words removed.
vectorizer_model = CountVectorizer(
    stop_words=stop_words,
    min_df=10,
    ngram_range=(1, 2),
)

# GPU-accelerated (cuML) replacements for BERTopic's default UMAP and HDBSCAN.
# A fixed random_state makes the dimensionality reduction, and therefore the
# resulting topics, reproducible across runs.
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    random_state=42,
)
# BERTopic only applies its `min_topic_size` argument when it builds its own
# HDBSCAN model; with a custom model the minimum cluster size must be set
# here. 50 matches the min_topic_size passed to BERTopic in this notebook.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=10,
    gen_min_span_tree=True,
    prediction_data=True,
)

In [None]:
# Sentence-embedding model; the same instance is later handed to BERTopic.
# NOTE(review): the stop-word list covers English, French, Arabic and Darija,
# while "all-MiniLM-L6-v2" is, as far as I know, trained mainly on English —
# confirm that a multilingual embedding model is not what is wanted here.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Posts first, then comments: the embeddings must stay aligned with the
# document list that is passed to fit_transform.
corpus = posts + comments
embeddings = embedding_model.encode(corpus, show_progress_bar=True)

In [None]:
# Quantized Zephyr-7B loaded through llama.cpp; used to generate topic labels.
llm = Llama(
    model_path="../models/zephyr-7b-alpha.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=4096,
    verbose=False,
)

# Two representations per topic: KeyBERT-style keywords and an LLM label.
# Stop sequences are a generation-time argument, so they are forwarded via
# pipeline_kwargs; passed to the Llama constructor they had no effect.
representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "LLM": LlamaCPP(llm, pipeline_kwargs={"stop": ["Q:", "\n"]}),
}

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # NOTE(review): per the BERTopic documentation, min_topic_size is not used
    # when a custom hdbscan_model is supplied (set min_cluster_size on the
    # HDBSCAN model instead), and language is not used when an embedding_model
    # is supplied. Both are kept so the call stays backward-compatible.
    min_topic_size=50,
    language="multilingual",
    verbose=True,
)

In [None]:
# Fit on the same `posts + comments` ordering the embeddings were computed
# from. Bind the results instead of letting the raw (topics, probabilities)
# tuple become the cell output.
topics, probs = topic_model.fit_transform(posts + comments, embeddings)
# Display a compact per-topic summary rather than per-document assignments.
topic_model.get_topic_info()

In [None]:
# Optional: uncomment to persist the fitted topic model to disk.
# topic_model.save("../models/bertopic-llama_model2")